diff options
author | Frederic Weisbecker <fweisbec@gmail.com> | 2010-08-09 01:49:58 +0200 |
---|---|---|
committer | Frederic Weisbecker <fweisbec@gmail.com> | 2010-08-09 02:14:15 +0200 |
commit | d9a145fb6e5f37b9903dea8371ab5c3e34e8e2d1 (patch) | |
tree | e2b4bb46fa00f0ad20447e40dba6fb21a4ae0815 /kernel | |
parent | c9243f5bdd6637b2bb7dc254b54d9edf957ef17e (diff) | |
parent | 45d7f32c7a43cbb9592886d38190e379e2eb2226 (diff) |
Merge commit 'linus/master' into bkl/core
Merge reason: The staging tree has introduced the easycap
driver lately. We need the latest updates to pushdown the
bkl in its ioctl helper.
Diffstat (limited to 'kernel')
86 files changed, 7438 insertions, 6262 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 057472fbc272..c53e491e25a8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -76,8 +76,8 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ -obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o +obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o @@ -99,8 +99,6 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o -obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o diff --git a/kernel/async.c b/kernel/async.c index 15319d6c18fe..cd9dbb913c77 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel. */ #include <linux/async.h> -#include <linux/bug.h> #include <linux/module.h> #include <linux/wait.h> #include <linux/sched.h> -#include <linux/init.h> -#include <linux/kthread.h> -#include <linux/delay.h> #include <linux/slab.h> +#include <linux/workqueue.h> #include <asm/atomic.h> static async_cookie_t next_cookie = 1; -#define MAX_THREADS 256 #define MAX_WORK 32768 static LIST_HEAD(async_pending); static LIST_HEAD(async_running); static DEFINE_SPINLOCK(async_lock); -static int async_enabled = 0; - struct async_entry { - struct list_head list; - async_cookie_t cookie; - async_func_ptr *func; - void *data; - struct list_head *running; + struct list_head list; + struct work_struct work; + async_cookie_t cookie; + async_func_ptr *func; + void *data; + struct list_head *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); -static DECLARE_WAIT_QUEUE_HEAD(async_new); static atomic_t entry_count; -static atomic_t thread_count; extern int initcall_debug; @@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running) spin_unlock_irqrestore(&async_lock, flags); return ret; } + /* * pick the first pending entry and run it */ -static void run_one_entry(void) +static void async_run_entry_fn(struct work_struct *work) { + struct async_entry *entry = + container_of(work, struct async_entry, work); unsigned long flags; - struct async_entry *entry; ktime_t calltime, delta, rettime; - /* 1) pick one task from the pending queue */ - + /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - if (list_empty(&async_pending)) - goto out; - entry = list_first_entry(&async_pending, struct async_entry, list); - - /* 2) move it to the running queue */ list_move_tail(&entry->list, entry->running); spin_unlock_irqrestore(&async_lock, flags); - /* 3) run it (and print duration)*/ + /* 2) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); @@ -153,31 +142,25 @@ static void run_one_entry(void) (long long)ktime_to_ns(delta) >> 10); } - /* 4) remove it from the running queue */ + /* 3) remove self from the running queue */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); - /* 5) free the entry */ + /* 4) free the entry */ kfree(entry); atomic_dec(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* 6) wake up any waiters. */ + /* 5) wake up any waiters */ wake_up(&async_done); - return; - -out: - spin_unlock_irqrestore(&async_lock, flags); } - static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) { struct async_entry *entry; unsigned long flags; async_cookie_t newcookie; - /* allow irq-off callers */ entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); @@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l * If we're out of memory or if there's too much work * pending already, we execute synchronously. */ - if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { + if (!entry || atomic_read(&entry_count) > MAX_WORK) { kfree(entry); spin_lock_irqsave(&async_lock, flags); newcookie = next_cookie++; @@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l ptr(data, newcookie); return newcookie; } + INIT_WORK(&entry->work, async_run_entry_fn); entry->func = ptr; entry->data = data; entry->running = running; @@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l list_add_tail(&entry->list, &async_pending); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - wake_up(&async_new); + + /* schedule for execution */ + queue_work(system_unbound_wq, &entry->work); + return newcookie; } @@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie) async_synchronize_cookie_domain(cookie, &async_running); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); - - -static int async_thread(void *unused) -{ - DECLARE_WAITQUEUE(wq, current); - add_wait_queue(&async_new, &wq); - - while (!kthread_should_stop()) { - int ret = HZ; - set_current_state(TASK_INTERRUPTIBLE); - /* - * check the list head without lock.. false positives - * are dealt with inside run_one_entry() while holding - * the lock. - */ - rmb(); - if (!list_empty(&async_pending)) - run_one_entry(); - else - ret = schedule_timeout(HZ); - - if (ret == 0) { - /* - * we timed out, this means we as thread are redundant. - * we sign off and die, but we to avoid any races there - * is a last-straw check to see if work snuck in. - */ - atomic_dec(&thread_count); - wmb(); /* manager must see our departure first */ - if (list_empty(&async_pending)) - break; - /* - * woops work came in between us timing out and us - * signing off; we need to stay alive and keep working. - */ - atomic_inc(&thread_count); - } - } - remove_wait_queue(&async_new, &wq); - - return 0; -} - -static int async_manager_thread(void *unused) -{ - DECLARE_WAITQUEUE(wq, current); - add_wait_queue(&async_new, &wq); - - while (!kthread_should_stop()) { - int tc, ec; - - set_current_state(TASK_INTERRUPTIBLE); - - tc = atomic_read(&thread_count); - rmb(); - ec = atomic_read(&entry_count); - - while (tc < ec && tc < MAX_THREADS) { - if (IS_ERR(kthread_run(async_thread, NULL, "async/%i", - tc))) { - msleep(100); - continue; - } - atomic_inc(&thread_count); - tc++; - } - - schedule(); - } - remove_wait_queue(&async_new, &wq); - - return 0; -} - -static int __init async_init(void) -{ - async_enabled = - !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr")); - - WARN_ON(!async_enabled); - return 0; -} - -core_initcall(async_init); diff --git a/kernel/audit.c b/kernel/audit.c index c71bd26631a2..8296aa516c5a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -407,7 +407,7 @@ static void kauditd_send_skb(struct sk_buff *skb) audit_hold_skb(skb); } else /* drop the extra reference if sent ok */ - kfree_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3ac6f5b0a64b..d83cab06da87 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct kobject *cgroup_kobj; + static inline struct cgroup *__d_cgrp(struct dentry *dentry) { return dentry->d_fsdata; @@ -1788,6 +1790,29 @@ out: return retval; } +/** + * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup + * @tsk: the task to be attached + */ +int cgroup_attach_task_current_cg(struct task_struct *tsk) +{ + struct cgroupfs_root *root; + struct cgroup *cur_cg; + int retval = 0; + + cgroup_lock(); + for_each_active_root(root) { + cur_cg = task_cgroup_from_root(current, root); + retval = cgroup_attach_task(cur_cg, tsk); + if (retval) + break; + } + cgroup_unlock(); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); + /* * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex * held. May take task_lock of task @@ -3871,9 +3896,18 @@ int __init cgroup_init(void) hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); BUG_ON(!init_root_id(&rootnode)); + + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); + if (!cgroup_kobj) { + err = -ENOMEM; + goto out; + } + err = register_filesystem(&cgroup_fs_type); - if (err < 0) + if (err < 0) { + kobject_put(cgroup_kobj); goto out; + } proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); diff --git a/kernel/cpu.c b/kernel/cpu.c index 97d1b426a4ac..f6e726f18491 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - set_cpu_active(cpu, false); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { - set_cpu_active(cpu, true); - nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", @@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { - set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); @@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); - set_cpu_active(cpu, true); - /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02b9611eadde..b23c0979bbe7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -105,7 +105,7 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* used for walking a cpuset heirarchy */ + /* used for walking a cpuset hierarchy */ struct list_head stack_list; }; @@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root) * but making no active use of cpusets. * * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_online_map on each CPU hotplug (cpuhp) event. + * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). */ -static int cpuset_track_online_cpus(struct notifier_block *unused_nb, - unsigned long phase, void *unused_cpu) +void cpuset_update_active_cpus(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; - switch (phase) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - break; - - default: - return NOTIFY_DONE; - } - cgroup_lock(); mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); @@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); - - return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_track_online_cpus, 0); hotplug_memory_notifier(cpuset_track_online_nodes, 10); cpuset_wq = create_singlethread_workqueue("cpuset"); diff --git a/kernel/cred.c b/kernel/cred.c index a2d5504fbcc2..60bc8b1e32e6 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk) } } +/** + * get_task_cred - Get another task's objective credentials + * @task: The task to query + * + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. + * + * The caller must also make sure task doesn't get deleted, either by holding a + * ref on task or by holding tasklist_lock to prevent it from being unlinked. + */ +const struct cred *get_task_cred(struct task_struct *task) +{ + const struct cred *cred; + + rcu_read_lock(); + + do { + cred = __task_cred((task)); + BUG_ON(!cred); + } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); + + rcu_read_unlock(); + return cred; +} + /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 5cb7cd1de10c..3c2d4972d235 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -6,7 +6,7 @@ * Copyright (C) 2000-2001 VERITAS Software Corporation. * Copyright (C) 2002-2004 Timesys Corporation * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> - * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. * Copyright (C) 2005-2009 Wind River Systems, Inc. @@ -605,13 +605,15 @@ cpu_master_loop: if (dbg_kdb_mode) { kgdb_connected = 1; error = kdb_stub(ks); + if (error == -1) + continue; + kgdb_connected = 0; } else { error = gdb_serial_stub(ks); } if (error == DBG_PASS_EVENT) { dbg_kdb_mode = !dbg_kdb_mode; - kgdb_connected = 0; } else if (error == DBG_SWITCH_CPU_EVENT) { dbg_cpu_switch(cpu, dbg_switch_cpu); goto cpu_loop; diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 4b17b3269525..481a7bd2dfe7 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -6,7 +6,7 @@ * Copyright (C) 2000-2001 VERITAS Software Corporation. * Copyright (C) 2002-2004 Timesys Corporation * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> - * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. * Copyright (C) 2005-2009 Wind River Systems, Inc. @@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES + * GDB remote protocol parser: */ -static int hex(char ch) -{ - if ((ch >= 'a') && (ch <= 'f')) - return ch - 'a' + 10; - if ((ch >= '0') && (ch <= '9')) - return ch - '0'; - if ((ch >= 'A') && (ch <= 'F')) - return ch - 'A' + 10; - return -1; -} - #ifdef CONFIG_KGDB_KDB static int gdbstub_read_wait(void) { @@ -123,8 +112,8 @@ static void get_packet(char *buffer) buffer[count] = 0; if (ch == '#') { - xmitcsum = hex(gdbstub_read_wait()) << 4; - xmitcsum += hex(gdbstub_read_wait()); + xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; + xmitcsum += hex_to_bin(gdbstub_read_wait()); if (checksum != xmitcsum) /* failed checksum */ @@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len) * buf. Return a pointer to the last char put in buf (null). May * return an error. */ -int kgdb_mem2hex(char *mem, char *buf, int count) +char *kgdb_mem2hex(char *mem, char *buf, int count) { char *tmp; int err; @@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count) tmp = buf + count; err = probe_kernel_read(tmp, mem, count); - if (!err) { - while (count > 0) { - buf = pack_hex_byte(buf, *tmp); - tmp++; - count--; - } - - *buf = 0; + if (err) + return NULL; + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; } + *buf = 0; - return err; + return buf; } /* @@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count) tmp_hex = tmp_raw - 1; while (tmp_hex >= buf) { tmp_raw--; - *tmp_raw = hex(*tmp_hex--); - *tmp_raw |= hex(*tmp_hex--) << 4; + *tmp_raw = hex_to_bin(*tmp_hex--); + *tmp_raw |= hex_to_bin(*tmp_hex--) << 4; } return probe_kernel_write(mem, tmp_raw, count); @@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) (*ptr)++; } while (**ptr) { - hex_val = hex(**ptr); + hex_val = hex_to_bin(**ptr); if (hex_val < 0) break; @@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count) return probe_kernel_write(mem, c, size); } +#if DBG_MAX_REG_NUM > 0 +void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_get_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} + +void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_set_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} +#endif /* DBG_MAX_REG_NUM > 0 */ + /* Write memory due to an 'M' or 'X' packet. */ static int write_mem_msg(int binary) { @@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error) * remapped to negative TIDs. */ -#define BUF_THREAD_ID_SIZE 16 +#define BUF_THREAD_ID_SIZE 8 static char *pack_threadid(char *pkt, unsigned char *id) { - char *limit; + unsigned char *limit; + int lzero = 1; + + limit = id + (BUF_THREAD_ID_SIZE / 2); + while (id < limit) { + if (!lzero || *id != 0) { + pkt = pack_hex_byte(pkt, *id); + lzero = 0; + } + id++; + } - limit = pkt + BUF_THREAD_ID_SIZE; - while (pkt < limit) - pkt = pack_hex_byte(pkt, *id++); + if (lzero) + pkt = pack_hex_byte(pkt, 0); return pkt; } static void int_to_threadref(unsigned char *id, int value) { - unsigned char *scan; - int i = 4; - - scan = (unsigned char *)id; - while (i--) - *scan++ = 0; - put_unaligned_be32(value, scan); + put_unaligned_be32(value, id); } static struct task_struct *getthread(struct pt_regs *regs, int tid) @@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks) pack_hex_byte(&remcom_out_buffer[1], ks->signo); } -/* Handle the 'g' get registers request */ -static void gdb_cmd_getregs(struct kgdb_state *ks) +static void gdb_get_regs_helper(struct kgdb_state *ks) { struct task_struct *thread; void *local_debuggerinfo; @@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) */ sleeping_thread_to_gdb_regs(gdb_regs, thread); } +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + gdb_get_regs_helper(ks); kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); } @@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks) char *ptr = &remcom_in_buffer[1]; unsigned long length; unsigned long addr; - int err; + char *err; if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && kgdb_hex2long(&ptr, &length) > 0) { err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); - if (err) - error_packet(remcom_out_buffer, err); + if (!err) + error_packet(remcom_out_buffer, -EINVAL); } else { error_packet(remcom_out_buffer, -EINVAL); } @@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks) strcpy(remcom_out_buffer, "OK"); } +#if DBG_MAX_REG_NUM > 0 +static char *gdb_hex_reg_helper(int regnum, char *out) +{ + int i; + int offset = 0; + + for (i = 0; i < regnum; i++) + offset += dbg_reg_def[i].size; + return kgdb_mem2hex((char *)gdb_regs + offset, out, + dbg_reg_def[i].size); +} + +/* Handle the 'p' individual regster get */ +static void gdb_cmd_reg_get(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + + kgdb_hex2long(&ptr, ®num); + if (regnum >= DBG_MAX_REG_NUM) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + gdb_get_regs_helper(ks); + gdb_hex_reg_helper(regnum, remcom_out_buffer); +} + +/* Handle the 'P' individual regster set */ +static void gdb_cmd_reg_set(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + int i = 0; + + kgdb_hex2long(&ptr, ®num); + if (*ptr++ != '=' || + !(!kgdb_usethread || kgdb_usethread == current) || + !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + memset(gdb_regs, 0, sizeof(gdb_regs)); + while (i < sizeof(gdb_regs) * 2) + if (hex_to_bin(ptr[i]) >= 0) + i++; + else + break; + i = i / 2; + kgdb_hex2mem(ptr, (char *)gdb_regs, i); + dbg_set_reg(regnum, gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); +} +#endif /* DBG_MAX_REG_NUM > 0 */ + /* Handle the 'X' memory binary write bytes */ static void gdb_cmd_binwrite(struct kgdb_state *ks) { @@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) { struct task_struct *g; struct task_struct *p; - unsigned char thref[8]; + unsigned char thref[BUF_THREAD_ID_SIZE]; char *ptr; int i; int cpu; @@ -621,10 +697,8 @@ static void gdb_cmd_query(struct kgdb_state *ks) switch (remcom_in_buffer[1]) { case 's': case 'f': - if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { - error_packet(remcom_out_buffer, -EINVAL); + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) break; - } i = 0; remcom_out_buffer[0] = 'm'; @@ -634,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) for_each_online_cpu(cpu) { ks->thr_query = 0; int_to_threadref(thref, -cpu - 2); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; + ptr = pack_threadid(ptr, thref); *(ptr++) = ','; i++; } @@ -644,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) do_each_thread(g, p) { if (i >= ks->thr_query && !finished) { int_to_threadref(thref, p->pid); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; + ptr = pack_threadid(ptr, thref); *(ptr++) = ','; ks->thr_query++; if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) @@ -665,10 +737,9 @@ static void gdb_cmd_query(struct kgdb_state *ks) pack_threadid(remcom_out_buffer + 2, thref); break; case 'T': - if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { - error_packet(remcom_out_buffer, -EINVAL); + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) break; - } + ks->threadid = 0; ptr = remcom_in_buffer + 17; kgdb_hex2long(&ptr, &ks->threadid); @@ -861,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks) int error = 0; int tmp; - /* Clear the out buffer. */ + /* Initialize comm buffer and globals. */ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; if (kgdb_connected) { - unsigned char thref[8]; + unsigned char thref[BUF_THREAD_ID_SIZE]; char *ptr; /* Reply to host that an exception has occurred */ @@ -879,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks) put_packet(remcom_out_buffer); } - kgdb_usethread = kgdb_info[ks->cpu].task; - ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); - ks->pass_exception = 0; - while (1) { error = 0; @@ -907,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks) case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ gdb_cmd_memwrite(ks); break; +#if DBG_MAX_REG_NUM > 0 + case 'p': /* pXX Return gdb register XX (in hex) */ + gdb_cmd_reg_get(ks); + break; + case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */ + gdb_cmd_reg_set(ks); + break; +#endif /* DBG_MAX_REG_NUM > 0 */ case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ gdb_cmd_binwrite(ks); break; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 184cd8209c36..28b844118bbd 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value) if (endp == arg) { /* - * Try base 16, for us folks too lazy to type the + * Also try base 16, for us folks too lazy to type the * leading 0x... */ val = simple_strtoul(arg, &endp, 16); @@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value) return 0; } +int kdbgetu64arg(const char *arg, u64 *value) +{ + char *endp; + u64 val; + + val = simple_strtoull(arg, &endp, 0); + + if (endp == arg) { + + val = simple_strtoull(arg, &endp, 16); + if (endp == arg) + return KDB_BADINT; + } + + *value = val; + + return 0; +} + /* * kdb_set - This function implements the 'set' command. Alter an * existing environment variable or create a new one. @@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv) */ static int kdb_rd(int argc, const char **argv) { - int diag = kdb_check_regs(); - if (diag) - return diag; + int len = kdb_check_regs(); +#if DBG_MAX_REG_NUM > 0 + int i; + char *rname; + int rsize; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; + + if (len) + return len; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + rsize = dbg_reg_def[i].size * 2; + if (rsize > 16) + rsize = 2; + if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) { + len = 0; + kdb_printf("\n"); + } + if (len) + len += kdb_printf(" "); + switch(dbg_reg_def[i].size * 8) { + case 8: + rname = dbg_get_reg(i, ®8, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %02x", rname, reg8); + break; + case 16: + rname = dbg_get_reg(i, ®16, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %04x", rname, reg16); + break; + case 32: + rname = dbg_get_reg(i, ®32, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %08x", rname, reg32); + break; + case 64: + rname = dbg_get_reg(i, ®64, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %016llx", rname, reg64); + break; + default: + len += kdb_printf("%s: ??", dbg_reg_def[i].name); + } + } + kdb_printf("\n"); +#else + if (len) + return len; kdb_dumpregs(kdb_current_regs); +#endif return 0; } @@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv) * kdb_rm - This function implements the 'rm' (register modify) command. * rm register-name new-contents * Remarks: - * Currently doesn't allow modification of control or - * debug registers. + * Allows register modification with the same restrictions as gdb */ static int kdb_rm(int argc, const char **argv) { +#if DBG_MAX_REG_NUM > 0 int diag; - int ind = 0; - unsigned long contents; + const char *rname; + int i; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; if (argc != 2) return KDB_ARGCOUNT; /* * Allow presence or absence of leading '%' symbol. */ - if (argv[1][0] == '%') - ind = 1; + rname = argv[1]; + if (*rname == '%') + rname++; - diag = kdbgetularg(argv[2], &contents); + diag = kdbgetu64arg(argv[2], ®64); if (diag) return diag; diag = kdb_check_regs(); if (diag) return diag; + + diag = KDB_BADREG; + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + if (strcmp(rname, dbg_reg_def[i].name) == 0) { + diag = 0; + break; + } + } + if (!diag) { + switch(dbg_reg_def[i].size * 8) { + case 8: + reg8 = reg64; + dbg_set_reg(i, ®8, kdb_current_regs); + break; + case 16: + reg16 = reg64; + dbg_set_reg(i, ®16, kdb_current_regs); + break; + case 32: + reg32 = reg64; + dbg_set_reg(i, ®32, kdb_current_regs); + break; + case 64: + dbg_set_reg(i, ®64, kdb_current_regs); + break; + } + } + return diag; +#else kdb_printf("ERROR: Register set currently not implemented\n"); - return 0; + return 0; +#endif } #if defined(CONFIG_MAGIC_SYSRQ) @@ -1820,9 +1928,8 @@ static int kdb_sr(int argc, const char **argv) { if (argc != 1) return KDB_ARGCOUNT; - sysrq_toggle_support(1); kdb_trap_printk++; - handle_sysrq(*argv[1], NULL); + __handle_sysrq(*argv[1], NULL, 0); kdb_trap_printk--; return 0; @@ -1883,6 +1990,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); + kdb_printf(" 0x%p", mod->module_core); #ifdef CONFIG_MODULE_UNLOAD { @@ -2291,6 +2399,9 @@ static int kdb_ll(int argc, const char **argv) while (va) { char buf[80]; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); diag = kdb_parse(buf); if (diag) @@ -2437,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val) */ static int kdb_summary(int argc, const char **argv) { + struct timespec now; struct kdb_tm tm; struct sysinfo val; @@ -2451,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("domainname %s\n", init_uts_ns.name.domainname); kdb_printf("ccversion %s\n", __stringify(CCVERSION)); - kdb_gmtime(&xtime, &tm); + now = __current_kernel_time(); + kdb_gmtime(&now, &tm); kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " "tz_minuteswest %d\n", 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 97d3ba69775d..c438f545a321 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t); extern int kdb_putword(unsigned long, unsigned long, size_t); extern int kdbgetularg(const char *, unsigned long *); -extern int kdb_set(int, const char **); extern char *kdbgetenv(const char *); -extern int kdbgetintenv(const char *, int *); extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, long *, char **); extern int kdbgetsymval(const char *, kdb_symtab_t *); diff --git a/kernel/early_res.c b/kernel/early_res.c index 31aa9332ef3f..7bfae887f211 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -7,6 +7,8 @@ #include <linux/bootmem.h> #include <linux/mm.h> #include <linux/early_res.h> +#include <linux/slab.h> +#include <linux/kmemleak.h> /* * Early reserved memory areas. @@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + i = find_overlapped_early(start, end); r = &early_res[i]; if (i >= max_early_res || r->end != end || r->start != start) @@ -333,6 +337,8 @@ void __init free_early_partial(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + if (start == end) return; diff --git a/kernel/fork.c b/kernel/fork.c index b6cce14ba047..a82a65cef741 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -907,7 +907,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e996bd0f..ce669174f355 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) do { seq = read_seqbegin(&xtime_lock); xts = __current_kernel_time(); - tom = wall_to_monotonic; + tom = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); xtim = timespec_to_ktime(xts); @@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, static int hrtimer_get_target(int this_cpu, int pinned) { #ifdef CONFIG_NO_HZ - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - return preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) + return get_nohz_timer_target(); #endif return this_cpu; } @@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; - struct timespec realtime_offset; + struct timespec realtime_offset, wtm; unsigned long seq; if (!hrtimer_hres_active()) @@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg) do { seq = read_seqbegin(&xtime_lock); - set_normalized_timespec(&realtime_offset, - -wall_to_monotonic.tv_sec, - -wall_to_monotonic.tv_nsec); + wtm = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); + set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 7a56b22e0602..d71a987fd2bf 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -41,6 +41,7 @@ #include <linux/sched.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/list.h> #include <linux/cpu.h> #include <linux/smp.h> @@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); static int nr_slots[TYPE_MAX]; +/* Keep track of the breakpoints attached to tasks */ +static LIST_HEAD(bp_task_head); + static int constraints_initialized; /* Gather the number of total pinned and un-pinned bp in a cpuset */ @@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) return 0; } -static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) +/* + * Count the number of breakpoints of the same type and same task. + * The given event must be not on the list. + */ +static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) { - struct perf_event_context *ctx = tsk->perf_event_ctxp; - struct list_head *list; - struct perf_event *bp; - unsigned long flags; + struct perf_event_context *ctx = bp->ctx; + struct perf_event *iter; int count = 0; - if (WARN_ONCE(!ctx, "No perf context for this task")) - return 0; - - list = &ctx->event_list; - - raw_spin_lock_irqsave(&ctx->lock, flags); - - /* - * The current breakpoint counter is not included in the list - * at the open() callback time - */ - list_for_each_entry(bp, list, event_entry) { - if (bp->attr.type == PERF_TYPE_BREAKPOINT) - if (find_slot_idx(bp) == type) - count += hw_breakpoint_weight(bp); + list_for_each_entry(iter, &bp_task_head, hw.bp_list) { + if (iter->ctx == ctx && find_slot_idx(iter) == type) + count += hw_breakpoint_weight(iter); } - raw_spin_unlock_irqrestore(&ctx->lock, flags); - return count; } @@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(tsk, type); + slots->pinned += task_bp_pinned(bp, type); slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; @@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(tsk, type); + nr += task_bp_pinned(bp, type); if (nr > slots->pinned) slots->pinned = nr; @@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) /* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, enum bp_type_idx type, int weight) { unsigned int *tsk_pinned; @@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, int old_idx = 0; int idx = 0; - old_count = task_bp_pinned(tsk, type); + old_count = task_bp_pinned(bp, type); old_idx = old_count - 1; idx = old_idx + weight; + /* tsk_pinned[n] is the number of tasks having n breakpoints */ tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { tsk_pinned[idx]++; @@ -222,23 +215,41 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; + /* Pinned counter cpu profiling */ + if (!tsk) { + + if (enable) + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; + else + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + return; + } + /* Pinned counter task profiling */ - if (tsk) { - if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable, type, weight); - return; - } + if (!enable) + list_del(&bp->hw.bp_list); + + if (cpu >= 0) { + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } else { for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable, type, weight); - return; + toggle_bp_task_slot(bp, cpu, enable, type, weight); } - /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; - else - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + list_add_tail(&bp->hw.bp_list, &bp_task_head); +} + +/* + * Function to perform processor-specific cleanup during unregistration + */ +__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) +{ + /* + * A weak stub function here for those archs that don't define + * it inside arch/.../kernel/hw_breakpoint.c + */ } /* @@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp) weight = hw_breakpoint_weight(bp); fetch_bp_busy_slots(&slots, bp, type); + /* + * Simulate the addition of this breakpoint to the constraints + * and see the result. + */ fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ @@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); + arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e1497481fe8a..c3003e9d91a3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) { if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_TIMER)) + if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; desc->status |= IRQ_SUSPENDED; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 83911c780175..2dc3786349d1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -14,6 +14,8 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/freezer.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -35,6 +37,7 @@ struct kthread_create_info struct kthread { int should_stop; + void *data; struct completion exited; }; @@ -54,6 +57,19 @@ int kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); +/** + * kthread_data - return data value specified on kthread creation + * @task: kthread task in question + * + * Return the data value specified when kthread @task was created. + * The caller is responsible for ensuring the validity of @task when + * calling this function. + */ +void *kthread_data(struct task_struct *task) +{ + return to_kthread(task)->data; +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -64,6 +80,7 @@ static int kthread(void *_create) int ret; self.should_stop = 0; + self.data = data; init_completion(&self.exited); current->vfork_done = &self.exited; @@ -247,3 +264,150 @@ int kthreadd(void *unused) return 0; } + +/** + * kthread_worker_fn - kthread function to process kthread_worker + * @worker_ptr: pointer to initialized kthread_worker + * + * This function can be used as @threadfn to kthread_create() or + * kthread_run() with @worker_ptr argument pointing to an initialized + * kthread_worker. The started kthread will process work_list until + * the it is stopped with kthread_stop(). A kthread can also call + * this function directly after extra initialization. + * + * Different kthreads can be used for the same kthread_worker as long + * as there's only one kthread attached to it at any given time. A + * kthread_worker without an attached kthread simply collects queued + * kthread_works. + */ +int kthread_worker_fn(void *worker_ptr) +{ + struct kthread_worker *worker = worker_ptr; + struct kthread_work *work; + + WARN_ON(worker->task); + worker->task = current; +repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + spin_lock_irq(&worker->lock); + worker->task = NULL; + spin_unlock_irq(&worker->lock); + return 0; + } + + work = NULL; + spin_lock_irq(&worker->lock); + if (!list_empty(&worker->work_list)) { + work = list_first_entry(&worker->work_list, + struct kthread_work, node); + list_del_init(&work->node); + } + spin_unlock_irq(&worker->lock); + + if (work) { + __set_current_state(TASK_RUNNING); + work->func(work); + smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ + work->done_seq = work->queue_seq; + smp_mb(); /* mb worker-b1 paired with flush-b0 */ + if (atomic_read(&work->flushing)) + wake_up_all(&work->done); + } else if (!freezing(current)) + schedule(); + + try_to_freeze(); + goto repeat; +} +EXPORT_SYMBOL_GPL(kthread_worker_fn); + +/** + * queue_kthread_work - queue a kthread_work + * @worker: target kthread_worker + * @work: kthread_work to queue + * + * Queue @work to work processor @task for async execution. @task + * must have been created with kthread_worker_create(). Returns %true + * if @work was successfully queued, %false if it was already pending. + */ +bool queue_kthread_work(struct kthread_worker *worker, + struct kthread_work *work) +{ + bool ret = false; + unsigned long flags; + + spin_lock_irqsave(&worker->lock, flags); + if (list_empty(&work->node)) { + list_add_tail(&work->node, &worker->work_list); + work->queue_seq++; + if (likely(worker->task)) + wake_up_process(worker->task); + ret = true; + } + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_kthread_work); + +/** + * flush_kthread_work - flush a kthread_work + * @work: work to flush + * + * If @work is queued or executing, wait for it to finish execution. + */ +void flush_kthread_work(struct kthread_work *work) +{ + int seq = work->queue_seq; + + atomic_inc(&work->flushing); + + /* + * mb flush-b0 paired with worker-b1, to make sure either + * worker sees the above increment or we see done_seq update. + */ + smp_mb__after_atomic_inc(); + + /* A - B <= 0 tests whether B is in front of A regardless of overflow */ + wait_event(work->done, seq - work->done_seq <= 0); + atomic_dec(&work->flushing); + + /* + * rmb flush-b1 paired with worker-b0, to make sure our caller + * sees every change made by work->func(). + */ + smp_mb__after_atomic_dec(); +} +EXPORT_SYMBOL_GPL(flush_kthread_work); + +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + +/** + * flush_kthread_worker - flush all current works on a kthread_worker + * @worker: worker to flush + * + * Wait until all currently executing or pending works on @worker are + * finished. + */ +void flush_kthread_worker(struct kthread_worker *worker) +{ + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + + queue_kthread_work(worker, &fwork.work); + wait_for_completion(&fwork.done); +} +EXPORT_SYMBOL_GPL(flush_kthread_worker); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 54286798c37b..f2852a510232 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], static inline u64 lockstat_clock(void) { - return cpu_clock(smp_processor_id()); + return local_clock(); } static int lock_point(unsigned long points[], unsigned long ip) diff --git a/kernel/module.c b/kernel/module.c index 5d2d28197c82..d0b5f8db11b4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,6 +1,6 @@ /* Copyright (C) 2002 Richard Henderson - Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. + Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -110,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb) } EXPORT_SYMBOL(unregister_module_notifier); +struct load_info { + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long *strmap; + unsigned long symoffs, stroffs; + struct _ddebug *debug; + unsigned int num_debug; + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + /* We require a truly strong try_module_get(): 0 means failure due to ongoing or failed initialization etc. */ static inline int strong_try_module_get(struct module *mod) @@ -140,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code) EXPORT_SYMBOL(__module_put_and_exit); /* Find a module section: 0 means not found. */ -static unsigned int find_sec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings, - const char *name) +static unsigned int find_sec(const struct load_info *info, const char *name) { unsigned int i; - for (i = 1; i < hdr->e_shnum; i++) + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; /* Alloc bit cleared means "ignore it." */ - if ((sechdrs[i].sh_flags & SHF_ALLOC) - && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) + if ((shdr->sh_flags & SHF_ALLOC) + && strcmp(info->secstrings + shdr->sh_name, name) == 0) return i; + } return 0; } /* Find a module section, or NULL. */ -static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, - const char *secstrings, const char *name) +static void *section_addr(const struct load_info *info, const char *name) { /* Section 0 has sh_addr 0. */ - return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; + return (void *)info->sechdrs[find_sec(info, name)].sh_addr; } /* Find a module section, or NULL. Fill in number of "objects" in section. */ -static void *section_objs(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings, +static void *section_objs(const struct load_info *info, const char *name, size_t object_size, unsigned int *num) { - unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); + unsigned int sec = find_sec(info, name); /* Section 0 has sh_addr 0 and sh_size 0. */ - *num = sechdrs[sec].sh_size / object_size; - return (void *)sechdrs[sec].sh_addr; + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; } /* Provided by the linker */ @@ -227,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, unsigned int symnum, void *data), void *data) { struct module *mod; - const struct symsearch arr[] = { + static const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, NOT_GPL_ONLY, false }, { __start___ksymtab_gpl, __stop___ksymtab_gpl, @@ -392,7 +402,8 @@ static int percpu_modalloc(struct module *mod, mod->percpu = __alloc_reserved_percpu(size, align); if (!mod->percpu) { printk(KERN_WARNING - "Could not allocate %lu bytes percpu data\n", size); + "%s: Could not allocate %lu bytes percpu data\n", + mod->name, size); return -ENOMEM; } mod->percpu_size = size; @@ -404,11 +415,9 @@ static void percpu_modfree(struct module *mod) free_percpu(mod->percpu); } -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static unsigned int find_pcpusec(struct load_info *info) { - return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); + return find_sec(info, ".data..percpu"); } static void percpu_modcopy(struct module *mod, @@ -468,9 +477,7 @@ static inline int percpu_modalloc(struct module *mod, static inline void percpu_modfree(struct module *mod) { } -static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static unsigned int find_pcpusec(struct load_info *info) { return 0; } @@ -524,21 +531,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; EXPORT_TRACEPOINT_SYMBOL(module_get); /* Init the unload section of the module. */ -static void module_unload_init(struct module *mod) +static int module_unload_init(struct module *mod) { - int cpu; + mod->refptr = alloc_percpu(struct module_ref); + if (!mod->refptr) + return -ENOMEM; INIT_LIST_HEAD(&mod->source_list); INIT_LIST_HEAD(&mod->target_list); - for_each_possible_cpu(cpu) { - per_cpu_ptr(mod->refptr, cpu)->incs = 0; - per_cpu_ptr(mod->refptr, cpu)->decs = 0; - } /* Hold reference count during initialization. */ __this_cpu_write(mod->refptr->incs, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; + + return 0; } /* Does a already use b? */ @@ -618,6 +625,8 @@ static void module_unload_free(struct module *mod) kfree(use); } mutex_unlock(&module_mutex); + + free_percpu(mod->refptr); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -787,7 +796,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - ddebug_remove_module(mod->name); free_module(mod); return 0; @@ -892,8 +900,9 @@ int ref_module(struct module *a, struct module *b) } EXPORT_SYMBOL_GPL(ref_module); -static inline void module_unload_init(struct module *mod) +static inline int module_unload_init(struct module *mod) { + return 0; } #endif /* CONFIG_MODULE_UNLOAD */ @@ -1052,10 +1061,9 @@ static inline int same_magic(const char *amagic, const char *bmagic, #endif /* CONFIG_MODVERSIONS */ /* Resolve a symbol for this module. I.e. if we find one, record usage. */ -static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, - unsigned int versindex, +static const struct kernel_symbol *resolve_symbol(struct module *mod, + const struct load_info *info, const char *name, - struct module *mod, char ownername[]) { struct module *owner; @@ -1069,7 +1077,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, if (!sym) goto unlock; - if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { + if (!check_version(info->sechdrs, info->index.vers, name, mod, crc, + owner)) { sym = ERR_PTR(-EINVAL); goto getname; } @@ -1088,21 +1097,20 @@ unlock: return sym; } -static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, - unsigned int versindex, - const char *name, - struct module *mod) +static const struct kernel_symbol * +resolve_symbol_wait(struct module *mod, + const struct load_info *info, + const char *name) { const struct kernel_symbol *ksym; - char ownername[MODULE_NAME_LEN]; + char owner[MODULE_NAME_LEN]; if (wait_event_interruptible_timeout(module_wq, - !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, - mod, ownername)) || - PTR_ERR(ksym) != -EBUSY, + !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) + || PTR_ERR(ksym) != -EBUSY, 30 * HZ) <= 0) { printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", - mod->name, ownername); + mod->name, owner); } return ksym; } @@ -1111,8 +1119,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, * /sys/module/foo/sections stuff * J. Corbet <corbet@lwn.net> */ -#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) +#ifdef CONFIG_SYSFS +#ifdef CONFIG_KALLSYMS static inline bool sect_empty(const Elf_Shdr *sect) { return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; @@ -1149,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) kfree(sect_attrs); } -static void add_sect_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) +static void add_sect_attrs(struct module *mod, const struct load_info *info) { unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; @@ -1158,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, struct attribute **gattr; /* Count loaded sections and allocate structures */ - for (i = 0; i < nsect; i++) - if (!sect_empty(&sechdrs[i])) + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i])) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1176,11 +1184,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, sect_attrs->nsections = 0; sattr = §_attrs->attrs[0]; gattr = §_attrs->grp.attrs[0]; - for (i = 0; i < nsect; i++) { - if (sect_empty(&sechdrs[i])) + for (i = 0; i < info->hdr->e_shnum; i++) { + Elf_Shdr *sec = &info->sechdrs[i]; + if (sect_empty(sec)) continue; - sattr->address = sechdrs[i].sh_addr; - sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, + sattr->address = sec->sh_addr; + sattr->name = kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); if (sattr->name == NULL) goto out; @@ -1248,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, kfree(notes_attrs); } -static void add_notes_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) +static void add_notes_attrs(struct module *mod, const struct load_info *info) { unsigned int notes, loaded, i; struct module_notes_attrs *notes_attrs; @@ -1261,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, /* Count notes sections and allocate structures. */ notes = 0; - for (i = 0; i < nsect; i++) - if (!sect_empty(&sechdrs[i]) && - (sechdrs[i].sh_type == SHT_NOTE)) + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i]) && + (info->sechdrs[i].sh_type == SHT_NOTE)) ++notes; if (notes == 0) @@ -1277,15 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, notes_attrs->notes = notes; nattr = ¬es_attrs->attrs[0]; - for (loaded = i = 0; i < nsect; ++i) { - if (sect_empty(&sechdrs[i])) + for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { + if (sect_empty(&info->sechdrs[i])) continue; - if (sechdrs[i].sh_type == SHT_NOTE) { + if (info->sechdrs[i].sh_type == SHT_NOTE) { sysfs_bin_attr_init(nattr); nattr->attr.name = mod->sect_attrs->attrs[loaded].name; nattr->attr.mode = S_IRUGO; - nattr->size = sechdrs[i].sh_size; - nattr->private = (void *) sechdrs[i].sh_addr; + nattr->size = info->sechdrs[i].sh_size; + nattr->private = (void *) info->sechdrs[i].sh_addr; nattr->read = module_notes_read; ++nattr; } @@ -1316,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod) #else -static inline void add_sect_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) +static inline void add_sect_attrs(struct module *mod, + const struct load_info *info) { } @@ -1325,17 +1333,16 @@ static inline void remove_sect_attrs(struct module *mod) { } -static inline void add_notes_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) +static inline void add_notes_attrs(struct module *mod, + const struct load_info *info) { } static inline void remove_notes_attrs(struct module *mod) { } -#endif +#endif /* CONFIG_KALLSYMS */ -#ifdef CONFIG_SYSFS static void add_usage_links(struct module *mod) { #ifdef CONFIG_MODULE_UNLOAD @@ -1440,6 +1447,7 @@ out: } static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, struct kernel_param *kparam, unsigned int num_params) { @@ -1464,6 +1472,8 @@ static int mod_sysfs_setup(struct module *mod, goto out_unreg_param; add_usage_links(mod); + add_sect_attrs(mod, info); + add_notes_attrs(mod, info); kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; @@ -1480,33 +1490,26 @@ out: static void mod_sysfs_fini(struct module *mod) { + remove_notes_attrs(mod); + remove_sect_attrs(mod); kobject_put(&mod->mkobj.kobj); } -#else /* CONFIG_SYSFS */ - -static inline int mod_sysfs_init(struct module *mod) -{ - return 0; -} +#else /* !CONFIG_SYSFS */ -static inline int mod_sysfs_setup(struct module *mod, +static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, struct kernel_param *kparam, unsigned int num_params) { return 0; } -static inline int module_add_modinfo_attrs(struct module *mod) -{ - return 0; -} - -static inline void module_remove_modinfo_attrs(struct module *mod) +static void mod_sysfs_fini(struct module *mod) { } -static void mod_sysfs_fini(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod) { } @@ -1516,7 +1519,7 @@ static void del_usage_links(struct module *mod) #endif /* CONFIG_SYSFS */ -static void mod_kobject_remove(struct module *mod) +static void mod_sysfs_teardown(struct module *mod) { del_usage_links(mod); module_remove_modinfo_attrs(mod); @@ -1546,9 +1549,10 @@ static void free_module(struct module *mod) mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); mutex_unlock(&module_mutex); - remove_notes_attrs(mod); - remove_sect_attrs(mod); - mod_kobject_remove(mod); + mod_sysfs_teardown(mod); + + /* Remove dynamic debug info */ + ddebug_remove_module(mod->name); /* Arch-specific cleanup. */ module_arch_cleanup(mod); @@ -1563,10 +1567,7 @@ static void free_module(struct module *mod) module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); -#if defined(CONFIG_MODULE_UNLOAD) - if (mod->refptr) - free_percpu(mod->refptr); -#endif + /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -1632,25 +1633,23 @@ static int verify_export_symbols(struct module *mod) } /* Change all symbols so that st_value encodes the pointer directly. */ -static int simplify_symbols(Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab, - unsigned int versindex, - unsigned int pcpuindex, - struct module *mod) -{ - Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; +static int simplify_symbols(struct module *mod, const struct load_info *info) +{ + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + Elf_Sym *sym = (void *)symsec->sh_addr; unsigned long secbase; - unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); + unsigned int i; int ret = 0; const struct kernel_symbol *ksym; - for (i = 1; i < n; i++) { + for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { + const char *name = info->strtab + sym[i].st_name; + switch (sym[i].st_shndx) { case SHN_COMMON: /* We compiled with -fno-common. These are not supposed to happen. */ - DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); + DEBUGP("Common symbol: %s\n", name); printk("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; @@ -1663,9 +1662,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - ksym = resolve_symbol_wait(sechdrs, versindex, - strtab + sym[i].st_name, - mod); + ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. */ if (ksym && !IS_ERR(ksym)) { sym[i].st_value = ksym->value; @@ -1677,17 +1674,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", - mod->name, strtab + sym[i].st_name, - PTR_ERR(ksym)); + mod->name, name, PTR_ERR(ksym)); ret = PTR_ERR(ksym) ?: -ENOENT; break; default: /* Divert to percpu allocation if a percpu var. */ - if (sym[i].st_shndx == pcpuindex) + if (sym[i].st_shndx == info->index.pcpu) secbase = (unsigned long)mod_percpu(mod); else - secbase = sechdrs[sym[i].st_shndx].sh_addr; + secbase = info->sechdrs[sym[i].st_shndx].sh_addr; sym[i].st_value += secbase; break; } @@ -1696,6 +1692,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs, return ret; } +static int apply_relocations(struct module *mod, const struct load_info *info) +{ + unsigned int i; + int err = 0; + + /* Now do relocations. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + unsigned int infosec = info->sechdrs[i].sh_info; + + /* Not a valid relocation section? */ + if (infosec >= info->hdr->e_shnum) + continue; + + /* Don't bother with non-allocated sections */ + if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) + continue; + + if (info->sechdrs[i].sh_type == SHT_REL) + err = apply_relocate(info->sechdrs, info->strtab, + info->index.sym, i, mod); + else if (info->sechdrs[i].sh_type == SHT_RELA) + err = apply_relocate_add(info->sechdrs, info->strtab, + info->index.sym, i, mod); + if (err < 0) + break; + } + return err; +} + /* Additional bytes needed by arch in front of individual sections */ unsigned int __weak arch_mod_section_prepend(struct module *mod, unsigned int section) @@ -1720,10 +1745,7 @@ static long get_offset(struct module *mod, unsigned int *size, might -- code, read-only data, read-write data, small data. Tally sizes, and place the offsets into sh_entsize fields: high bit means it belongs in init. */ -static void layout_sections(struct module *mod, - const Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static void layout_sections(struct module *mod, struct load_info *info) { static unsigned long const masks[][2] = { /* NOTE: all executable code must be the first section @@ -1736,21 +1758,22 @@ static void layout_sections(struct module *mod, }; unsigned int m, i; - for (i = 0; i < hdr->e_shnum; i++) - sechdrs[i].sh_entsize = ~0UL; + for (i = 0; i < info->hdr->e_shnum; i++) + info->sechdrs[i].sh_entsize = ~0UL; DEBUGP("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(secstrings + s->sh_name, ".init")) + || strstarts(sname, ".init")) continue; s->sh_entsize = get_offset(mod, &mod->core_size, s, i); - DEBUGP("\t%s\n", secstrings + s->sh_name); + DEBUGP("\t%s\n", name); } if (m == 0) mod->core_text_size = mod->core_size; @@ -1758,17 +1781,18 @@ static void layout_sections(struct module *mod, DEBUGP("Init section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(secstrings + s->sh_name, ".init")) + || !strstarts(sname, ".init")) continue; s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); - DEBUGP("\t%s\n", secstrings + s->sh_name); + DEBUGP("\t%s\n", sname); } if (m == 0) mod->init_text_size = mod->init_size; @@ -1807,33 +1831,28 @@ static char *next_string(char *string, unsigned long *secsize) return string; } -static char *get_modinfo(Elf_Shdr *sechdrs, - unsigned int info, - const char *tag) +static char *get_modinfo(struct load_info *info, const char *tag) { char *p; unsigned int taglen = strlen(tag); - unsigned long size = sechdrs[info].sh_size; + Elf_Shdr *infosec = &info->sechdrs[info->index.info]; + unsigned long size = infosec->sh_size; - for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { + for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') return p + taglen + 1; } return NULL; } -static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, - unsigned int infoindex) +static void setup_modinfo(struct module *mod, struct load_info *info) { struct module_attribute *attr; int i; for (i = 0; (attr = modinfo_attrs[i]); i++) { if (attr->setup) - attr->setup(mod, - get_modinfo(sechdrs, - infoindex, - attr->attr.name)); + attr->setup(mod, get_modinfo(info, attr->attr.name)); } } @@ -1874,11 +1893,10 @@ static int is_exported(const char *name, unsigned long value, } /* As per nm */ -static char elf_type(const Elf_Sym *sym, - Elf_Shdr *sechdrs, - const char *secstrings, - struct module *mod) +static char elf_type(const Elf_Sym *sym, const struct load_info *info) { + const Elf_Shdr *sechdrs = info->sechdrs; + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) return 'v'; @@ -1908,8 +1926,10 @@ static char elf_type(const Elf_Sym *sym, else return 'b'; } - if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) + if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, + ".debug")) { return 'n'; + } return '?'; } @@ -1934,127 +1954,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, return true; } -static unsigned long layout_symtab(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const Elf_Ehdr *hdr, - const char *secstrings, - unsigned long *pstroffs, - unsigned long *strmap) +static void layout_symtab(struct module *mod, struct load_info *info) { - unsigned long symoffs; - Elf_Shdr *symsect = sechdrs + symindex; - Elf_Shdr *strsect = sechdrs + strindex; + Elf_Shdr *symsect = info->sechdrs + info->index.sym; + Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - const char *strtab; unsigned int i, nsrc, ndst; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, - symindex) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", secstrings + symsect->sh_name); + info->index.sym) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); - src = (void *)hdr + symsect->sh_offset; + src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - strtab = (void *)hdr + strsect->sh_offset; for (ndst = i = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { + if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { unsigned int j = src->st_name; - while(!__test_and_set_bit(j, strmap) && strtab[j]) + while (!__test_and_set_bit(j, info->strmap) + && info->strtab[j]) ++j; ++ndst; } /* Append room for core symbols at end of core part. */ - symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - mod->core_size = symoffs + ndst * sizeof(Elf_Sym); + info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, - strindex) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", secstrings + strsect->sh_name); + info->index.str) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); /* Append room for core symbols' strings at end of core part. */ - *pstroffs = mod->core_size; - __set_bit(0, strmap); - mod->core_size += bitmap_weight(strmap, strsect->sh_size); - - return symoffs; + info->stroffs = mod->core_size; + __set_bit(0, info->strmap); + mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); } -static void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int shnum, - unsigned int symindex, - unsigned int strindex, - unsigned long symoffs, - unsigned long stroffs, - const char *secstrings, - unsigned long *strmap) +static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; const Elf_Sym *src; Elf_Sym *dst; char *s; + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)sechdrs[symindex].sh_addr; - mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); - mod->strtab = (void *)sechdrs[strindex].sh_addr; + mod->symtab = (void *)symsec->sh_addr; + mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Make sure we get permanent strtab: don't use info->strtab. */ + mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info - = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); + mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - mod->core_symtab = dst = mod->module_core + symoffs; + mod->core_symtab = dst = mod->module_core + info->symoffs; src = mod->symtab; *dst = *src; for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { - if (!is_core_symbol(src, sechdrs, shnum)) + if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) continue; dst[ndst] = *src; - dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); + dst[ndst].st_name = bitmap_weight(info->strmap, + dst[ndst].st_name); ++ndst; } mod->core_num_syms = ndst; - mod->core_strtab = s = mod->module_core + stroffs; - for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) - if (test_bit(i, strmap)) + mod->core_strtab = s = mod->module_core + info->stroffs; + for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) + if (test_bit(i, info->strmap)) *++s = mod->strtab[i]; } #else -static inline unsigned long layout_symtab(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const Elf_Ehdr *hdr, - const char *secstrings, - unsigned long *pstroffs, - unsigned long *strmap) +static inline void layout_symtab(struct module *mod, struct load_info *info) { - return 0; } -static inline void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int shnum, - unsigned int symindex, - unsigned int strindex, - unsigned long symoffs, - unsigned long stroffs, - const char *secstrings, - const unsigned long *strmap) +static void add_kallsyms(struct module *mod, struct load_info *info) { } #endif /* CONFIG_KALLSYMS */ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) { + if (!debug) + return; #ifdef CONFIG_DYNAMIC_DEBUG if (ddebug_add_module(debug, num, debug->modname)) printk(KERN_ERR "dynamic debug error adding module: %s\n", @@ -2085,65 +2074,47 @@ static void *module_alloc_update_bounds(unsigned long size) } #ifdef CONFIG_DEBUG_KMEMLEAK -static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, char *secstrings) +static void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { unsigned int i; /* only scan the sections containing data */ kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - for (i = 1; i < hdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + for (i = 1; i < info->hdr->e_shnum; i++) { + const char *name = info->secstrings + info->sechdrs[i].sh_name; + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) continue; - if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 - && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) + if (!strstarts(name, ".data") && !strstarts(name, ".bss")) continue; - kmemleak_scan_area((void *)sechdrs[i].sh_addr, - sechdrs[i].sh_size, GFP_KERNEL); + kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, + info->sechdrs[i].sh_size, GFP_KERNEL); } } #else -static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, char *secstrings) +static inline void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { } #endif -/* Allocate and load the module: note that size of section 0 is always - zero, and we rely on this for optional sections. */ -static noinline struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) +/* Sets info->hdr and info->len. */ +static int copy_and_check(struct load_info *info, + const void __user *umod, unsigned long len, + const char __user *uargs) { + int err; Elf_Ehdr *hdr; - Elf_Shdr *sechdrs; - char *secstrings, *args, *modmagic, *strtab = NULL; - char *staging; - unsigned int i; - unsigned int symindex = 0; - unsigned int strindex = 0; - unsigned int modindex, versindex, infoindex, pcpuindex; - struct module *mod; - long err = 0; - void *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long symoffs, stroffs, *strmap; - void __percpu *percpu; - struct _ddebug *debug = NULL; - unsigned int num_debug = 0; - - mm_segment_t old_fs; - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); if (len < sizeof(*hdr)) - return ERR_PTR(-ENOEXEC); + return -ENOEXEC; /* Suck in entire file: we'll want most of it. */ /* vmalloc barfs on "unusual" numbers. Check here */ if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) - return ERR_PTR(-ENOMEM); + return -ENOMEM; if (copy_from_user(hdr, umod, len) != 0) { err = -EFAULT; @@ -2151,135 +2122,225 @@ static noinline struct module *load_module(void __user *umod, } /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ + weird elf version */ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 || hdr->e_type != ET_REL || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(*sechdrs)) { + || hdr->e_shentsize != sizeof(Elf_Shdr)) { + err = -ENOEXEC; + goto free_hdr; + } + + if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { err = -ENOEXEC; goto free_hdr; } - if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) - goto truncated; + info->hdr = hdr; + info->len = len; + return 0; + +free_hdr: + vfree(hdr); + return err; +} + +static void free_copy(struct load_info *info) +{ + vfree(info->hdr); +} + +static int rewrite_section_headers(struct load_info *info) +{ + unsigned int i; - /* Convenience variables */ - sechdrs = (void *)hdr + hdr->e_shoff; - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - sechdrs[0].sh_addr = 0; + /* This should always be true, but let's be sure. */ + info->sechdrs[0].sh_addr = 0; - for (i = 1; i < hdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_NOBITS - && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) - goto truncated; + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (shdr->sh_type != SHT_NOBITS + && info->len < shdr->sh_offset + shdr->sh_size) { + printk(KERN_ERR "Module len %lu truncated\n", + info->len); + return -ENOEXEC; + } /* Mark all sections sh_addr with their address in the temporary image. */ - sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; + shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; - /* Internal symbols and strings. */ - if (sechdrs[i].sh_type == SHT_SYMTAB) { - symindex = i; - strindex = sechdrs[i].sh_link; - strtab = (char *)hdr + sechdrs[strindex].sh_offset; - } #ifndef CONFIG_MODULE_UNLOAD /* Don't load .exit sections */ - if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) - sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; + if (strstarts(info->secstrings+shdr->sh_name, ".exit")) + shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; #endif } - modindex = find_sec(hdr, sechdrs, secstrings, - ".gnu.linkonce.this_module"); - if (!modindex) { + /* Track but don't keep modinfo and version sections. */ + info->index.vers = find_sec(info, "__versions"); + info->index.info = find_sec(info, ".modinfo"); + info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + return 0; +} + +/* + * Set up our basic convenience variables (pointers to section headers, + * search for module section index etc), and do some basic section + * verification. + * + * Return the temporary module pointer (we'll replace it with the final + * one when we move the module sections around). + */ +static struct module *setup_load_info(struct load_info *info) +{ + unsigned int i; + int err; + struct module *mod; + + /* Set up the convenience variables */ + info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; + info->secstrings = (void *)info->hdr + + info->sechdrs[info->hdr->e_shstrndx].sh_offset; + + err = rewrite_section_headers(info); + if (err) + return ERR_PTR(err); + + /* Find internal symbols and strings. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + if (info->sechdrs[i].sh_type == SHT_SYMTAB) { + info->index.sym = i; + info->index.str = info->sechdrs[i].sh_link; + info->strtab = (char *)info->hdr + + info->sechdrs[info->index.str].sh_offset; + break; + } + } + + info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); + if (!info->index.mod) { printk(KERN_WARNING "No module found in object\n"); - err = -ENOEXEC; - goto free_hdr; + return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ - mod = (void *)sechdrs[modindex].sh_addr; + mod = (void *)info->sechdrs[info->index.mod].sh_addr; - if (symindex == 0) { + if (info->index.sym == 0) { printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", mod->name); - err = -ENOEXEC; - goto free_hdr; + return ERR_PTR(-ENOEXEC); } - versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); - infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); - pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); - - /* Don't keep modinfo and version sections. */ - sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->index.pcpu = find_pcpusec(info); /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(sechdrs, versindex, mod)) { - err = -ENOEXEC; - goto free_hdr; - } + if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) + return ERR_PTR(-ENOEXEC); + + return mod; +} + +static int check_modinfo(struct module *mod, struct load_info *info) +{ + const char *modmagic = get_modinfo(info, "vermagic"); + int err; - modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { err = try_to_force_load(mod, "bad vermagic"); if (err) - goto free_hdr; - } else if (!same_magic(modmagic, vermagic, versindex)) { + return err; + } else if (!same_magic(modmagic, vermagic, info->index.vers)) { printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", mod->name, modmagic, vermagic); - err = -ENOEXEC; - goto free_hdr; + return -ENOEXEC; } - staging = get_modinfo(sechdrs, infoindex, "staging"); - if (staging) { + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP); printk(KERN_WARNING "%s: module is from the staging directory," " the quality is unknown, you have been warned.\n", mod->name); } - /* Now copy in args */ - args = strndup_user(uargs, ~0UL >> 1); - if (IS_ERR(args)) { - err = PTR_ERR(args); - goto free_hdr; - } + /* Set up license info based on the info section */ + set_license(mod, get_modinfo(info, "license")); - strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) - * sizeof(long), GFP_KERNEL); - if (!strmap) { - err = -ENOMEM; - goto free_mod; - } + return 0; +} - mod->state = MODULE_STATE_COMING; +static void find_module_sections(struct module *mod, struct load_info *info) +{ + mod->kp = section_objs(info, "__param", + sizeof(*mod->kp), &mod->num_kp); + mod->syms = section_objs(info, "__ksymtab", + sizeof(*mod->syms), &mod->num_syms); + mod->crcs = section_addr(info, "__kcrctab"); + mod->gpl_syms = section_objs(info, "__ksymtab_gpl", + sizeof(*mod->gpl_syms), + &mod->num_gpl_syms); + mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); + mod->gpl_future_syms = section_objs(info, + "__ksymtab_gpl_future", + sizeof(*mod->gpl_future_syms), + &mod->num_gpl_future_syms); + mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future"); - /* Allow arches to frob section contents and sizes. */ - err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); - if (err < 0) - goto free_mod; +#ifdef CONFIG_UNUSED_SYMBOLS + mod->unused_syms = section_objs(info, "__ksymtab_unused", + sizeof(*mod->unused_syms), + &mod->num_unused_syms); + mod->unused_crcs = section_addr(info, "__kcrctab_unused"); + mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl", + sizeof(*mod->unused_gpl_syms), + &mod->num_unused_gpl_syms); + mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); +#endif +#ifdef CONFIG_CONSTRUCTORS + mod->ctors = section_objs(info, ".ctors", + sizeof(*mod->ctors), &mod->num_ctors); +#endif - if (pcpuindex) { - /* We have a special allocation for this section. */ - err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign); - if (err) - goto free_mod; - sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - } - /* Keep this around for failure path. */ - percpu = mod_percpu(mod); +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = section_objs(info, "__tracepoints", + sizeof(*mod->tracepoints), + &mod->num_tracepoints); +#endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(info, "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. + */ + kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * + mod->num_trace_events, GFP_KERNEL); +#endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(info, "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif - /* Determine total sizes, and put offsets in sh_entsize. For now - this is done generically; there doesn't appear to be any - special cases for the architectures. */ - layout_sections(mod, hdr, sechdrs, secstrings); - symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, - secstrings, &stroffs, strmap); + mod->extable = section_objs(info, "__ex_table", + sizeof(*mod->extable), &mod->num_exentries); + + if (section_addr(info, "__obsparm")) + printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", + mod->name); + + info->debug = section_objs(info, "__verbose", + sizeof(*info->debug), &info->num_debug); +} + +static int move_module(struct module *mod, struct load_info *info) +{ + int i; + void *ptr; /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); @@ -2289,10 +2350,9 @@ static noinline struct module *load_module(void __user *umod, * leak. */ kmemleak_not_leak(ptr); - if (!ptr) { - err = -ENOMEM; - goto free_percpu; - } + if (!ptr) + return -ENOMEM; + memset(ptr, 0, mod->core_size); mod->module_core = ptr; @@ -2305,50 +2365,40 @@ static noinline struct module *load_module(void __user *umod, */ kmemleak_ignore(ptr); if (!ptr && mod->init_size) { - err = -ENOMEM; - goto free_core; + module_free(mod, mod->module_core); + return -ENOMEM; } memset(ptr, 0, mod->init_size); mod->module_init = ptr; /* Transfer each section which specifies SHF_ALLOC */ DEBUGP("final section addresses:\n"); - for (i = 0; i < hdr->e_shnum; i++) { + for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; + Elf_Shdr *shdr = &info->sechdrs[i]; - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + if (!(shdr->sh_flags & SHF_ALLOC)) continue; - if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) + if (shdr->sh_entsize & INIT_OFFSET_MASK) dest = mod->module_init - + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); + + (shdr->sh_entsize & ~INIT_OFFSET_MASK); else - dest = mod->module_core + sechdrs[i].sh_entsize; + dest = mod->module_core + shdr->sh_entsize; - if (sechdrs[i].sh_type != SHT_NOBITS) - memcpy(dest, (void *)sechdrs[i].sh_addr, - sechdrs[i].sh_size); + if (shdr->sh_type != SHT_NOBITS) + memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); /* Update sh_addr to point to copy in image. */ - sechdrs[i].sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); + shdr->sh_addr = (unsigned long)dest; + DEBUGP("\t0x%lx %s\n", + shdr->sh_addr, info->secstrings + shdr->sh_name); } - /* Module has been moved. */ - mod = (void *)sechdrs[modindex].sh_addr; - kmemleak_load_module(mod, hdr, sechdrs, secstrings); -#if defined(CONFIG_MODULE_UNLOAD) - mod->refptr = alloc_percpu(struct module_ref); - if (!mod->refptr) { - err = -ENOMEM; - goto free_init; - } -#endif - /* Now we've moved module, initialize linked lists, etc. */ - module_unload_init(mod); - - /* Set up license info based on the info section */ - set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + return 0; +} +static int check_module_license_and_versions(struct module *mod) +{ /* * ndiswrapper is under GPL by itself, but loads proprietary modules. * Don't use add_taint_module(), as it would prevent ndiswrapper from @@ -2361,77 +2411,6 @@ static noinline struct module *load_module(void __user *umod, if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); - /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, sechdrs, infoindex); - - /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, - mod); - if (err < 0) - goto cleanup; - - /* Now we've got everything in the final locations, we can - * find optional sections. */ - mod->kp = section_objs(hdr, sechdrs, secstrings, "__param", - sizeof(*mod->kp), &mod->num_kp); - mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", - sizeof(*mod->syms), &mod->num_syms); - mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); - mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl", - sizeof(*mod->gpl_syms), - &mod->num_gpl_syms); - mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl"); - mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_gpl_future", - sizeof(*mod->gpl_future_syms), - &mod->num_gpl_future_syms); - mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_gpl_future"); - -#ifdef CONFIG_UNUSED_SYMBOLS - mod->unused_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_unused", - sizeof(*mod->unused_syms), - &mod->num_unused_syms); - mod->unused_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_unused"); - mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_unused_gpl", - sizeof(*mod->unused_gpl_syms), - &mod->num_unused_gpl_syms); - mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_unused_gpl"); -#endif -#ifdef CONFIG_CONSTRUCTORS - mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors", - sizeof(*mod->ctors), &mod->num_ctors); -#endif - -#ifdef CONFIG_TRACEPOINTS - mod->tracepoints = section_objs(hdr, sechdrs, secstrings, - "__tracepoints", - sizeof(*mod->tracepoints), - &mod->num_tracepoints); -#endif -#ifdef CONFIG_EVENT_TRACING - mod->trace_events = section_objs(hdr, sechdrs, secstrings, - "_ftrace_events", - sizeof(*mod->trace_events), - &mod->num_trace_events); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * - mod->num_trace_events, GFP_KERNEL); -#endif -#ifdef CONFIG_FTRACE_MCOUNT_RECORD - /* sechdrs[0].sh_size is always zero */ - mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, - "__mcount_loc", - sizeof(*mod->ftrace_callsites), - &mod->num_ftrace_callsites); -#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2441,56 +2420,16 @@ static noinline struct module *load_module(void __user *umod, || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) #endif ) { - err = try_to_force_load(mod, - "no versions for exported symbols"); - if (err) - goto cleanup; + return try_to_force_load(mod, + "no versions for exported symbols"); } #endif + return 0; +} - /* Now do relocations. */ - for (i = 1; i < hdr->e_shnum; i++) { - const char *strtab = (char *)sechdrs[strindex].sh_addr; - unsigned int info = sechdrs[i].sh_info; - - /* Not a valid relocation section? */ - if (info >= hdr->e_shnum) - continue; - - /* Don't bother with non-allocated sections */ - if (!(sechdrs[info].sh_flags & SHF_ALLOC)) - continue; - - if (sechdrs[i].sh_type == SHT_REL) - err = apply_relocate(sechdrs, strtab, symindex, i,mod); - else if (sechdrs[i].sh_type == SHT_RELA) - err = apply_relocate_add(sechdrs, strtab, symindex, i, - mod); - if (err < 0) - goto cleanup; - } - - /* Set up and sort exception table */ - mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", - sizeof(*mod->extable), &mod->num_exentries); - sort_extable(mod->extable, mod->extable + mod->num_exentries); - - /* Finally, copy percpu area over. */ - percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr, - sechdrs[pcpuindex].sh_size); - - add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, - symoffs, stroffs, secstrings, strmap); - kfree(strmap); - strmap = NULL; - - if (!mod->taints) - debug = section_objs(hdr, sechdrs, secstrings, "__verbose", - sizeof(*debug), &num_debug); - - err = module_finalize(hdr, sechdrs, mod); - if (err < 0) - goto cleanup; +static void flush_module_icache(const struct module *mod) +{ + mm_segment_t old_fs; /* flush the icache in correct context */ old_fs = get_fs(); @@ -2509,11 +2448,160 @@ static noinline struct module *load_module(void __user *umod, (unsigned long)mod->module_core + mod->core_size); set_fs(old_fs); +} - mod->args = args; - if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) - printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", - mod->name); +static struct module *layout_and_allocate(struct load_info *info) +{ + /* Module within temporary copy. */ + struct module *mod; + Elf_Shdr *pcpusec; + int err; + + mod = setup_load_info(info); + if (IS_ERR(mod)) + return mod; + + err = check_modinfo(mod, info); + if (err) + return ERR_PTR(err); + + /* Allow arches to frob section contents and sizes. */ + err = module_frob_arch_sections(info->hdr, info->sechdrs, + info->secstrings, mod); + if (err < 0) + goto out; + + pcpusec = &info->sechdrs[info->index.pcpu]; + if (pcpusec->sh_size) { + /* We have a special allocation for this section. */ + err = percpu_modalloc(mod, + pcpusec->sh_size, pcpusec->sh_addralign); + if (err) + goto out; + pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; + } + + /* Determine total sizes, and put offsets in sh_entsize. For now + this is done generically; there doesn't appear to be any + special cases for the architectures. */ + layout_sections(mod, info); + + info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) + * sizeof(long), GFP_KERNEL); + if (!info->strmap) { + err = -ENOMEM; + goto free_percpu; + } + layout_symtab(mod, info); + + /* Allocate and move to the final place */ + err = move_module(mod, info); + if (err) + goto free_strmap; + + /* Module has been copied to its final place now: return it. */ + mod = (void *)info->sechdrs[info->index.mod].sh_addr; + kmemleak_load_module(mod, info); + return mod; + +free_strmap: + kfree(info->strmap); +free_percpu: + percpu_modfree(mod); +out: + return ERR_PTR(err); +} + +/* mod is no longer valid after this! */ +static void module_deallocate(struct module *mod, struct load_info *info) +{ + kfree(info->strmap); + percpu_modfree(mod); + module_free(mod, mod->module_init); + module_free(mod, mod->module_core); +} + +static int post_relocation(struct module *mod, const struct load_info *info) +{ + /* Sort exception table now relocations are done. */ + sort_extable(mod->extable, mod->extable + mod->num_exentries); + + /* Copy relocated percpu area over. */ + percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr, + info->sechdrs[info->index.pcpu].sh_size); + + /* Setup kallsyms-specific fields. */ + add_kallsyms(mod, info); + + /* Arch-specific module finalizing. */ + return module_finalize(info->hdr, info->sechdrs, mod); +} + +/* Allocate and load the module: note that size of section 0 is always + zero, and we rely on this for optional sections. */ +static struct module *load_module(void __user *umod, + unsigned long len, + const char __user *uargs) +{ + struct load_info info = { NULL, }; + struct module *mod; + long err; + + DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); + + /* Copy in the blobs from userspace, check they are vaguely sane. */ + err = copy_and_check(&info, umod, len, uargs); + if (err) + return ERR_PTR(err); + + /* Figure out module layout, and allocate all the memory. */ + mod = layout_and_allocate(&info); + if (IS_ERR(mod)) { + err = PTR_ERR(mod); + goto free_copy; + } + + /* Now module is in final location, initialize linked lists, etc. */ + err = module_unload_init(mod); + if (err) + goto free_module; + + /* Now we've got everything in the final locations, we can + * find optional sections. */ + find_module_sections(mod, &info); + + err = check_module_license_and_versions(mod); + if (err) + goto free_unload; + + /* Set up MODINFO_ATTR fields */ + setup_modinfo(mod, &info); + + /* Fix up syms, so that st_value is a pointer to location. */ + err = simplify_symbols(mod, &info); + if (err < 0) + goto free_modinfo; + + err = apply_relocations(mod, &info); + if (err < 0) + goto free_modinfo; + + err = post_relocation(mod, &info); + if (err < 0) + goto free_modinfo; + + flush_module_icache(mod); + + /* Now copy in args */ + mod->args = strndup_user(uargs, ~0UL >> 1); + if (IS_ERR(mod->args)) { + err = PTR_ERR(mod->args); + goto free_arch_cleanup; + } + + /* Mark state as coming so strong_try_module_get() ignores us. */ + mod->state = MODULE_STATE_COMING; /* Now sew it into the lists so we can get lockdep and oops * info during argument parsing. Noone should access us, since @@ -2528,8 +2616,9 @@ static noinline struct module *load_module(void __user *umod, goto unlock; } - if (debug) - dynamic_debug_setup(debug, num_debug); + /* This has to be done once we're sure module name is unique. */ + if (!mod->taints) + dynamic_debug_setup(info.debug, info.num_debug); /* Find duplicate symbols */ err = verify_export_symbols(mod); @@ -2539,23 +2628,22 @@ static noinline struct module *load_module(void __user *umod, list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); + /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) goto unlink; - err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); + /* Link in to syfs. */ + err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); if (err < 0) goto unlink; - add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - - /* Get rid of temporary copy */ - vfree(hdr); - - trace_module_load(mod); + /* Get rid of temporary copy and strmap. */ + kfree(info.strmap); + free_copy(&info); /* Done! */ + trace_module_load(mod); return mod; unlink: @@ -2563,35 +2651,23 @@ static noinline struct module *load_module(void __user *umod, /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); ddebug: - dynamic_debug_remove(debug); + if (!mod->taints) + dynamic_debug_remove(info.debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); + kfree(mod->args); + free_arch_cleanup: module_arch_cleanup(mod); - cleanup: + free_modinfo: free_modinfo(mod); + free_unload: module_unload_free(mod); -#if defined(CONFIG_MODULE_UNLOAD) - free_percpu(mod->refptr); - free_init: -#endif - module_free(mod, mod->module_init); - free_core: - module_free(mod, mod->module_core); - /* mod will be freed with core. Don't access it beyond this line! */ - free_percpu: - free_percpu(percpu); - free_mod: - kfree(args); - kfree(strmap); - free_hdr: - vfree(hdr); + free_module: + module_deallocate(mod, &info); + free_copy: + free_copy(&info); return ERR_PTR(err); - - truncated: - printk(KERN_ERR "Module len %lu truncated\n", len); - err = -ENOEXEC; - goto free_hdr; } /* Call module constructors. */ diff --git a/kernel/padata.c b/kernel/padata.c index fdd8ae609ce3..751019415d23 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -26,18 +26,19 @@ #include <linux/mutex.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/sysfs.h> #include <linux/rcupdate.h> -#define MAX_SEQ_NR INT_MAX - NR_CPUS +#define MAX_SEQ_NR (INT_MAX - NR_CPUS) #define MAX_OBJ_NUM 1000 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { int cpu, target_cpu; - target_cpu = cpumask_first(pd->cpumask); + target_cpu = cpumask_first(pd->cpumask.pcpu); for (cpu = 0; cpu < cpu_index; cpu++) - target_cpu = cpumask_next(target_cpu, pd->cpumask); + target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); return target_cpu; } @@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata) * Hash the sequence numbers to the cpus by taking * seq_nr mod. number of cpus in use. */ - cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); return padata_index_to_cpu(pd, cpu_index); } -static void padata_parallel_worker(struct work_struct *work) +static void padata_parallel_worker(struct work_struct *parallel_work) { - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; struct parallel_data *pd; struct padata_instance *pinst; LIST_HEAD(local_list); local_bh_disable(); - queue = container_of(work, struct padata_queue, pwork); - pd = queue->pd; + pqueue = container_of(parallel_work, + struct padata_parallel_queue, work); + pd = pqueue->pd; pinst = pd->pinst; - spin_lock(&queue->parallel.lock); - list_replace_init(&queue->parallel.list, &local_list); - spin_unlock(&queue->parallel.lock); + spin_lock(&pqueue->parallel.lock); + list_replace_init(&pqueue->parallel.list, &local_list); + spin_unlock(&pqueue->parallel.lock); while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -94,7 +96,7 @@ static void padata_parallel_worker(struct work_struct *work) * @pinst: padata instance * @padata: object to be parallelized * @cb_cpu: cpu the serialization callback function will run on, - * must be in the cpumask of padata. + * must be in the serial cpumask of padata(i.e. cpumask.cbcpu). * * The parallelization callback function will run with BHs off. * Note: Every object which is parallelized by padata_do_parallel @@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst, struct padata_priv *padata, int cb_cpu) { int target_cpu, err; - struct padata_queue *queue; + struct padata_parallel_queue *queue; struct parallel_data *pd; rcu_read_lock_bh(); pd = rcu_dereference(pinst->pd); - err = 0; - if (!(pinst->flags & PADATA_INIT)) + err = -EINVAL; + if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) + goto out; + + if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu)) goto out; err = -EBUSY; @@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst, if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) goto out; - err = -EINVAL; - if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) - goto out; - - err = -EINPROGRESS; + err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = cb_cpu; @@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst, padata->seq_nr = atomic_inc_return(&pd->seq_nr); target_cpu = padata_cpu_hash(padata); - queue = per_cpu_ptr(pd->queue, target_cpu); + queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); list_add_tail(&padata->list, &queue->parallel.list); spin_unlock(&queue->parallel.lock); - queue_work_on(target_cpu, pinst->wq, &queue->pwork); + queue_work_on(target_cpu, pinst->wq, &queue->work); out: rcu_read_unlock_bh(); @@ -171,84 +172,52 @@ EXPORT_SYMBOL(padata_do_parallel); */ static struct padata_priv *padata_get_next(struct parallel_data *pd) { - int cpu, num_cpus, empty, calc_seq_nr; - int seq_nr, next_nr, overrun, next_overrun; - struct padata_queue *queue, *next_queue; + int cpu, num_cpus; + int next_nr, next_index; + struct padata_parallel_queue *queue, *next_queue; struct padata_priv *padata; struct padata_list *reorder; - empty = 0; - next_nr = -1; - next_overrun = 0; - next_queue = NULL; - - num_cpus = cpumask_weight(pd->cpumask); - - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - reorder = &queue->reorder; - - /* - * Calculate the seq_nr of the object that should be - * next in this reorder queue. - */ - overrun = 0; - calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) - + queue->cpu_index; + num_cpus = cpumask_weight(pd->cpumask.pcpu); - if (unlikely(calc_seq_nr > pd->max_seq_nr)) { - calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; - overrun = 1; - } - - if (!list_empty(&reorder->list)) { - padata = list_entry(reorder->list.next, - struct padata_priv, list); - - seq_nr = padata->seq_nr; - BUG_ON(calc_seq_nr != seq_nr); - } else { - seq_nr = calc_seq_nr; - empty++; - } - - if (next_nr < 0 || seq_nr < next_nr - || (next_overrun && !overrun)) { - next_nr = seq_nr; - next_overrun = overrun; - next_queue = queue; - } + /* + * Calculate the percpu reorder queue and the sequence + * number of the next object. + */ + next_nr = pd->processed; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + + if (unlikely(next_nr > pd->max_seq_nr)) { + next_nr = next_nr - pd->max_seq_nr - 1; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + pd->processed = 0; } padata = NULL; - if (empty == num_cpus) - goto out; - reorder = &next_queue->reorder; if (!list_empty(&reorder->list)) { padata = list_entry(reorder->list.next, struct padata_priv, list); - if (unlikely(next_overrun)) { - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - atomic_set(&queue->num_obj, 0); - } - } + BUG_ON(next_nr != padata->seq_nr); spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); spin_unlock(&reorder->lock); - atomic_inc(&next_queue->num_obj); + pd->processed++; goto out; } - queue = per_cpu_ptr(pd->queue, smp_processor_id()); + queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); if (queue->cpu_index == next_queue->cpu_index) { padata = ERR_PTR(-ENODATA); goto out; @@ -262,7 +231,7 @@ out: static void padata_reorder(struct parallel_data *pd) { struct padata_priv *padata; - struct padata_queue *queue; + struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; /* @@ -301,13 +270,13 @@ static void padata_reorder(struct parallel_data *pd) return; } - queue = per_cpu_ptr(pd->queue, padata->cb_cpu); + squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); - spin_lock(&queue->serial.lock); - list_add_tail(&padata->list, &queue->serial.list); - spin_unlock(&queue->serial.lock); + spin_lock(&squeue->serial.lock); + list_add_tail(&padata->list, &squeue->serial.list); + spin_unlock(&squeue->serial.lock); - queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); + queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); } spin_unlock_bh(&pd->lock); @@ -333,19 +302,19 @@ static void padata_reorder_timer(unsigned long arg) padata_reorder(pd); } -static void padata_serial_worker(struct work_struct *work) +static void padata_serial_worker(struct work_struct *serial_work) { - struct padata_queue *queue; + struct padata_serial_queue *squeue; struct parallel_data *pd; LIST_HEAD(local_list); local_bh_disable(); - queue = container_of(work, struct padata_queue, swork); - pd = queue->pd; + squeue = container_of(serial_work, struct padata_serial_queue, work); + pd = squeue->pd; - spin_lock(&queue->serial.lock); - list_replace_init(&queue->serial.list, &local_list); - spin_unlock(&queue->serial.lock); + spin_lock(&squeue->serial.lock); + list_replace_init(&squeue->serial.list, &local_list); + spin_unlock(&squeue->serial.lock); while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -372,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work) void padata_do_serial(struct padata_priv *padata) { int cpu; - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; struct parallel_data *pd; pd = padata->pd; cpu = get_cpu(); - queue = per_cpu_ptr(pd->queue, cpu); + pqueue = per_cpu_ptr(pd->pqueue, cpu); - spin_lock(&queue->reorder.lock); + spin_lock(&pqueue->reorder.lock); atomic_inc(&pd->reorder_objects); - list_add_tail(&padata->list, &queue->reorder.list); - spin_unlock(&queue->reorder.lock); + list_add_tail(&padata->list, &pqueue->reorder.list); + spin_unlock(&pqueue->reorder.lock); put_cpu(); @@ -391,52 +360,89 @@ void padata_do_serial(struct padata_priv *padata) } EXPORT_SYMBOL(padata_do_serial); -/* Allocate and initialize the internal cpumask dependend resources. */ -static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, - const struct cpumask *cpumask) +static int padata_setup_cpumasks(struct parallel_data *pd, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { - int cpu, cpu_index, num_cpus; - struct padata_queue *queue; - struct parallel_data *pd; - - cpu_index = 0; + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) + return -ENOMEM; - pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); - if (!pd) - goto err; + cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pd->cpumask.cbcpu); + return -ENOMEM; + } - pd->queue = alloc_percpu(struct padata_queue); - if (!pd->queue) - goto err_free_pd; + cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); + return 0; +} - if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) - goto err_free_queue; +static void __padata_list_init(struct padata_list *pd_list) +{ + INIT_LIST_HEAD(&pd_list->list); + spin_lock_init(&pd_list->lock); +} - cpumask_and(pd->cpumask, cpumask, cpu_active_mask); +/* Initialize all percpu queues used by serial workers */ +static void padata_init_squeues(struct parallel_data *pd) +{ + int cpu; + struct padata_serial_queue *squeue; - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + squeue->pd = pd; + __padata_list_init(&squeue->serial); + INIT_WORK(&squeue->work, padata_serial_worker); + } +} - queue->pd = pd; +/* Initialize all percpu queues used by parallel workers */ +static void padata_init_pqueues(struct parallel_data *pd) +{ + int cpu_index, num_cpus, cpu; + struct padata_parallel_queue *pqueue; - queue->cpu_index = cpu_index; + cpu_index = 0; + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + pqueue->pd = pd; + pqueue->cpu_index = cpu_index; cpu_index++; - INIT_LIST_HEAD(&queue->reorder.list); - INIT_LIST_HEAD(&queue->parallel.list); - INIT_LIST_HEAD(&queue->serial.list); - spin_lock_init(&queue->reorder.lock); - spin_lock_init(&queue->parallel.lock); - spin_lock_init(&queue->serial.lock); - - INIT_WORK(&queue->pwork, padata_parallel_worker); - INIT_WORK(&queue->swork, padata_serial_worker); - atomic_set(&queue->num_obj, 0); + __padata_list_init(&pqueue->reorder); + __padata_list_init(&pqueue->parallel); + INIT_WORK(&pqueue->work, padata_parallel_worker); + atomic_set(&pqueue->num_obj, 0); } - num_cpus = cpumask_weight(pd->cpumask); - pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; + num_cpus = cpumask_weight(pd->cpumask.pcpu); + pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; +} + +/* Allocate and initialize the internal cpumask dependend resources. */ +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ + struct parallel_data *pd; + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; + + pd->pqueue = alloc_percpu(struct padata_parallel_queue); + if (!pd->pqueue) + goto err_free_pd; + + pd->squeue = alloc_percpu(struct padata_serial_queue); + if (!pd->squeue) + goto err_free_pqueue; + if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) + goto err_free_squeue; + + padata_init_pqueues(pd); + padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); @@ -446,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, return pd; -err_free_queue: - free_percpu(pd->queue); +err_free_squeue: + free_percpu(pd->squeue); +err_free_pqueue: + free_percpu(pd->pqueue); err_free_pd: kfree(pd); err: @@ -456,8 +464,10 @@ err: static void padata_free_pd(struct parallel_data *pd) { - free_cpumask_var(pd->cpumask); - free_percpu(pd->queue); + free_cpumask_var(pd->cpumask.pcpu); + free_cpumask_var(pd->cpumask.cbcpu); + free_percpu(pd->pqueue); + free_percpu(pd->squeue); kfree(pd); } @@ -465,11 +475,12 @@ static void padata_free_pd(struct parallel_data *pd) static void padata_flush_queues(struct parallel_data *pd) { int cpu; - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; + struct padata_serial_queue *squeue; - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - flush_work(&queue->pwork); + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + flush_work(&pqueue->work); } del_timer_sync(&pd->timer); @@ -477,19 +488,39 @@ static void padata_flush_queues(struct parallel_data *pd) if (atomic_read(&pd->reorder_objects)) padata_reorder(pd); - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - flush_work(&queue->swork); + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + flush_work(&squeue->work); } BUG_ON(atomic_read(&pd->refcnt) != 0); } +static void __padata_start(struct padata_instance *pinst) +{ + pinst->flags |= PADATA_INIT; +} + +static void __padata_stop(struct padata_instance *pinst) +{ + if (!(pinst->flags & PADATA_INIT)) + return; + + pinst->flags &= ~PADATA_INIT; + + synchronize_rcu(); + + get_online_cpus(); + padata_flush_queues(pinst->pd); + put_online_cpus(); +} + /* Replace the internal control stucture with a new one. */ static void padata_replace(struct padata_instance *pinst, struct parallel_data *pd_new) { struct parallel_data *pd_old = pinst->pd; + int notification_mask = 0; pinst->flags |= PADATA_RESET; @@ -497,41 +528,162 @@ static void padata_replace(struct padata_instance *pinst, synchronize_rcu(); + if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu)) + notification_mask |= PADATA_CPU_PARALLEL; + if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) + notification_mask |= PADATA_CPU_SERIAL; + padata_flush_queues(pd_old); padata_free_pd(pd_old); + if (notification_mask) + blocking_notifier_call_chain(&pinst->cpumask_change_notifier, + notification_mask, + &pd_new->cpumask); + pinst->flags &= ~PADATA_RESET; } /** - * padata_set_cpumask - set the cpumask that padata should use + * padata_register_cpumask_notifier - Registers a notifier that will be called + * if either pcpu or cbcpu or both cpumasks change. * - * @pinst: padata instance - * @cpumask: the cpumask to use + * @pinst: A poineter to padata instance + * @nblock: A pointer to notifier block. */ -int padata_set_cpumask(struct padata_instance *pinst, - cpumask_var_t cpumask) +int padata_register_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) { + return blocking_notifier_chain_register(&pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_register_cpumask_notifier); + +/** + * padata_unregister_cpumask_notifier - Unregisters cpumask notifier + * registered earlier using padata_register_cpumask_notifier + * + * @pinst: A pointer to data instance. + * @nlock: A pointer to notifier block. + */ +int padata_unregister_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) +{ + return blocking_notifier_chain_unregister( + &pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_unregister_cpumask_notifier); + + +/* If cpumask contains no active cpu, we mark the instance as invalid. */ +static bool padata_validate_cpumask(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + if (!cpumask_intersects(cpumask, cpu_active_mask)) { + pinst->flags |= PADATA_INVALID; + return false; + } + + pinst->flags &= ~PADATA_INVALID; + return true; +} + +static int __padata_set_cpumasks(struct padata_instance *pinst, + cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int valid; struct parallel_data *pd; - int err = 0; + + valid = padata_validate_cpumask(pinst, pcpumask); + if (!valid) { + __padata_stop(pinst); + goto out_replace; + } + + valid = padata_validate_cpumask(pinst, cbcpumask); + if (!valid) + __padata_stop(pinst); + +out_replace: + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + return -ENOMEM; + + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + + padata_replace(pinst, pd); + + if (valid) + __padata_start(pinst); + + return 0; +} + +/** + * padata_set_cpumasks - Set both parallel and serial cpumasks. The first + * one is used by parallel workers and the second one + * by the wokers doing serialization. + * + * @pinst: padata instance + * @pcpumask: the cpumask to use for parallel workers + * @cbcpumask: the cpumsak to use for serial workers + */ +int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int err; mutex_lock(&pinst->lock); + get_online_cpus(); + err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); + + put_online_cpus(); + mutex_unlock(&pinst->lock); + + return err; + +} +EXPORT_SYMBOL(padata_set_cpumasks); + +/** + * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value + * equivalent to @cpumask. + * + * @pinst: padata instance + * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding + * to parallel and serial cpumasks respectively. + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, + cpumask_var_t cpumask) +{ + struct cpumask *serial_mask, *parallel_mask; + int err = -EINVAL; + + mutex_lock(&pinst->lock); get_online_cpus(); - pd = padata_alloc_pd(pinst, cpumask); - if (!pd) { - err = -ENOMEM; - goto out; + switch (cpumask_type) { + case PADATA_CPU_PARALLEL: + serial_mask = pinst->cpumask.cbcpu; + parallel_mask = cpumask; + break; + case PADATA_CPU_SERIAL: + parallel_mask = pinst->cpumask.pcpu; + serial_mask = cpumask; + break; + default: + goto out; } - cpumask_copy(pinst->cpumask, cpumask); - - padata_replace(pinst, pd); + err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); out: put_online_cpus(); - mutex_unlock(&pinst->lock); return err; @@ -543,30 +695,48 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) struct parallel_data *pd; if (cpumask_test_cpu(cpu, cpu_active_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask); + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); if (!pd) return -ENOMEM; padata_replace(pinst, pd); + + if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && + padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_start(pinst); } return 0; } -/** - * padata_add_cpu - add a cpu to the padata cpumask + /** + * padata_add_cpu - add a cpu to one or both(parallel and serial) + * padata cpumasks. * * @pinst: padata instance * @cpu: cpu to add + * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask */ -int padata_add_cpu(struct padata_instance *pinst, int cpu) + +int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) { int err; + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + mutex_lock(&pinst->lock); get_online_cpus(); - cpumask_set_cpu(cpu, pinst->cpumask); + if (mask & PADATA_CPU_SERIAL) + cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_set_cpu(cpu, pinst->cpumask.pcpu); + err = __padata_add_cpu(pinst, cpu); put_online_cpus(); @@ -578,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu); static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { - struct parallel_data *pd; + struct parallel_data *pd = NULL; if (cpumask_test_cpu(cpu, cpu_online_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask); + + if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || + !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_stop(pinst); + + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); if (!pd) return -ENOMEM; @@ -591,20 +767,32 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) return 0; } -/** - * padata_remove_cpu - remove a cpu from the padata cpumask + /** + * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) + * padata cpumasks. * * @pinst: padata instance * @cpu: cpu to remove + * @mask: bitmask specifying from which cpumask @cpu should be removed + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask */ -int padata_remove_cpu(struct padata_instance *pinst, int cpu) +int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) { int err; + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + mutex_lock(&pinst->lock); get_online_cpus(); - cpumask_clear_cpu(cpu, pinst->cpumask); + if (mask & PADATA_CPU_SERIAL) + cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_clear_cpu(cpu, pinst->cpumask.pcpu); + err = __padata_remove_cpu(pinst, cpu); put_online_cpus(); @@ -619,11 +807,20 @@ EXPORT_SYMBOL(padata_remove_cpu); * * @pinst: padata instance to start */ -void padata_start(struct padata_instance *pinst) +int padata_start(struct padata_instance *pinst) { + int err = 0; + mutex_lock(&pinst->lock); - pinst->flags |= PADATA_INIT; + + if (pinst->flags & PADATA_INVALID) + err =-EINVAL; + + __padata_start(pinst); + mutex_unlock(&pinst->lock); + + return err; } EXPORT_SYMBOL(padata_start); @@ -635,12 +832,20 @@ EXPORT_SYMBOL(padata_start); void padata_stop(struct padata_instance *pinst) { mutex_lock(&pinst->lock); - pinst->flags &= ~PADATA_INIT; + __padata_stop(pinst); mutex_unlock(&pinst->lock); } EXPORT_SYMBOL(padata_stop); #ifdef CONFIG_HOTPLUG_CPU + +static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) +{ + return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || + cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); +} + + static int padata_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -653,7 +858,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); err = __padata_add_cpu(pinst, cpu); @@ -664,7 +869,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); err = __padata_remove_cpu(pinst, cpu); @@ -675,7 +880,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); __padata_remove_cpu(pinst, cpu); @@ -683,7 +888,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); __padata_add_cpu(pinst, cpu); @@ -694,36 +899,202 @@ static int padata_cpu_callback(struct notifier_block *nfb, } #endif +static void __padata_free(struct padata_instance *pinst) +{ +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&pinst->cpu_notifier); +#endif + + padata_stop(pinst); + padata_free_pd(pinst->pd); + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); + kfree(pinst); +} + +#define kobj2pinst(_kobj) \ + container_of(_kobj, struct padata_instance, kobj) +#define attr2pentry(_attr) \ + container_of(_attr, struct padata_sysfs_entry, attr) + +static void padata_sysfs_release(struct kobject *kobj) +{ + struct padata_instance *pinst = kobj2pinst(kobj); + __padata_free(pinst); +} + +struct padata_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct padata_instance *, struct attribute *, char *); + ssize_t (*store)(struct padata_instance *, struct attribute *, + const char *, size_t); +}; + +static ssize_t show_cpumask(struct padata_instance *pinst, + struct attribute *attr, char *buf) +{ + struct cpumask *cpumask; + ssize_t len; + + mutex_lock(&pinst->lock); + if (!strcmp(attr->name, "serial_cpumask")) + cpumask = pinst->cpumask.cbcpu; + else + cpumask = pinst->cpumask.pcpu; + + len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), + nr_cpu_ids); + if (PAGE_SIZE - len < 2) + len = -EINVAL; + else + len += sprintf(buf + len, "\n"); + + mutex_unlock(&pinst->lock); + return len; +} + +static ssize_t store_cpumask(struct padata_instance *pinst, + struct attribute *attr, + const char *buf, size_t count) +{ + cpumask_var_t new_cpumask; + ssize_t ret; + int mask_type; + + if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL)) + return -ENOMEM; + + ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask), + nr_cpumask_bits); + if (ret < 0) + goto out; + + mask_type = !strcmp(attr->name, "serial_cpumask") ? + PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL; + ret = padata_set_cpumask(pinst, mask_type, new_cpumask); + if (!ret) + ret = count; + +out: + free_cpumask_var(new_cpumask); + return ret; +} + +#define PADATA_ATTR_RW(_name, _show_name, _store_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0644, _show_name, _store_name) +#define PADATA_ATTR_RO(_name, _show_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0400, _show_name, NULL) + +PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask); +PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask); + +/* + * Padata sysfs provides the following objects: + * serial_cpumask [RW] - cpumask for serial workers + * parallel_cpumask [RW] - cpumask for parallel workers + */ +static struct attribute *padata_default_attrs[] = { + &serial_cpumask_attr.attr, + ¶llel_cpumask_attr.attr, + NULL, +}; + +static ssize_t padata_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->show(pinst, attr, buf); + + return ret; +} + +static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->store(pinst, attr, buf, count); + + return ret; +} + +static const struct sysfs_ops padata_sysfs_ops = { + .show = padata_sysfs_show, + .store = padata_sysfs_store, +}; + +static struct kobj_type padata_attr_type = { + .sysfs_ops = &padata_sysfs_ops, + .default_attrs = padata_default_attrs, + .release = padata_sysfs_release, +}; + /** - * padata_alloc - allocate and initialize a padata instance + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. * - * @cpumask: cpumask that padata uses for parallelization * @wq: workqueue to use for the allocated padata instance */ -struct padata_instance *padata_alloc(const struct cpumask *cpumask, - struct workqueue_struct *wq) +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + +/** + * padata_alloc - allocate and initialize a padata instance and specify + * cpumasks for serial and parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + * @pcpumask: cpumask that will be used for padata parallelization + * @cbcpumask: cpumask that will be used for padata serialization + */ +struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { struct padata_instance *pinst; - struct parallel_data *pd; + struct parallel_data *pd = NULL; pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); if (!pinst) goto err; get_online_cpus(); - - pd = padata_alloc_pd(pinst, cpumask); - if (!pd) + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) goto err_free_inst; + if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pinst->cpumask.pcpu); + goto err_free_inst; + } + if (!padata_validate_cpumask(pinst, pcpumask) || + !padata_validate_cpumask(pinst, cbcpumask)) + goto err_free_masks; - if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) - goto err_free_pd; + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + goto err_free_masks; rcu_assign_pointer(pinst->pd, pd); pinst->wq = wq; - cpumask_copy(pinst->cpumask, cpumask); + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); pinst->flags = 0; @@ -735,12 +1106,15 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, put_online_cpus(); + BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); + kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); return pinst; -err_free_pd: - padata_free_pd(pd); +err_free_masks: + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); err_free_inst: kfree(pinst); put_online_cpus(); @@ -756,19 +1130,6 @@ EXPORT_SYMBOL(padata_alloc); */ void padata_free(struct padata_instance *pinst) { - padata_stop(pinst); - - synchronize_rcu(); - -#ifdef CONFIG_HOTPLUG_CPU - unregister_hotcpu_notifier(&pinst->cpu_notifier); -#endif - get_online_cpus(); - padata_flush_queues(pinst->pd); - put_online_cpus(); - - padata_free_pd(pinst->pd); - free_cpumask_var(pinst->cpumask); - kfree(pinst); + kobject_put(&pinst->kobj); } EXPORT_SYMBOL(padata_free); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ff86c558af4c..403d1804b198 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } /* @@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event, struct perf_event *event, *partial_group = NULL; const struct pmu *pmu = group_event->pmu; bool txn = false; - int ret; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; @@ -703,14 +702,8 @@ group_sched_in(struct perf_event *group_event, } } - if (!txn) - return 0; - - ret = pmu->commit_txn(pmu); - if (!ret) { - pmu->cancel_txn(pmu); + if (!txn || !pmu->commit_txn(pmu)) return 0; - } group_error: /* @@ -1155,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event, * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ - value = atomic64_read(&next_event->count); - value = atomic64_xchg(&event->count, value); - atomic64_set(&next_event->count, value); + value = local64_read(&next_event->count); + value = local64_xchg(&event->count, value); + local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); @@ -1547,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; - if (atomic64_read(&hwc->period_left) > 8*sample_period) { + if (local64_read(&hwc->period_left) > 8*sample_period) { perf_disable(); perf_event_stop(event); - atomic64_set(&hwc->period_left, 0); + local64_set(&hwc->period_left, 0); perf_event_start(event); perf_enable(); } @@ -1591,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) perf_disable(); event->pmu->read(event); - now = atomic64_read(&event->count); + now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; @@ -1743,6 +1736,11 @@ static void __perf_event_read(void *info) event->pmu->read(event); } +static inline u64 perf_event_count(struct perf_event *event) +{ + return local64_read(&event->count) + atomic64_read(&event->child_count); +} + static u64 perf_event_read(struct perf_event *event) { /* @@ -1762,7 +1760,7 @@ static u64 perf_event_read(struct perf_event *event) raw_spin_unlock_irqrestore(&ctx->lock, flags); } - return atomic64_read(&event->count); + return perf_event_count(event); } /* @@ -1883,7 +1881,7 @@ static void free_event_rcu(struct rcu_head *head) } static void perf_pending_sync(struct perf_event *event); -static void perf_mmap_data_put(struct perf_mmap_data *data); +static void perf_buffer_put(struct perf_buffer *buffer); static void free_event(struct perf_event *event) { @@ -1891,7 +1889,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { atomic_dec(&nr_events); - if (event->attr.mmap) + if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); @@ -1899,9 +1897,9 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); } - if (event->data) { - perf_mmap_data_put(event->data); - event->data = NULL; + if (event->buffer) { + perf_buffer_put(event->buffer); + event->buffer = NULL; } if (event->destroy) @@ -2126,13 +2124,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned int events = POLL_HUP; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) - events = atomic_xchg(&data->poll, 0); + buffer = rcu_dereference(event->buffer); + if (buffer) + events = atomic_xchg(&buffer->poll, 0); rcu_read_unlock(); poll_wait(file, &event->waitq, wait); @@ -2143,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); - atomic64_set(&event->count, 0); + local64_set(&event->count, 0); perf_event_update_userpage(event); } @@ -2342,14 +2340,14 @@ static int perf_event_index(struct perf_event *event) void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; - userpg = data->user_page; + userpg = buffer->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -2359,9 +2357,9 @@ void perf_event_update_userpage(struct perf_event *event) ++userpg->lock; barrier(); userpg->index = perf_event_index(event); - userpg->offset = atomic64_read(&event->count); + userpg->offset = perf_event_count(event); if (event->state == PERF_EVENT_STATE_ACTIVE) - userpg->offset -= atomic64_read(&event->hw.prev_count); + userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -2376,6 +2374,25 @@ unlock: rcu_read_unlock(); } +static unsigned long perf_data_size(struct perf_buffer *buffer); + +static void +perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) +{ + long max_size = perf_data_size(buffer); + + if (watermark) + buffer->watermark = min(max_size, watermark); + + if (!buffer->watermark) + buffer->watermark = max_size / 2; + + if (flags & PERF_BUFFER_WRITABLE) + buffer->writable = 1; + + atomic_set(&buffer->refcount, 1); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* @@ -2383,15 +2400,15 @@ unlock: */ static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > data->nr_pages) + if (pgoff > buffer->nr_pages) return NULL; if (pgoff == 0) - return virt_to_page(data->user_page); + return virt_to_page(buffer->user_page); - return virt_to_page(data->data_pages[pgoff - 1]); + return virt_to_page(buffer->data_pages[pgoff - 1]); } static void *perf_mmap_alloc_page(int cpu) @@ -2407,42 +2424,44 @@ static void *perf_mmap_alloc_page(int cpu) return page_address(page); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; int i; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - data->user_page = perf_mmap_alloc_page(event->cpu); - if (!data->user_page) + buffer->user_page = perf_mmap_alloc_page(cpu); + if (!buffer->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = perf_mmap_alloc_page(event->cpu); - if (!data->data_pages[i]) + buffer->data_pages[i] = perf_mmap_alloc_page(cpu); + if (!buffer->data_pages[i]) goto fail_data_pages; } - data->nr_pages = nr_pages; + buffer->nr_pages = nr_pages; + + perf_buffer_init(buffer, watermark, flags); - return data; + return buffer; fail_data_pages: for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); + free_page((unsigned long)buffer->data_pages[i]); - free_page((unsigned long)data->user_page); + free_page((unsigned long)buffer->user_page); fail_user_page: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2456,17 +2475,17 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { int i; - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - kfree(data); + perf_mmap_free_page((unsigned long)buffer->user_page); + for (i = 0; i < buffer->nr_pages; i++) + perf_mmap_free_page((unsigned long)buffer->data_pages[i]); + kfree(buffer); } -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { return 0; } @@ -2479,18 +2498,18 @@ static inline int page_order(struct perf_mmap_data *data) * Required for architectures that have d-cache aliasing issues. */ -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { - return data->page_order; + return buffer->page_order; } static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > (1UL << page_order(data))) + if (pgoff > (1UL << page_order(buffer))) return NULL; - return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); + return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); } static void perf_mmap_unmark_page(void *addr) @@ -2500,57 +2519,59 @@ static void perf_mmap_unmark_page(void *addr) page->mapping = NULL; } -static void perf_mmap_data_free_work(struct work_struct *work) +static void perf_buffer_free_work(struct work_struct *work) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; void *base; int i, nr; - data = container_of(work, struct perf_mmap_data, work); - nr = 1 << page_order(data); + buffer = container_of(work, struct perf_buffer, work); + nr = 1 << page_order(buffer); - base = data->user_page; + base = buffer->user_page; for (i = 0; i < nr + 1; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); - kfree(data); + kfree(buffer); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { - schedule_work(&data->work); + schedule_work(&buffer->work); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; void *all_buf; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - INIT_WORK(&data->work, perf_mmap_data_free_work); + INIT_WORK(&buffer->work, perf_buffer_free_work); all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); if (!all_buf) goto fail_all_buf; - data->user_page = all_buf; - data->data_pages[0] = all_buf + PAGE_SIZE; - data->page_order = ilog2(nr_pages); - data->nr_pages = 1; + buffer->user_page = all_buf; + buffer->data_pages[0] = all_buf + PAGE_SIZE; + buffer->page_order = ilog2(nr_pages); + buffer->nr_pages = 1; + + perf_buffer_init(buffer, watermark, flags); - return data; + return buffer; fail_all_buf: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2558,15 +2579,15 @@ fail: #endif -static unsigned long perf_data_size(struct perf_mmap_data *data) +static unsigned long perf_data_size(struct perf_buffer *buffer) { - return data->nr_pages << (PAGE_SHIFT + page_order(data)); + return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); } static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; int ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { @@ -2576,14 +2597,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; - vmf->page = perf_mmap_to_page(data, vmf->pgoff); + vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); if (!vmf->page) goto unlock; @@ -2598,52 +2619,35 @@ unlock: return ret; } -static void -perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) -{ - long max_size = perf_data_size(data); - - if (event->attr.watermark) { - data->watermark = min_t(long, max_size, - event->attr.wakeup_watermark); - } - - if (!data->watermark) - data->watermark = max_size / 2; - - atomic_set(&data->refcount, 1); - rcu_assign_pointer(event->data, data); -} - -static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +static void perf_buffer_free_rcu(struct rcu_head *rcu_head) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_data_free(data); + buffer = container_of(rcu_head, struct perf_buffer, rcu_head); + perf_buffer_free(buffer); } -static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) +static struct perf_buffer *perf_buffer_get(struct perf_event *event) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) { - if (!atomic_inc_not_zero(&data->refcount)) - data = NULL; + buffer = rcu_dereference(event->buffer); + if (buffer) { + if (!atomic_inc_not_zero(&buffer->refcount)) + buffer = NULL; } rcu_read_unlock(); - return data; + return buffer; } -static void perf_mmap_data_put(struct perf_mmap_data *data) +static void perf_buffer_put(struct perf_buffer *buffer) { - if (!atomic_dec_and_test(&data->refcount)) + if (!atomic_dec_and_test(&buffer->refcount)) return; - call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); + call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -2658,16 +2662,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->data); + unsigned long size = perf_data_size(event->buffer); struct user_struct *user = event->mmap_user; - struct perf_mmap_data *data = event->data; + struct perf_buffer *buffer = event->buffer; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->mmap_locked; - rcu_assign_pointer(event->data, NULL); + rcu_assign_pointer(event->buffer, NULL); mutex_unlock(&event->mmap_mutex); - perf_mmap_data_put(data); + perf_buffer_put(buffer); free_uid(user); } } @@ -2685,11 +2689,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; - int ret = 0; + int ret = 0, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would @@ -2706,7 +2710,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) nr_pages = (vma_size / PAGE_SIZE) - 1; /* - * If we have data pages ensure they're a power-of-two number, so we + * If we have buffer pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) @@ -2720,9 +2724,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->data) { - if (event->data->nr_pages == nr_pages) - atomic_inc(&event->data->refcount); + if (event->buffer) { + if (event->buffer->nr_pages == nr_pages) + atomic_inc(&event->buffer->refcount); else ret = -EINVAL; goto unlock; @@ -2752,17 +2756,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(event->data); + WARN_ON(event->buffer); + + if (vma->vm_flags & VM_WRITE) + flags |= PERF_BUFFER_WRITABLE; - data = perf_mmap_data_alloc(event, nr_pages); - if (!data) { + buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, + event->cpu, flags); + if (!buffer) { ret = -ENOMEM; goto unlock; } - - perf_mmap_data_init(event, data); - if (vma->vm_flags & VM_WRITE) - event->data->writable = 1; + rcu_assign_pointer(event->buffer, buffer); atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; @@ -2941,11 +2946,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } -__weak -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ -} - /* * We assume there is only KVM supporting the callbacks. @@ -2971,15 +2971,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); /* * Output */ -static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, +static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, unsigned long offset, unsigned long head) { unsigned long mask; - if (!data->writable) + if (!buffer->writable) return true; - mask = perf_data_size(data) - 1; + mask = perf_data_size(buffer) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2992,7 +2992,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->data->poll, POLL_IN); + atomic_set(&handle->buffer->poll, POLL_IN); if (handle->nmi) { handle->event->pending_wakeup = 1; @@ -3012,45 +3012,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle) */ static void perf_output_get_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; preempt_disable(); - local_inc(&data->nest); - handle->wakeup = local_read(&data->wakeup); + local_inc(&buffer->nest); + handle->wakeup = local_read(&buffer->wakeup); } static void perf_output_put_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; unsigned long head; again: - head = local_read(&data->head); + head = local_read(&buffer->head); /* * IRQ/NMI can happen here, which means we can miss a head update. */ - if (!local_dec_and_test(&data->nest)) + if (!local_dec_and_test(&buffer->nest)) goto out; /* * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the data->head read and this + * by atomic_dec_and_test() order the buffer->head read and this * write. */ - data->user_page->data_head = head; + buffer->user_page->data_head = head; /* * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read data->head. + * barrier in atomic_dec_and_test() to re-read buffer->head. */ - if (unlikely(head != local_read(&data->head))) { - local_inc(&data->nest); + if (unlikely(head != local_read(&buffer->head))) { + local_inc(&buffer->nest); goto again; } - if (handle->wakeup != local_read(&data->wakeup)) + if (handle->wakeup != local_read(&buffer->wakeup)) perf_output_wakeup(handle); out: @@ -3070,12 +3070,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, buf += size; handle->size -= size; if (!handle->size) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; handle->page++; - handle->page &= data->nr_pages - 1; - handle->addr = data->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(data); + handle->page &= buffer->nr_pages - 1; + handle->addr = buffer->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(buffer); } } while (len); } @@ -3084,7 +3084,7 @@ int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long tail, offset, head; int have_lost; struct { @@ -3100,19 +3100,19 @@ int perf_output_begin(struct perf_output_handle *handle, if (event->parent) event = event->parent; - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto out; - handle->data = data; + handle->buffer = buffer; handle->event = event; handle->nmi = nmi; handle->sample = sample; - if (!data->nr_pages) + if (!buffer->nr_pages) goto out; - have_lost = local_read(&data->lost); + have_lost = local_read(&buffer->lost); if (have_lost) size += sizeof(lost_event); @@ -3124,30 +3124,30 @@ int perf_output_begin(struct perf_output_handle *handle, * tail pointer. So that all reads will be completed before the * write is issued. */ - tail = ACCESS_ONCE(data->user_page->data_tail); + tail = ACCESS_ONCE(buffer->user_page->data_tail); smp_rmb(); - offset = head = local_read(&data->head); + offset = head = local_read(&buffer->head); head += size; - if (unlikely(!perf_output_space(data, tail, offset, head))) + if (unlikely(!perf_output_space(buffer, tail, offset, head))) goto fail; - } while (local_cmpxchg(&data->head, offset, head) != offset); + } while (local_cmpxchg(&buffer->head, offset, head) != offset); - if (head - local_read(&data->wakeup) > data->watermark) - local_add(data->watermark, &data->wakeup); + if (head - local_read(&buffer->wakeup) > buffer->watermark) + local_add(buffer->watermark, &buffer->wakeup); - handle->page = offset >> (PAGE_SHIFT + page_order(data)); - handle->page &= data->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); - handle->addr = data->data_pages[handle->page]; + handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); + handle->page &= buffer->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); + handle->addr = buffer->data_pages[handle->page]; handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(data)) - handle->size; + handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; - lost_event.lost = local_xchg(&data->lost, 0); + lost_event.lost = local_xchg(&buffer->lost, 0); perf_output_put(handle, lost_event); } @@ -3155,7 +3155,7 @@ int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - local_inc(&data->lost); + local_inc(&buffer->lost); perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -3166,15 +3166,15 @@ out: void perf_output_end(struct perf_output_handle *handle) { struct perf_event *event = handle->event; - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { - int events = local_inc_return(&data->events); + int events = local_inc_return(&buffer->events); if (events >= wakeup_events) { - local_sub(wakeup_events, &data->events); - local_inc(&data->wakeup); + local_sub(wakeup_events, &buffer->events); + local_inc(&buffer->wakeup); } } @@ -3211,7 +3211,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 values[4]; int n = 0; - values[n++] = atomic64_read(&event->count); + values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -3248,7 +3248,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (leader != event) leader->pmu->read(leader); - values[n++] = atomic64_read(&leader->count); + values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); @@ -3260,7 +3260,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (sub != event) sub->pmu->read(sub); - values[n++] = atomic64_read(&sub->count); + values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); @@ -3491,7 +3491,7 @@ perf_event_read_event(struct perf_event *event, /* * task tracking -- fork/exit * - * enabled by: attr.comm | attr.mmap | attr.task + * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task */ struct perf_task_event { @@ -3541,7 +3541,8 @@ static int perf_event_task_match(struct perf_event *event) if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.comm || event->attr.mmap || event->attr.task) + if (event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task) return 1; return 0; @@ -3766,7 +3767,8 @@ static void perf_event_mmap_output(struct perf_event *event, } static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; @@ -3774,19 +3776,21 @@ static int perf_event_mmap_match(struct perf_event *event, if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.mmap) + if ((!executable && event->attr.mmap_data) || + (executable && event->attr.mmap)) return 1; return 0; } static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event)) + if (perf_event_mmap_match(event, mmap_event, executable)) perf_event_mmap_output(event, mmap_event); } } @@ -3830,6 +3834,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) if (!vma->vm_mm) { name = strncpy(tmp, "[vdso]", sizeof(tmp)); goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_brk && + vma->vm_end >= vma->vm_mm->brk) { + name = strncpy(tmp, "[heap]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack) { + name = strncpy(tmp, "[stack]", sizeof(tmp)); + goto got_name; } name = strncpy(tmp, "//anon", sizeof(tmp)); @@ -3846,17 +3858,17 @@ got_name: rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) - perf_event_mmap_ctx(ctx, mmap_event); + perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); put_cpu_var(perf_cpu_context); rcu_read_unlock(); kfree(buf); } -void __perf_event_mmap(struct vm_area_struct *vma) +void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -4018,14 +4030,14 @@ static u64 perf_swevent_set_period(struct perf_event *event) hwc->last_period = hwc->sample_period; again: - old = val = atomic64_read(&hwc->period_left); + old = val = local64_read(&hwc->period_left); if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + if (local64_cmpxchg(&hwc->period_left, old, val) != old) goto again; return nr; @@ -4064,7 +4076,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, { struct hw_perf_event *hwc = &event->hw; - atomic64_add(nr, &event->count); + local64_add(nr, &event->count); if (!regs) return; @@ -4075,7 +4087,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, nmi, data, regs); - if (atomic64_add_negative(nr, &hwc->period_left)) + if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, nmi, data, regs); @@ -4213,14 +4225,12 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -void perf_swevent_put_recursion_context(int rctx) +void inline perf_swevent_put_recursion_context(int rctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); barrier(); cpuctx->recursion[rctx]--; } -EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); - void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4368,8 +4378,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_xchg(&event->hw.prev_count, now); - atomic64_add(now - prev, &event->count); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); } static int cpu_clock_perf_event_enable(struct perf_event *event) @@ -4377,7 +4387,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int cpu = raw_smp_processor_id(); - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + local64_set(&hwc->prev_count, cpu_clock(cpu)); perf_swevent_start_hrtimer(event); return 0; @@ -4409,9 +4419,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now) u64 prev; s64 delta; - prev = atomic64_xchg(&event->hw.prev_count, now); + prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } static int task_clock_perf_event_enable(struct perf_event *event) @@ -4421,7 +4431,7 @@ static int task_clock_perf_event_enable(struct perf_event *event) now = event->ctx->time; - atomic64_set(&hwc->prev_count, now); + local64_set(&hwc->prev_count, now); perf_swevent_start_hrtimer(event); @@ -4601,7 +4611,7 @@ static int perf_tp_event_match(struct perf_event *event, } void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head) + struct pt_regs *regs, struct hlist_head *head, int rctx) { struct perf_sample_data data; struct perf_event *event; @@ -4615,12 +4625,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_sample_data_init(&data, addr); data.raw = &raw; - rcu_read_lock(); hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_add(event, count, 1, &data, regs); } - rcu_read_unlock(); + + perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); @@ -4864,7 +4874,7 @@ perf_event_alloc(struct perf_event_attr *attr, hwc->sample_period = 1; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); /* * we currently do not support PERF_FORMAT_GROUP on inherited events @@ -4913,7 +4923,7 @@ done: if (!event->parent) { atomic_inc(&nr_events); - if (event->attr.mmap) + if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); @@ -5007,7 +5017,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_mmap_data *data = NULL, *old_data = NULL; + struct perf_buffer *buffer = NULL, *old_buffer = NULL; int ret = -EINVAL; if (!output_event) @@ -5037,19 +5047,19 @@ set: if (output_event) { /* get the buffer we want to redirect to */ - data = perf_mmap_data_get(output_event); - if (!data) + buffer = perf_buffer_get(output_event); + if (!buffer) goto unlock; } - old_data = event->data; - rcu_assign_pointer(event->data, data); + old_buffer = event->buffer; + rcu_assign_pointer(event->buffer, buffer); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_data) - perf_mmap_data_put(old_data); + if (old_buffer) + perf_buffer_put(old_buffer); out: return ret; } @@ -5298,7 +5308,7 @@ inherit_event(struct perf_event *parent_event, hwc->sample_period = sample_period; hwc->last_period = sample_period; - atomic64_set(&hwc->period_left, sample_period); + local64_set(&hwc->period_left, sample_period); } child_event->overflow_handler = parent_event->overflow_handler; @@ -5359,12 +5369,12 @@ static void sync_child_event(struct perf_event *child_event, if (child_event->attr.inherit_stat) perf_event_read_event(child_event, child); - child_val = atomic64_read(&child_event->count); + child_val = perf_event_count(child_event); /* * Add back the child's count to the parent's count: */ - atomic64_add(child_val, &parent_event->count); + atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index f42d3f737a33..996a4dec5f96 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -48,59 +48,49 @@ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct pm_qos_request_list { - struct list_head list; - union { - s32 value; - s32 usec; - s32 kbps; - }; - int pm_qos_class; +enum pm_qos_type { + PM_QOS_MAX, /* return the largest value */ + PM_QOS_MIN /* return the smallest value */ }; -static s32 max_compare(s32 v1, s32 v2); -static s32 min_compare(s32 v1, s32 v2); - struct pm_qos_object { - struct pm_qos_request_list requests; + struct plist_head requests; struct blocking_notifier_head *notifiers; struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - atomic_t target_value; - s32 (*comparitor)(s32, s32); + enum pm_qos_type type; }; +static DEFINE_SPINLOCK(pm_qos_lock); + static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN, }; static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN }; static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = ATOMIC_INIT(0), - .comparitor = max_compare + .type = PM_QOS_MAX, }; @@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = { &network_throughput_pm_qos }; -static DEFINE_SPINLOCK(pm_qos_lock); - static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos); static int pm_qos_power_open(struct inode *inode, struct file *filp); @@ -124,46 +112,55 @@ static const struct file_operations pm_qos_power_fops = { .release = pm_qos_power_release, }; -/* static helper functions */ -static s32 max_compare(s32 v1, s32 v2) +/* unlocked internal variant */ +static inline int pm_qos_get_value(struct pm_qos_object *o) { - return max(v1, v2); -} + if (plist_head_empty(&o->requests)) + return o->default_value; -static s32 min_compare(s32 v1, s32 v2) -{ - return min(v1, v2); -} + switch (o->type) { + case PM_QOS_MIN: + return plist_last(&o->requests)->prio; + case PM_QOS_MAX: + return plist_first(&o->requests)->prio; -static void update_target(int pm_qos_class) + default: + /* runtime check for not using enum */ + BUG(); + } +} + +static void update_target(struct pm_qos_object *o, struct plist_node *node, + int del, int value) { - s32 extreme_value; - struct pm_qos_request_list *node; unsigned long flags; - int call_notifier = 0; + int prev_value, curr_value; spin_lock_irqsave(&pm_qos_lock, flags); - extreme_value = pm_qos_array[pm_qos_class]->default_value; - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requests.list, list) { - extreme_value = pm_qos_array[pm_qos_class]->comparitor( - extreme_value, node->value); - } - if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != - extreme_value) { - call_notifier = 1; - atomic_set(&pm_qos_array[pm_qos_class]->target_value, - extreme_value); - pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, - atomic_read(&pm_qos_array[pm_qos_class]->target_value)); + prev_value = pm_qos_get_value(o); + /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ + if (value != PM_QOS_DEFAULT_VALUE) { + /* + * to change the list, we atomically remove, reinit + * with new value and add, then see if the extremal + * changed + */ + plist_del(node, &o->requests); + plist_node_init(node, value); + plist_add(node, &o->requests); + } else if (del) { + plist_del(node, &o->requests); + } else { + plist_add(node, &o->requests); } + curr_value = pm_qos_get_value(o); spin_unlock_irqrestore(&pm_qos_lock, flags); - if (call_notifier) - blocking_notifier_call_chain( - pm_qos_array[pm_qos_class]->notifiers, - (unsigned long) extreme_value, NULL); + if (prev_value != curr_value) + blocking_notifier_call_chain(o->notifiers, + (unsigned long)curr_value, + NULL); } static int register_pm_qos_misc(struct pm_qos_object *qos) @@ -196,10 +193,23 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_request(int pm_qos_class) { - return atomic_read(&pm_qos_array[pm_qos_class]->target_value); + unsigned long flags; + int value; + + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(pm_qos_array[pm_qos_class]); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return value; } EXPORT_SYMBOL_GPL(pm_qos_request); +int pm_qos_request_active(struct pm_qos_request_list *req) +{ + return req->pm_qos_class != 0; +} +EXPORT_SYMBOL_GPL(pm_qos_request_active); + /** * pm_qos_add_request - inserts new qos request into the list * @pm_qos_class: identifies which list of qos request to us @@ -211,27 +221,23 @@ EXPORT_SYMBOL_GPL(pm_qos_request); * element as a handle for use in updating and removal. Call needs to save * this handle for later use. */ -struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) +void pm_qos_add_request(struct pm_qos_request_list *dep, + int pm_qos_class, s32 value) { - struct pm_qos_request_list *dep; - unsigned long flags; + struct pm_qos_object *o = pm_qos_array[pm_qos_class]; + int new_value; - dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); - if (dep) { - if (value == PM_QOS_DEFAULT_VALUE) - dep->value = pm_qos_array[pm_qos_class]->default_value; - else - dep->value = value; - dep->pm_qos_class = pm_qos_class; - - spin_lock_irqsave(&pm_qos_lock, flags); - list_add(&dep->list, - &pm_qos_array[pm_qos_class]->requests.list); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(pm_qos_class); + if (pm_qos_request_active(dep)) { + WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); + return; } - - return dep; + if (value == PM_QOS_DEFAULT_VALUE) + new_value = o->default_value; + else + new_value = value; + plist_node_init(&dep->list, new_value); + dep->pm_qos_class = pm_qos_class; + update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); } EXPORT_SYMBOL_GPL(pm_qos_add_request); @@ -246,27 +252,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request); * Attempts are made to make this code callable on hot code paths. */ void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, - s32 new_value) + s32 new_value) { - unsigned long flags; - int pending_update = 0; s32 temp; + struct pm_qos_object *o; + + if (!pm_qos_req) /*guard against callers passing in null */ + return; - if (pm_qos_req) { /*guard against callers passing in null */ - spin_lock_irqsave(&pm_qos_lock, flags); - if (new_value == PM_QOS_DEFAULT_VALUE) - temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value; - else - temp = new_value; - - if (temp != pm_qos_req->value) { - pending_update = 1; - pm_qos_req->value = temp; - } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_req->pm_qos_class); + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + return; } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + + if (new_value == PM_QOS_DEFAULT_VALUE) + temp = o->default_value; + else + temp = new_value; + + if (temp != pm_qos_req->list.prio) + update_target(o, &pm_qos_req->list, 0, temp); } EXPORT_SYMBOL_GPL(pm_qos_update_request); @@ -280,19 +287,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request); */ void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) { - unsigned long flags; - int qos_class; + struct pm_qos_object *o; if (pm_qos_req == NULL) return; /* silent return to keep pcm code cleaner */ - qos_class = pm_qos_req->pm_qos_class; - spin_lock_irqsave(&pm_qos_lock, flags); - list_del(&pm_qos_req->list); - kfree(pm_qos_req); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(qos_class); + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + return; + } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); + memset(pm_qos_req, 0, sizeof(*pm_qos_req)); } EXPORT_SYMBOL_GPL(pm_qos_remove_request); @@ -340,8 +348,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - filp->private_data = (void *) pm_qos_add_request(pm_qos_class, - PM_QOS_DEFAULT_VALUE); + struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req)); + if (!req) + return -ENOMEM; + + pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; if (filp->private_data) return 0; @@ -353,8 +365,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) { struct pm_qos_request_list *req; - req = (struct pm_qos_request_list *)filp->private_data; + req = filp->private_data; pm_qos_remove_request(req); + kfree(req); return 0; } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9829646d399c..f66bdd33a6c6 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { - struct sighand_struct *sighand; - struct signal_struct *sig; + struct signal_struct *sig = tsk->signal; struct task_struct *t; - *times = INIT_CPUTIME; + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; rcu_read_lock(); - sighand = rcu_dereference(tsk->sighand); - if (!sighand) + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) goto out; - sig = tsk->signal; - t = tsk; do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); times->sum_exec_runtime += t->se.sum_exec_runtime; - - t = next_thread(t); - } while (t != tsk); - - times->utime = cputime_add(times->utime, sig->utime); - times->stime = cputime_add(times->stime, sig->stime); - times->sum_exec_runtime += sig->sum_sched_runtime; + } while_each_thread(tsk, t); out: rcu_read_unlock(); } @@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - /* tsk == current, ensure it is safe to use ->signal/sighand */ - if (unlikely(tsk->exit_state)) - return 0; - if (!task_cputime_zero(&tsk->cputime_expires)) { struct task_cputime task_sample = { .utime = tsk->utime, @@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (sig->cputimer.running) { struct task_cputime group_sample; - thread_group_cputimer(tsk, &group_sample); + spin_lock(&sig->cputimer.lock); + group_sample = sig->cputimer.cputime; + spin_unlock(&sig->cputimer.lock); + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } @@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; + unsigned long flags; BUG_ON(!irqs_disabled()); @@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) if (!fastpath_timer_check(tsk)) return; - spin_lock(&tsk->sighand->siglock); + if (!lock_task_sighand(tsk, &flags)) + return; /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) * that gets the timer lock before we do will give it up and * spin until we've taken care of that timer below. */ - spin_unlock(&tsk->sighand->siglock); + unlock_task_sighand(tsk, &flags); /* * Now that all the timers on our list have the firing flag, diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ad723420acc3..9ca4973f736d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->it_clock = which_clock; new_timer->it_overrun = -1; - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { - error = -EFAULT; - goto out; - } if (timer_event_spec) { if (copy_from_user(&event, timer_event_spec, sizeof (event))) { error = -EFAULT; @@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; + if (copy_to_user(created_timer_id, + &new_timer_id, sizeof (new_timer_id))) { + error = -EFAULT; + goto out; + } + error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); if (error) goto out; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aa9e916da4d5..8dc31e02ae12 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * * This file is released under the GPLv2. @@ -277,7 +277,7 @@ static int create_image(int platform_mode) goto Enable_irqs; } - if (hibernation_test(TEST_CORE)) + if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) goto Power_up; in_suspend = 1; @@ -288,8 +288,10 @@ static int create_image(int platform_mode) error); /* Restore control flow magically appears here */ restore_processor_state(); - if (!in_suspend) + if (!in_suspend) { + events_check_enabled = false; platform_leave(platform_mode); + } Power_up: sysdev_resume(); @@ -328,7 +330,7 @@ int hibernation_snapshot(int platform_mode) error = platform_begin(platform_mode); if (error) - return error; + goto Close; /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); @@ -511,18 +513,24 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); + if (!pm_check_wakeup_events()) { + error = -EAGAIN; + goto Power_up; + } + hibernation_ops->enter(); /* We should never get here */ while (1); - /* - * We don't need to reenable the nonboot CPUs or resume consoles, since - * the system is going to be halted anyway. - */ + Power_up: + sysdev_resume(); + local_irq_enable(); + enable_nonboot_cpus(); + Platform_finish: hibernation_ops->finish(); - dpm_suspend_noirq(PMSG_RESTORE); + dpm_resume_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index b58800b21fc0..62b0bc6e4983 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(state); +#ifdef CONFIG_PM_SLEEP +/* + * The 'wakeup_count' attribute, along with the functions defined in + * drivers/base/power/wakeup.c, provides a means by which wakeup events can be + * handled in a non-racy way. + * + * If a wakeup event occurs when the system is in a sleep state, it simply is + * woken up. In turn, if an event that would wake the system up from a sleep + * state occurs when it is undergoing a transition to that sleep state, the + * transition should be aborted. Moreover, if such an event occurs when the + * system is in the working state, an attempt to start a transition to the + * given sleep state should fail during certain period after the detection of + * the event. Using the 'state' attribute alone is not sufficient to satisfy + * these requirements, because a wakeup event may occur exactly when 'state' + * is being written to and may be delivered to user space right before it is + * frozen, so the event will remain only partially processed until the system is + * woken up by another event. In particular, it won't cause the transition to + * a sleep state to be aborted. + * + * This difficulty may be overcome if user space uses 'wakeup_count' before + * writing to 'state'. It first should read from 'wakeup_count' and store + * the read value. Then, after carrying out its own preparations for the system + * transition to a sleep state, it should write the stored value to + * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it + * is allowed to write to 'state', but the transition will be aborted if there + * are any wakeup events detected after 'wakeup_count' was written to. + */ + +static ssize_t wakeup_count_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned long val; + + return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; +} + +static ssize_t wakeup_count_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (sscanf(buf, "%lu", &val) == 1) { + if (pm_save_wakeup_count(val)) + return n; + } + return -EINVAL; +} + +power_attr(wakeup_count); +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -236,6 +290,7 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, + &wakeup_count_attr.attr, #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif diff --git a/kernel/power/process.c b/kernel/power/process.c index 71ae29052ab6..028a99598f49 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -15,6 +15,7 @@ #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/delay.h> +#include <linux/workqueue.h> /* * Timeout for stopping processes @@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only) struct task_struct *g, *p; unsigned long end_time; unsigned int todo; + bool wq_busy = false; struct timeval start, end; u64 elapsed_csecs64; unsigned int elapsed_csecs; @@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only) do_gettimeofday(&start); end_time = jiffies + TIMEOUT; + + if (!sig_only) + freeze_workqueues_begin(); + while (true) { todo = 0; read_lock(&tasklist_lock); @@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); + + if (!sig_only) { + wq_busy = freeze_workqueues_busy(); + todo += wq_busy; + } + if (!todo || time_after(jiffies, end_time)) break; @@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only) */ printk("\n"); printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " - "(%d tasks refusing to freeze):\n", - elapsed_csecs / 100, elapsed_csecs % 100, todo); + "(%d tasks refusing to freeze, wq_busy=%d):\n", + elapsed_csecs / 100, elapsed_csecs % 100, + todo - wq_busy, wq_busy); + + thaw_workqueues(); + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); @@ -157,6 +173,7 @@ void thaw_processes(void) oom_killer_enable(); printk("Restarting tasks ... "); + thaw_workqueues(); thaw_tasks(true); thaw_tasks(false); schedule(); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 25ce010e9f8b..f6cd6faf84fd 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -3,7 +3,7 @@ * * This file provides system snapshot/restore functionality for swsusp. * - * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f37cb7dd4402..7335952ee473 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) - return error; + goto Platform_finish; } error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Platfrom_finish; + goto Platform_finish; } if (suspend_ops->prepare_late) { error = suspend_ops->prepare_late(); if (error) - goto Power_up_devices; + goto Platform_wake; } if (suspend_test(TEST_PLATFORM)) @@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state) error = sysdev_suspend(PMSG_SUSPEND); if (!error) { - if (!suspend_test(TEST_CORE)) + if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { error = suspend_ops->enter(state); + events_check_enabled = false; + } sysdev_resume(); } @@ -178,10 +180,9 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->wake) suspend_ops->wake(); - Power_up_devices: dpm_resume_noirq(PMSG_RESUME); - Platfrom_finish: + Platform_finish: if (suspend_ops->finish) suspend_ops->finish(); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b0bb21778391..e6a5bdf61a37 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -4,7 +4,7 @@ * This file provides functions for reading the suspend image from * and writing it to a swap partition. * - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -32,7 +32,7 @@ /* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. + * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. * These structures are stored on the swap and linked together with the * help of the .next_swap member. * @@ -148,7 +148,7 @@ sector_t alloc_swapdev_block(int swap) /** * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entres had been + * It also frees the extents used to register which swap entries had been * allocated. */ diff --git a/kernel/printk.c b/kernel/printk.c index 444b770c9595..4ab0164bcf84 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -37,6 +37,8 @@ #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> +#include <linux/cpu.h> +#include <linux/notifier.h> #include <asm/uaccess.h> @@ -985,6 +987,32 @@ void resume_console(void) } /** + * console_cpu_notify - print deferred console messages after CPU hotplug + * @self: notifier struct + * @action: CPU hotplug event + * @hcpu: unused + * + * If printk() is called from a CPU that is not online yet, the messages + * will be spooled but will not show up on the console. This function is + * called when a new CPU comes online (or fails to come up), and ensures + * that any such output gets printed. + */ +static int __cpuinit console_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_DEAD: + case CPU_DYING: + case CPU_DOWN_FAILED: + case CPU_UP_CANCELED: + acquire_console_sem(); + release_console_sem(); + } + return NOTIFY_OK; +} + +/** * acquire_console_sem - lock the console system for exclusive use. * * Acquires a semaphore which guarantees that the caller has @@ -1371,7 +1399,7 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); -static int __init disable_boot_consoles(void) +static int __init printk_late_init(void) { struct console *con; @@ -1382,9 +1410,10 @@ static int __init disable_boot_consoles(void) unregister_console(con); } } + hotcpu_notifier(console_cpu_notify, 0); return 0; } -late_initcall(disable_boot_consoles); +late_initcall(printk_late_init); #if defined CONFIG_PRINTK diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 72a8dc9567f5..4d169835fb36 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void) } EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); #endif /* #ifdef CONFIG_PROVE_RCU */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static inline void debug_init_rcu_head(struct rcu_head *head) +{ + debug_object_init(head, &rcuhead_debug_descr); +} + +static inline void debug_rcu_head_free(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_init(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + * Activation is performed internally by call_rcu(). + */ +static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. We just make sure that it is + * tracked in the object tracker. + */ + debug_object_init(head, &rcuhead_debug_descr); + debug_object_activate(head, &rcuhead_debug_descr); + return 0; + + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_activate(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ +#ifndef CONFIG_PREEMPT + WARN_ON(1); + return 0; +#else + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_free(head, &rcuhead_debug_descr); + return 1; +#endif + default: + return 0; + } +} + +/** + * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects of a new rcu_head structure that + * has been allocated as an auto variable on the stack. This function + * is not required for rcu_head structures that are statically defined or + * that are dynamically allocated on the heap. This function has no + * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void init_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_init_on_stack(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); + +/** + * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects that an on-stack rcu_head structure + * is about to go out of scope. As with init_rcu_head_on_stack(), this + * function is not required for rcu_head structures that are statically + * defined or that are dynamically allocated on the heap. Also as with + * init_rcu_head_on_stack(), this function has no effect for + * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void destroy_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); + +struct debug_obj_descr rcuhead_debug_descr = { + .name = "rcu_head", + .fixup_init = rcuhead_fixup_init, + .fixup_activate = rcuhead_fixup_activate, + .fixup_free = rcuhead_fixup_free, +}; +EXPORT_SYMBOL_GPL(rcuhead_debug_descr); +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 38729d3cd236..196ec02f8be0 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; } @@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head, { unsigned long flags; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 6535ac8bc6a5..2e2726d790b9 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -239,8 +239,7 @@ static unsigned long rcu_random(struct rcu_random_state *rrsp) { if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += - (unsigned long)cpu_clock(raw_smp_processor_id()); + rrsp->rrs_state += (unsigned long)local_clock(); rrsp->rrs_count = RCU_RANDOM_REFRESH; } rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d4437345706f..d5bc43976c5a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; if (++count >= rdp->blimit) @@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; diff --git a/kernel/sched.c b/kernel/sched.c index f52a8801b7a2..41541d79e3c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -77,6 +77,7 @@ #include <asm/irq_regs.h> #include "sched_cpupri.h" +#include "workqueue_sched.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -456,9 +457,10 @@ struct rq { unsigned long nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ u64 nohz_stamp; - unsigned char in_nohz_recently; + unsigned char nohz_balance_kick; #endif unsigned int skip_clock_update; @@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu) #ifdef CONFIG_NO_HZ /* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) + if (!idle_cpu(i)) + return i; + } + return cpu; +} +/* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event * which is scheduled to wake up that CPU. In case of a completely @@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } -int nohz_ratelimit(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 diff = rq->clock - rq->nohz_stamp; - - rq->nohz_stamp = rq->clock; - - return diff < (NSEC_PER_SEC / HZ) >> 1; -} - #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) @@ -1652,7 +1665,7 @@ static void update_shares(struct sched_domain *sd) if (root_task_group_empty()) return; - now = cpu_clock(raw_smp_processor_id()); + now = local_clock(); elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { @@ -1805,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); +static void update_cpu_load(struct rq *this_rq); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -2267,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample) } #endif -/*** +static inline void ttwu_activate(struct task_struct *p, struct rq *rq, + bool is_sync, bool is_migrate, bool is_local, + unsigned long en_flags) +{ + schedstat_inc(p, se.statistics.nr_wakeups); + if (is_sync) + schedstat_inc(p, se.statistics.nr_wakeups_sync); + if (is_migrate) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + if (is_local) + schedstat_inc(p, se.statistics.nr_wakeups_local); + else + schedstat_inc(p, se.statistics.nr_wakeups_remote); + + activate_task(rq, p, en_flags); +} + +static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, + int wake_flags, bool success) +{ + trace_sched_wakeup(p, success); + check_preempt_curr(rq, p, wake_flags); + + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } +#endif + /* if a worker is waking up, notify workqueue */ + if ((p->flags & PF_WQ_WORKER) && success) + wq_worker_waking_up(p, cpu_of(rq)); +} + +/** * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread + * @p: the thread to be awakened * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? + * @wake_flags: wake modifier flags (WF_*) * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -2279,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample) * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * - * returns failure only if the task is already active. + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) @@ -2359,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, out_activate: #endif /* CONFIG_SMP */ - schedstat_inc(p, se.statistics.nr_wakeups); - if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (orig_cpu != cpu) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - if (cpu == this_cpu) - schedstat_inc(p, se.statistics.nr_wakeups_local); - else - schedstat_inc(p, se.statistics.nr_wakeups_remote); - activate_task(rq, p, en_flags); + ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, + cpu == this_cpu, en_flags); success = 1; - out_running: - trace_sched_wakeup(p, success); - check_preempt_curr(rq, p, wake_flags); - - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_woken) - p->sched_class->task_woken(rq, p); - - if (unlikely(rq->idle_stamp)) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; - - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; - } -#endif + ttwu_post_activation(p, rq, wake_flags, success); out: task_rq_unlock(rq, &flags); put_cpu(); @@ -2399,6 +2431,37 @@ out: } /** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not alredy there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. this_rq() stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + bool success = false; + + BUG_ON(rq != this_rq()); + BUG_ON(p == current); + lockdep_assert_held(&rq->lock); + + if (!(p->state & TASK_NORMAL)) + return; + + if (!p->se.on_rq) { + if (likely(!task_running(rq, p))) { + schedstat_inc(rq, ttwu_count); + schedstat_inc(rq, ttwu_local); + } + ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); + success = true; + } + ttwu_post_activation(p, rq, 0, success); +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -3012,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq) } /* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. */ static void update_cpu_load(struct rq *this_rq) { unsigned long this_load = this_rq->load.weight; + unsigned long curr_jiffies = jiffies; + unsigned long pending_updates; int i, scale; this_rq->nr_load_updates++; + /* Avoid repeated calls on same jiffy, when moving in and out of idle */ + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + /* Update our load: */ - for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { unsigned long old_load, new_load; /* scale is effectively 1 << i now, and >> i divides by scale */ old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -3036,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq) * example. */ if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } +} + +static void update_cpu_load_active(struct rq *this_rq) +{ + update_cpu_load(this_rq); calc_load_account_active(this_rq); } @@ -3426,7 +3574,7 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); - update_cpu_load(rq); + update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); @@ -3598,7 +3746,6 @@ need_resched: rq = cpu_rq(cpu); rcu_note_context_switch(cpu); prev = rq->curr; - switch_count = &prev->nivcsw; release_kernel_lock(prev); need_resched_nonpreemptible: @@ -3611,11 +3758,26 @@ need_resched_nonpreemptible: raw_spin_lock_irq(&rq->lock); clear_tsk_need_resched(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) + if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; - else + } else { + /* + * If a worker is going to sleep, notify and + * ask workqueue whether it wants to wake up a + * task to maintain concurrency. If so, wake + * up the task. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } deactivate_task(rq, prev, DEQUEUE_SLEEP); + } switch_count = &prev->nvcsw; } @@ -3637,8 +3799,10 @@ need_resched_nonpreemptible: context_switch(rq, prev, next); /* unlocks the rq */ /* - * the context switch might have flipped the stack from under - * us, hence refresh the local variables. + * The context switch have flipped the stack from under us + * and restored the local variables which were saved when + * this task called schedule() in the past. prev == current + * is still correct, but it can be moved to another cpu/rq. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3647,11 +3811,8 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - prev = rq->curr; - switch_count = &prev->nivcsw; + if (unlikely(reacquire_kernel_lock(prev))) goto need_resched_nonpreemptible; - } preempt_enable_no_resched(); if (need_resched()) @@ -3726,7 +3887,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void __sched preempt_schedule(void) +asmlinkage void __sched notrace preempt_schedule(void) { struct thread_info *ti = current_thread_info(); @@ -3738,9 +3899,9 @@ asmlinkage void __sched preempt_schedule(void) return; do { - add_preempt_count(PREEMPT_ACTIVE); + add_preempt_count_notrace(PREEMPT_ACTIVE); schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count_notrace(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -4441,12 +4602,8 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { - unsigned long rlim_rtprio; - - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - unlock_task_sighand(p, &flags); + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); /* can't set/change the rt policy */ if (policy != p->policy && !rlim_rtprio) @@ -5816,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, - .priority = 10 + .priority = CPU_PRI_MIGRATION, }; +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + set_cpu_active((long)hcpu, true); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + set_cpu_active((long)hcpu, false); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; - /* Start one for the boot CPU: */ + /* Initialize migration for the boot CPU */ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + /* Register cpu active notifiers */ + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + return 0; } early_initcall(migration_init); @@ -6064,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd) { - gfp_t gfp = GFP_KERNEL; - memset(rd, 0, sizeof(*rd)); - if (bootmem) - gfp = GFP_NOWAIT; - - if (!alloc_cpumask_var(&rd->span, gfp)) + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, gfp)) + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, gfp)) + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_online; - if (cpupri_init(&rd->cpupri, bootmem) != 0) + if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; return 0; @@ -6096,7 +6277,7 @@ out: static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain, true); + init_rootdomain(&def_root_domain); atomic_set(&def_root_domain.refcount, 1); } @@ -6109,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - if (init_rootdomain(rd, false) != 0) { + if (init_rootdomain(rd) != 0) { kfree(rd); return NULL; } @@ -7288,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -#ifndef CONFIG_CPUSETS /* - * Add online and remove offline CPUs from the scheduler domains. - * When cpusets are enabled they take over this function. + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). */ -static int update_sched_domains(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, + void *hcpu) { - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - partition_sched_domains(1, NULL, NULL); + cpuset_update_active_cpus(); return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(); + return NOTIFY_OK; default: return NOTIFY_DONE; } } -#endif static int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -7356,10 +7543,8 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); -#ifndef CONFIG_CPUSETS - /* XXX: Theoretical race here - CPU may be hotplugged now */ - hotcpu_notifier(update_sched_domains, 0); -#endif + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); /* RT runtime code needs to handle some hotplug events */ hotcpu_notifier(update_runtime, 0); @@ -7604,6 +7789,9 @@ void __init sched_init(void) for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; + + rq->last_load_update_tick = jiffies; + #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -7617,6 +7805,10 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ + rq->nohz_balance_kick = 0; + init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -7661,8 +7853,11 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); + atomic_set(&nohz.load_balancer, nr_cpu_ids); + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); #endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 906a0f718cb3..52f1a149bfb1 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -10,19 +10,55 @@ * Ingo Molnar <mingo@redhat.com> * Guillaume Chazarain <guichaz@gmail.com> * - * Create a semi stable clock from a mixture of other events, including: - * - gtod + * + * What: + * + * cpu_clock(i) provides a fast (execution time) high resolution + * clock with bounded drift between CPUs. The value of cpu_clock(i) + * is monotonic for constant i. The timestamp returned is in nanoseconds. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + * + * There is no strict promise about the base, although it tends to start + * at 0 on boot (but people really shouldn't rely on that). + * + * cpu_clock(i) -- can be used from any context, including NMI. + * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) + * local_clock() -- is cpu_clock() on the current cpu. + * + * How: + * + * The implementation either uses sched_clock() when + * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the + * sched_clock() is assumed to provide these properties (mostly it means + * the architecture provides a globally synchronized highres time source). + * + * Otherwise it tries to create a semi stable clock from a mixture of other + * clocks, including: + * + * - GTOD (clock monotomic) * - sched_clock() * - explicit idle events * - * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. + * We use GTOD as base and use sched_clock() deltas to improve resolution. The + * deltas are filtered to provide monotonicity and keeping it within an + * expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 2 jiffies difference). + * + * Notes: + * + * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things + * like cpufreq interrupts that can change the base clock (TSC) multiplier + * and cause funny jumps in time -- although the filtering provided by + * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it + * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on + * sched_clock(). */ #include <linux/spinlock.h> #include <linux/hardirq.h> @@ -170,6 +206,11 @@ again: return val; } +/* + * Similar to cpu_clock(), but requires local IRQs to be disabled. + * + * See cpu_clock(). + */ u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; @@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -unsigned long long cpu_clock(int cpu) +/* + * As outlined at the top, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +u64 cpu_clock(int cpu) { - unsigned long long clock; + u64 clock; unsigned long flags; local_irq_save(flags); @@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu) return clock; } +/* + * Similar to cpu_clock() for the current cpu. Time will only be observed + * to be monotonic if care is taken to only compare timestampt taken on the + * same CPU. + * + * See cpu_clock(). + */ +u64 local_clock(void) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(smp_processor_id()); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -unsigned long long cpu_clock(int cpu) +u64 cpu_clock(int cpu) { return sched_clock_cpu(cpu); } +u64 local_clock(void) +{ + return sched_clock_cpu(0); +} + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ EXPORT_SYMBOL_GPL(cpu_clock); +EXPORT_SYMBOL_GPL(local_clock); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6871cb3fc83..2722dc1b4138 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. */ -int cpupri_init(struct cpupri *cp, bool bootmem) +int cpupri_init(struct cpupri *cp) { - gfp_t gfp = GFP_KERNEL; int i; - if (bootmem) - gfp = GFP_NOWAIT; - memset(cp, 0, sizeof(*cp)); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { @@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) raw_spin_lock_init(&vec->lock); vec->count = 0; - if (!zalloc_cpumask_var(&vec->mask, gfp)) + if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 7cb5bb6b95be..9fc7d386fea4 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -27,7 +27,7 @@ struct cpupri { int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -int cpupri_init(struct cpupri *cp, bool bootmem); +int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); #else #define cpupri_set(cp, cpu, pri) do { } while (0) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 35565395d00d..2e1b0d17dd9b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); - PN(sysctl_sched_child_runs_first); + P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN #undef P diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a878b5332daa..806d1b227a21 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); - else - power *= default_scale_freq_power(sd, cpu); - - power >>= SCHED_LOAD_SHIFT; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) power *= arch_scale_smt_power(sd, cpu); @@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_LOAD_SHIFT; } + sdg->cpu_power_orig = power; + + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + power *= scale_rt_power(cpu); power >>= SCHED_LOAD_SHIFT; @@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu) sdg->cpu_power = power; } +/* + * Try and fix up capacity for tiny siblings, this is needed when + * things like SD_ASYM_PACKING need f_b_g to select another sibling + * which on its own isn't powerful enough. + * + * See update_sd_pick_busiest() and check_asym_packing(). + */ +static inline int +fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +{ + /* + * Only siblings can have significantly less than SCHED_LOAD_SCALE + */ + if (sd->level != SD_LV_SIBLING) + return 0; + + /* + * If ~90% of the cpu_power is still there, we're good. + */ + if (group->cpu_power * 32 > group->cpu_power_orig * 29) + return 1; + + return 0; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @sd: The sched_domain whose statistics are to be updated. @@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ - if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu) { - *balance = 0; - return; + if (idle != CPU_NEWLY_IDLE && local_group) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); } - update_group_power(sd, this_cpu); - /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; @@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(sd, group); +} + +/** + * update_sd_pick_busiest - return 1 on busiest group + * @sd: sched_domain whose statistics are to be checked + * @sds: sched_domain statistics + * @sg: sched_group candidate to be checked for being the busiest + * @sgs: sched_group statistics + * @this_cpu: the current cpu + * + * Determine if @sg is a busier group than the previously selected + * busiest group. + */ +static bool update_sd_pick_busiest(struct sched_domain *sd, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs, + int this_cpu) +{ + if (sgs->avg_load <= sds->max_load) + return false; + + if (sgs->sum_nr_running > sgs->group_capacity) + return true; + + if (sgs->group_imb) + return true; + + /* + * ASYM_PACKING needs to move all the work to the lowest + * numbered CPUs in the group, therefore mark all groups + * higher than ourself as busy. + */ + if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + this_cpu < group_first_cpu(sg)) { + if (!sds->busiest) + return true; + + if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + return true; + } + + return false; } /** @@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing group. + * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. @@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; - struct sched_group *group = sd->groups; + struct sched_group *sg = sd->groups; struct sg_lb_stats sgs; int load_idx, prefer_sibling = 0; @@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, do { int local_group; - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) return; sds->total_load += sgs.group_load; - sds->total_pwr += group->cpu_power; + sds->total_pwr += sg->cpu_power; /* * In case the child domain prefers tasks go to siblings - * first, lower the group capacity to one so that we'll try + * first, lower the sg capacity to one so that we'll try * and move all the excess tasks away. */ if (prefer_sibling) @@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, if (local_group) { sds->this_load = sgs.avg_load; - sds->this = group; + sds->this = sg; sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > sds->max_load && - (sgs.sum_nr_running > sgs.group_capacity || - sgs.group_imb)) { + } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; - sds->busiest = group; + sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; sds->group_imb = sgs.group_imb; } - update_sd_power_savings_stats(group, sds, local_group, &sgs); - group = group->next; - } while (group != sd->groups); + update_sd_power_savings_stats(sg, sds, local_group, &sgs); + sg = sg->next; + } while (sg != sd->groups); +} + +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} + +/** + * check_asym_packing - Check to see if the group is packed into the + * sched doman. + * + * This is primarily intended to used at the sibling level. Some + * cores like POWER7 prefer to use lower numbered SMT threads. In the + * case of POWER7, it can move to lower SMT modes only when higher + * threads are idle. When in lower SMT modes, the threads will + * perform better since they share less core resources. Hence when we + * have idle threads, we want them to be the higher ones. + * + * This packing function is run on idle threads. It checks to see if + * the busiest CPU in this domain (core in the P7 case) has a higher + * CPU number than the packing function is being run on. Here we are + * assuming lower CPU number will be equivalent to lower a SMT thread + * number. + * + * Returns 1 when packing is required and a task should be moved to + * this CPU. The amount of the imbalance is returned in *imbalance. + * + * @sd: The sched_domain whose packing is to be checked. + * @sds: Statistics of the sched_domain which is to be packed + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: returns amount of imbalanced due to packing. + */ +static int check_asym_packing(struct sched_domain *sd, + struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + int busiest_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + return 0; + + if (!sds->busiest) + return 0; + + busiest_cpu = group_first_cpu(sds->busiest); + if (this_cpu > busiest_cpu) + return 0; + + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + SCHED_LOAD_SCALE); + return 1; } /** @@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!(*balance)) goto ret; + if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && + check_asym_packing(sd, &sds, this_cpu, imbalance)) + return sds.busiest; + if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; @@ -2726,8 +2850,9 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const struct cpumask *cpus) +find_busiest_queue(struct sched_domain *sd, struct sched_group *group, + enum cpu_idle_type idle, unsigned long imbalance, + const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); unsigned long wl; + if (!capacity) + capacity = fix_small_capacity(sd, group); + if (!cpumask_test_cpu(i, cpus)) continue; @@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) +static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, + int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { + + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * higher numbered CPUs in order to pack all tasks in the + * lowest numbered CPUs. + */ + if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) + return 1; + /* * The only task running in a non-idle cpu can be moved to this * cpu in an attempt to completely freeup the other CPU @@ -2854,7 +2992,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, cpus); + busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2898,7 +3036,8 @@ redo: schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle)) { + if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), + this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -3093,13 +3232,40 @@ out_unlock: } #ifdef CONFIG_NO_HZ + +static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); + +static void trigger_sched_softirq(void *data) +{ + raise_softirq_irqoff(SCHED_SOFTIRQ); +} + +static inline void init_sched_softirq_csd(struct call_single_data *csd) +{ + csd->func = trigger_sched_softirq; + csd->info = NULL; + csd->flags = 0; + csd->priv = 0; +} + +/* + * idle load balancing details + * - One of the idle CPUs nominates itself as idle load_balancer, while + * entering idle. + * - This idle load balancer CPU will also go into tickless mode when + * it is idle, just like all other idle CPUs + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. + */ static struct { atomic_t load_balancer; - cpumask_var_t cpu_mask; - cpumask_var_t ilb_grp_nohz_mask; -} nohz ____cacheline_aligned = { - .load_balancer = ATOMIC_INIT(-1), -}; + atomic_t first_pick_cpu; + atomic_t second_pick_cpu; + cpumask_var_t idle_cpus_mask; + cpumask_var_t grp_idle_mask; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; int get_nohz_load_balancer(void) { @@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) */ static inline int is_semi_idle_group(struct sched_group *ilb_group) { - cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, sched_group_cpus(ilb_group)); /* * A sched_group is semi-idle when it has atleast one busy cpu * and atleast one idle cpu. */ - if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + if (cpumask_empty(nohz.grp_idle_mask)) return 0; - if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) return 0; return 1; @@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu) * Optimize for the case when we have no idle CPUs or only one * idle CPU. Don't walk the sched_domain hierarchy in such cases */ - if (cpumask_weight(nohz.cpu_mask) < 2) + if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { @@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu) do { if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.ilb_grp_nohz_mask); + return cpumask_first(nohz.grp_idle_mask); ilb_group = ilb_group->next; @@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu) } out_done: - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) { - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #endif /* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void nohz_balancer_kick(int cpu) +{ + int ilb_cpu; + + nohz.next_balance++; + + ilb_cpu = get_nohz_load_balancer(); + + if (ilb_cpu >= nr_cpu_ids) { + ilb_cpu = cpumask_first(nohz.idle_cpus_mask); + if (ilb_cpu >= nr_cpu_ids) + return; + } + + if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { + struct call_single_data *cp; + + cpu_rq(ilb_cpu)->nohz_balance_kick = 1; + cp = &per_cpu(remote_sched_softirq_cb, cpu); + __smp_call_function_single(ilb_cpu, cp, 0); + } + return; +} + +/* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. If all the cpus in the system - * go into this tickless mode, then there will be no ilb owner (as there is - * no need for one) and all the cpus will sleep till the next wakeup event - * arrives... - * - * For the ilb owner, tick is not stopped. And this tick will be used - * for idle load balancing. ilb owner will still be part of - * nohz.cpu_mask.. + * load balancing on behalf of all those cpus. * - * While stopping the tick, this cpu will become the ilb owner if there - * is no other owner. And will be the owner till that cpu becomes busy - * or if all cpus in the system stop their ticks at which point - * there is no need for ilb owner. + * When the ilb owner becomes busy, we will not have new ilb owner until some + * idle CPU wakes up and goes back to idle or some busy CPU tries to kick + * idle load balancing by kicking one of the idle CPUs. * - * When the ilb owner becomes busy, it nominates another owner, during the - * next busy scheduler_tick() + * Ticks are stopped for the ilb owner as well, with busy CPU kicking this + * ilb owner CPU in future (when there is a need for idle load balancing on + * behalf of all idle CPUs). */ -int select_nohz_load_balancer(int stop_tick) +void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); if (stop_tick) { - cpu_rq(cpu)->in_nohz_recently = 1; - if (!cpu_active(cpu)) { if (atomic_read(&nohz.load_balancer) != cpu) - return 0; + return; /* * If we are going offline and still the leader, * give up! */ - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); - return 0; + return; } - cpumask_set_cpu(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { - if (atomic_read(&nohz.load_balancer) == cpu) - atomic_set(&nohz.load_balancer, -1); - return 0; - } + if (atomic_read(&nohz.first_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); + if (atomic_read(&nohz.second_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); - if (atomic_read(&nohz.load_balancer) == -1) { - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) - return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { int new_ilb; - if (!(sched_smt_power_savings || - sched_mc_power_savings)) - return 1; + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, + cpu) != nr_cpu_ids) + return; + /* * Check to see if there is a more power-efficient * ilb. */ new_ilb = find_new_ilb(cpu); if (new_ilb < nr_cpu_ids && new_ilb != cpu) { - atomic_set(&nohz.load_balancer, -1); + atomic_set(&nohz.load_balancer, nr_cpu_ids); resched_cpu(new_ilb); - return 0; + return; } - return 1; + return; } } else { - if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) - return 0; + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return; - cpumask_clear_cpu(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); } - return 0; + return; } #endif @@ -3385,11 +3569,102 @@ out: rq->next_balance = next_balance; } +#ifdef CONFIG_NO_HZ /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * In CONFIG_NO_HZ case, the idle load balance owner will do the + * In CONFIG_NO_HZ case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +{ + struct rq *this_rq = cpu_rq(this_cpu); + struct rq *rq; + int balance_cpu; + + if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) + return; + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + this_rq->nohz_balance_kick = 0; + break; + } + + raw_spin_lock_irq(&this_rq->lock); + update_rq_clock(this_rq); + update_cpu_load(this_rq); + raw_spin_unlock_irq(&this_rq->lock); + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + nohz.next_balance = this_rq->next_balance; + this_rq->nohz_balance_kick = 0; +} + +/* + * Current heuristic for kicking the idle load balancer + * - first_pick_cpu is the one of the busy CPUs. It will kick + * idle load balancer when it has more than one process active. This + * eliminates the need for idle load balancing altogether when we have + * only one running process in the system (common case). + * - If there are more than one busy CPU, idle load balancer may have + * to run for active_load_balance to happen (i.e., two busy CPUs are + * SMT or core siblings and can run better if they move to different + * physical CPUs). So, second_pick_cpu is the second of the busy CPUs + * which will kick idle load balancer as soon as it has any load. + */ +static inline int nohz_kick_needed(struct rq *rq, int cpu) +{ + unsigned long now = jiffies; + int ret; + int first_pick_cpu, second_pick_cpu; + + if (time_before(now, nohz.next_balance)) + return 0; + + if (!rq->nr_running) + return 0; + + first_pick_cpu = atomic_read(&nohz.first_pick_cpu); + second_pick_cpu = atomic_read(&nohz.second_pick_cpu); + + if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && + second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + return 0; + + ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (rq->nr_running > 1) + return 1; + } else { + ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + if (rq->nr_running) + return 1; + } + } + return 0; +} +#else +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +#endif + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ static void run_rebalance_domains(struct softirq_action *h) { int this_cpu = smp_processor_id(); @@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h) rebalance_domains(this_cpu, idle); -#ifdef CONFIG_NO_HZ /* - * If this cpu is the owner for idle load balancing, then do the + * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { - struct rq *rq; - int balance_cpu; - - for_each_cpu(balance_cpu, nohz.cpu_mask) { - if (balance_cpu == this_cpu) - continue; - - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; - - rebalance_domains(balance_cpu, CPU_IDLE); - - rq = cpu_rq(balance_cpu); - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; - } - } -#endif + nohz_idle_balance(this_cpu, idle); } static inline int on_null_domain(int cpu) @@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. - * - * In case of CONFIG_NO_HZ, this is the place where we nominate a new - * idle load balancing owner or decide to stop the periodic load balancing, - * if the whole system is idle. */ static inline void trigger_load_balance(struct rq *rq, int cpu) { -#ifdef CONFIG_NO_HZ - /* - * If we were in the nohz mode recently and busy at the current - * scheduler tick, then check if we need to nominate new idle - * load balancer. - */ - if (rq->in_nohz_recently && !rq->idle_at_tick) { - rq->in_nohz_recently = 0; - - if (atomic_read(&nohz.load_balancer) == cpu) { - cpumask_clear_cpu(cpu, nohz.cpu_mask); - atomic_set(&nohz.load_balancer, -1); - } - - if (atomic_read(&nohz.load_balancer) == -1) { - int ilb = find_new_ilb(cpu); - - if (ilb < nr_cpu_ids) - resched_cpu(ilb); - } - } - - /* - * If this cpu is idle and doing idle load balancing for all the - * cpus with ticks stopped, is it time for that to stop? - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { - resched_cpu(cpu); - return; - } - - /* - * If this cpu is idle and the idle load balancing is done by - * someone else, then no need raise the SCHED_SOFTIRQ - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpumask_test_cpu(cpu, nohz.cpu_mask)) - return; -#endif /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); +#ifdef CONFIG_NO_HZ + else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + nohz_balancer_kick(cpu); +#endif } static void rq_online_fair(struct rq *rq) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8afb953e31c6..d10c80ebb67a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p) { unsigned long soft, hard; - if (!p->signal) - return; - /* max may change after cur was read, this will be fixed next tick */ soft = task_rlimit(p, RLIMIT_RTTIME); hard = task_rlimit_max(p, RLIMIT_RTTIME); diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 32d2bd4061b0..25c2f962f6fc 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct thread_group_cputimer *cputimer; - struct signal_struct *sig; - - sig = tsk->signal; - /* see __exit_signal()->task_rq_unlock_wait() */ - barrier(); - if (unlikely(!sig)) - return; - - cputimer = &sig->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; diff --git a/kernel/signal.c b/kernel/signal.c index 906ae5a1779c..bded65187780 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info) /* * Bad permissions for sending the signal - * - the caller must hold at least the RCU read lock + * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) @@ -1127,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long /* * send signal info to all the members of a group - * - the caller must hold the RCU read lock at least */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret = check_kill_permission(sig, info, p); + int ret; + + rcu_read_lock(); + ret = check_kill_permission(sig, info, p); + rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, true); diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c deleted file mode 100644 index e45c43645298..000000000000 --- a/kernel/slow-work-debugfs.c +++ /dev/null @@ -1,227 +0,0 @@ -/* Slow work debugging - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/slow-work.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/seq_file.h> -#include "slow-work.h" - -#define ITERATOR_SHIFT (BITS_PER_LONG - 4) -#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) -#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) - -void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) -{ - seq_puts(m, "Slow-work: New thread"); -} - -/* - * Render the time mark field on a work item into a 5-char time with units plus - * a space - */ -static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) -{ - struct timespec now, diff; - - now = CURRENT_TIME; - diff = timespec_sub(now, work->mark); - - if (diff.tv_sec < 0) - seq_puts(m, " -ve "); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) - seq_printf(m, "%3luns ", diff.tv_nsec); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) - seq_printf(m, "%3luus ", diff.tv_nsec / 1000); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) - seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); - else if (diff.tv_sec <= 1) - seq_puts(m, " 1s "); - else if (diff.tv_sec < 60) - seq_printf(m, "%4lus ", diff.tv_sec); - else if (diff.tv_sec < 60 * 60) - seq_printf(m, "%4lum ", diff.tv_sec / 60); - else if (diff.tv_sec < 60 * 60 * 24) - seq_printf(m, "%4luh ", diff.tv_sec / 3600); - else - seq_puts(m, "exces "); -} - -/* - * Describe a slow work item for debugfs - */ -static int slow_work_runqueue_show(struct seq_file *m, void *v) -{ - struct slow_work *work; - struct list_head *p = v; - unsigned long id; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); - return 0; - case 2: - seq_puts(m, "=== ===== ================ == ===== ==========\n"); - return 0; - - case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: - id = (unsigned long) v - 3; - - read_lock(&slow_work_execs_lock); - work = slow_work_execs[id]; - if (work) { - smp_read_barrier_depends(); - - seq_printf(m, "%3lu %5d %16p %2lx ", - id, slow_work_pids[id], work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - } - read_unlock(&slow_work_execs_lock); - return 0; - - default: - work = list_entry(p, struct slow_work, link); - seq_printf(m, "%3s - %16p %2lx ", - work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", - work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - return 0; - } -} - -/* - * map the iterator to a work item - */ -static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) -{ - struct list_head *p; - unsigned long count, id; - - switch (*_pos >> ITERATOR_SHIFT) { - case 0x0: - if (*_pos == 0) - *_pos = 1; - if (*_pos < 3) - return (void *)(unsigned long) *_pos; - if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) - for (id = *_pos - 3; - id < SLOW_WORK_THREAD_LIMIT; - id++, (*_pos)++) - if (slow_work_execs[id]) - return (void *)(unsigned long) *_pos; - *_pos = 0x1UL << ITERATOR_SHIFT; - - case 0x1: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &slow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - - case 0x2: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &vslow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * set up the iterator to start reading from the first line - */ -static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) -{ - spin_lock_irq(&slow_work_queue_lock); - return slow_work_runqueue_index(m, _pos); -} - -/* - * move to the next line - */ -static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) -{ - struct list_head *p = v; - unsigned long selector = *_pos >> ITERATOR_SHIFT; - - (*_pos)++; - switch (selector) { - case 0x0: - return slow_work_runqueue_index(m, _pos); - - case 0x1: - if (*_pos >> ITERATOR_SHIFT == 0x1) { - p = p->next; - if (p != &slow_work_queue) - return p; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - p = &vslow_work_queue; - - case 0x2: - if (*_pos >> ITERATOR_SHIFT == 0x2) { - p = p->next; - if (p != &vslow_work_queue) - return p; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * clean up after reading - */ -static void slow_work_runqueue_stop(struct seq_file *m, void *v) -{ - spin_unlock_irq(&slow_work_queue_lock); -} - -static const struct seq_operations slow_work_runqueue_ops = { - .start = slow_work_runqueue_start, - .stop = slow_work_runqueue_stop, - .next = slow_work_runqueue_next, - .show = slow_work_runqueue_show, -}; - -/* - * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents - */ -static int slow_work_runqueue_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slow_work_runqueue_ops); -} - -const struct file_operations slow_work_runqueue_fops = { - .owner = THIS_MODULE, - .open = slow_work_runqueue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c deleted file mode 100644 index 7d3f4fa9ef4f..000000000000 --- a/kernel/slow-work.c +++ /dev/null @@ -1,1068 +0,0 @@ -/* Worker thread pool for slow items, such as filesystem lookups or mkdirs - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - * - * See Documentation/slow-work.txt - */ - -#include <linux/module.h> -#include <linux/slow-work.h> -#include <linux/kthread.h> -#include <linux/freezer.h> -#include <linux/wait.h> -#include <linux/debugfs.h> -#include "slow-work.h" - -static void slow_work_cull_timeout(unsigned long); -static void slow_work_oom_timeout(unsigned long); - -#ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -static int slow_work_max_threads_sysctl(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -#endif - -/* - * The pool of threads has at least min threads in it as long as someone is - * using the facility, and may have as many as max. - * - * A portion of the pool may be processing very slow operations. - */ -static unsigned slow_work_min_threads = 2; -static unsigned slow_work_max_threads = 4; -static unsigned vslow_work_proportion = 50; /* % of threads that may process - * very slow work */ - -#ifdef CONFIG_SYSCTL -static const int slow_work_min_min_threads = 2; -static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; -static const int slow_work_min_vslow = 1; -static const int slow_work_max_vslow = 99; - -ctl_table slow_work_sysctls[] = { - { - .procname = "min-threads", - .data = &slow_work_min_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_min_threads_sysctl, - .extra1 = (void *) &slow_work_min_min_threads, - .extra2 = &slow_work_max_threads, - }, - { - .procname = "max-threads", - .data = &slow_work_max_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_max_threads_sysctl, - .extra1 = &slow_work_min_threads, - .extra2 = (void *) &slow_work_max_max_threads, - }, - { - .procname = "vslow-percentage", - .data = &vslow_work_proportion, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &slow_work_min_vslow, - .extra2 = (void *) &slow_work_max_vslow, - }, - {} -}; -#endif - -/* - * The active state of the thread pool - */ -static atomic_t slow_work_thread_count; -static atomic_t vslow_work_executing_count; - -static bool slow_work_may_not_start_new_thread; -static bool slow_work_cull; /* cull a thread due to lack of activity */ -static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); -static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); -static struct slow_work slow_work_new_thread; /* new thread starter */ - -/* - * slow work ID allocation (use slow_work_queue_lock) - */ -static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - -/* - * Unregistration tracking to prevent put_ref() from disappearing during module - * unload - */ -#ifdef CONFIG_MODULES -static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; -static struct module *slow_work_unreg_module; -static struct slow_work *slow_work_unreg_work_item; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); -static DEFINE_MUTEX(slow_work_unreg_sync_lock); - -static void slow_work_set_thread_processing(int id, struct slow_work *work) -{ - if (work) - slow_work_thread_processing[id] = work->owner; -} -static void slow_work_done_thread_processing(int id, struct slow_work *work) -{ - struct module *module = slow_work_thread_processing[id]; - - slow_work_thread_processing[id] = NULL; - smp_mb(); - if (slow_work_unreg_work_item == work || - slow_work_unreg_module == module) - wake_up_all(&slow_work_unreg_wq); -} -static void slow_work_clear_thread_processing(int id) -{ - slow_work_thread_processing[id] = NULL; -} -#else -static void slow_work_set_thread_processing(int id, struct slow_work *work) {} -static void slow_work_done_thread_processing(int id, struct slow_work *work) {} -static void slow_work_clear_thread_processing(int id) {} -#endif - -/* - * Data for tracking currently executing items for indication through /proc - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; -pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; -DEFINE_RWLOCK(slow_work_execs_lock); -#endif - -/* - * The queues of work items and the lock governing access to them. These are - * shared between all the CPUs. It doesn't make sense to have per-CPU queues - * as the number of threads bears no relation to the number of CPUs. - * - * There are two queues of work items: one for slow work items, and one for - * very slow work items. - */ -LIST_HEAD(slow_work_queue); -LIST_HEAD(vslow_work_queue); -DEFINE_SPINLOCK(slow_work_queue_lock); - -/* - * The following are two wait queues that get pinged when a work item is placed - * on an empty queue. These allow work items that are hogging a thread by - * sleeping in a way that could be deferred to yield their thread and enqueue - * themselves. - */ -static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); -static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); - -/* - * The thread controls. A variable used to signal to the threads that they - * should exit when the queue is empty, a waitqueue used by the threads to wait - * for signals, and a completion set by the last thread to exit. - */ -static bool slow_work_threads_should_exit; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); -static DECLARE_COMPLETION(slow_work_last_thread_exited); - -/* - * The number of users of the thread pool and its lock. Whilst this is zero we - * have no threads hanging around, and when this reaches zero, we wait for all - * active or queued work items to complete and kill all the threads we do have. - */ -static int slow_work_user_count; -static DEFINE_MUTEX(slow_work_user_lock); - -static inline int slow_work_get_ref(struct slow_work *work) -{ - if (work->ops->get_ref) - return work->ops->get_ref(work); - - return 0; -} - -static inline void slow_work_put_ref(struct slow_work *work) -{ - if (work->ops->put_ref) - work->ops->put_ref(work); -} - -/* - * Calculate the maximum number of active threads in the pool that are - * permitted to process very slow work items. - * - * The answer is rounded up to at least 1, but may not equal or exceed the - * maximum number of the threads in the pool. This means we always have at - * least one thread that can process slow work items, and we always have at - * least one thread that won't get tied up doing so. - */ -static unsigned slow_work_calc_vsmax(void) -{ - unsigned vsmax; - - vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; - vsmax /= 100; - vsmax = max(vsmax, 1U); - return min(vsmax, slow_work_max_threads - 1); -} - -/* - * Attempt to execute stuff queued on a slow thread. Return true if we managed - * it, false if there was nothing to do. - */ -static noinline bool slow_work_execute(int id) -{ - struct slow_work *work = NULL; - unsigned vsmax; - bool very_slow; - - vsmax = slow_work_calc_vsmax(); - - /* see if we can schedule a new thread to be started if we're not - * keeping up with the work */ - if (!waitqueue_active(&slow_work_thread_wq) && - (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && - atomic_read(&slow_work_thread_count) < slow_work_max_threads && - !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - - /* find something to execute */ - spin_lock_irq(&slow_work_queue_lock); - if (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax) { - work = list_entry(vslow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - atomic_inc(&vslow_work_executing_count); - very_slow = true; - } else if (!list_empty(&slow_work_queue)) { - work = list_entry(slow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - very_slow = false; - } else { - very_slow = false; /* avoid the compiler warning */ - } - - slow_work_set_thread_processing(id, work); - if (work) { - slow_work_mark_time(work); - slow_work_begin_exec(id, work); - } - - spin_unlock_irq(&slow_work_queue_lock); - - if (!work) - return false; - - if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) - BUG(); - - /* don't execute if the work is in the process of being cancelled */ - if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) - work->ops->execute(work); - - if (very_slow) - atomic_dec(&vslow_work_executing_count); - clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); - - /* wake up anyone waiting for this work to be complete */ - wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); - - slow_work_end_exec(id, work); - - /* if someone tried to enqueue the item whilst we were executing it, - * then it'll be left unenqueued to avoid multiple threads trying to - * execute it simultaneously - * - * there is, however, a race between us testing the pending flag and - * getting the spinlock, and between the enqueuer setting the pending - * flag and getting the spinlock, so we use a deferral bit to tell us - * if the enqueuer got there first - */ - if (test_bit(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irq(&slow_work_queue_lock); - - if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && - test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) - goto auto_requeue; - - spin_unlock_irq(&slow_work_queue_lock); - } - - /* sort out the race between module unloading and put_ref() */ - slow_work_put_ref(work); - slow_work_done_thread_processing(id, work); - - return true; - -auto_requeue: - /* we must complete the enqueue operation - * - we transfer our ref on the item back to the appropriate queue - * - don't wake another thread up as we're awake already - */ - slow_work_mark_time(work); - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) - list_add_tail(&work->link, &vslow_work_queue); - else - list_add_tail(&work->link, &slow_work_queue); - spin_unlock_irq(&slow_work_queue_lock); - slow_work_clear_thread_processing(id); - return true; -} - -/** - * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work - * work: The work item under execution that wants to sleep - * _timeout: Scheduler sleep timeout - * - * Allow a requeueable work item to sleep on a slow-work processor thread until - * that thread is needed to do some other work or the sleep is interrupted by - * some other event. - * - * The caller must set up a wake up event before calling this and must have set - * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own - * condition before calling this function as no test is made here. - * - * False is returned if there is nothing on the queue; true is returned if the - * work item should be requeued - */ -bool slow_work_sleep_till_thread_needed(struct slow_work *work, - signed long *_timeout) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - - DEFINE_WAIT(wait); - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - if (!list_empty(queue)) - return true; - - add_wait_queue_exclusive(wfo_wq, &wait); - if (list_empty(queue)) - *_timeout = schedule_timeout(*_timeout); - finish_wait(wfo_wq, &wait); - - return !list_empty(queue); -} -EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); - -/** - * slow_work_enqueue - Schedule a slow work item for processing - * @work: The work item to queue - * - * Schedule a slow work item for processing. If the item is already undergoing - * execution, this guarantees not to re-enter the execution routine until the - * first execution finishes. - * - * The item is pinned by this function as it retains a reference to it, managed - * through the item operations. The item is unpinned once it has been - * executed. - * - * An item may hog the thread that is running it for a relatively large amount - * of time, sufficient, for example, to perform several lookup, mkdir, create - * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. - * - * Conversely, if a number of items are awaiting processing, it may take some - * time before any given item is given attention. The number of threads in the - * pool may be increased to deal with demand, but only up to a limit. - * - * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in - * the very slow queue, from which only a portion of the threads will be - * allowed to pick items to execute. This ensures that very slow items won't - * overly block ones that are just ordinarily slow. - * - * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is - * attempted queued) - */ -int slow_work_enqueue(struct slow_work *work) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - unsigned long flags; - int ret; - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - /* when honouring an enqueue request, we only promise that we will run - * the work function in the future; we do not promise to run it once - * per enqueue request - * - * we use the PENDING bit to merge together repeat requests without - * having to disable IRQs and take the spinlock, whilst still - * maintaining our promise - */ - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) - goto cancelled; - - /* we promise that we will not attempt to execute the work - * function in more than one thread simultaneously - * - * this, however, leaves us with a problem if we're asked to - * enqueue the work whilst someone is executing the work - * function as simply queueing the work immediately means that - * another thread may try executing it whilst it is already - * under execution - * - * to deal with this, we set the ENQ_DEFERRED bit instead of - * enqueueing, and the thread currently executing the work - * function will enqueue the work item when the work function - * returns and it has cleared the EXECUTING bit - */ - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - } else { - ret = slow_work_get_ref(work); - if (ret < 0) - goto failed; - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - wake_up(&slow_work_thread_wq); - - /* if someone who could be requeued is sleeping on a - * thread, then ask them to yield their thread */ - if (work->link.prev == queue) - wake_up(wfo_wq); - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - return 0; - -cancelled: - ret = -ECANCELED; -failed: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(slow_work_enqueue); - -static int slow_work_wait(void *word) -{ - schedule(); - return 0; -} - -/** - * slow_work_cancel - Cancel a slow work item - * @work: The work item to cancel - * - * This function will cancel a previously enqueued work item. If we cannot - * cancel the work item, it is guarenteed to have run when this function - * returns. - */ -void slow_work_cancel(struct slow_work *work) -{ - bool wait = true, put = false; - - set_bit(SLOW_WORK_CANCELLING, &work->flags); - smp_mb(); - - /* if the work item is a delayed work item with an active timer, we - * need to wait for the timer to finish _before_ getting the spinlock, - * lest we deadlock against the timer routine - * - * the timer routine will leave DELAYED set if it notices the - * CANCELLING flag in time - */ - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - del_timer_sync(&dwork->timer); - } - - spin_lock_irq(&slow_work_queue_lock); - - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - /* the timer routine aborted or never happened, so we are left - * holding the timer's reference on the item and should just - * drop the pending flag and wait for any ongoing execution to - * finish */ - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - - BUG_ON(timer_pending(&dwork->timer)); - BUG_ON(!list_empty(&work->link)); - - clear_bit(SLOW_WORK_DELAYED, &work->flags); - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && - !list_empty(&work->link)) { - /* the link in the pending queue holds a reference on the item - * that we will need to release */ - list_del_init(&work->link); - wait = false; - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { - /* the executor is holding our only reference on the item, so - * we merely need to wait for it to finish executing */ - clear_bit(SLOW_WORK_PENDING, &work->flags); - } - - spin_unlock_irq(&slow_work_queue_lock); - - /* the EXECUTING flag is set by the executor whilst the spinlock is set - * and before the item is dequeued - so assuming the above doesn't - * actually dequeue it, simply waiting for the EXECUTING flag to be - * released here should be sufficient */ - if (wait) - wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, - TASK_UNINTERRUPTIBLE); - - clear_bit(SLOW_WORK_CANCELLING, &work->flags); - if (put) - slow_work_put_ref(work); -} -EXPORT_SYMBOL(slow_work_cancel); - -/* - * Handle expiry of the delay timer, indicating that a delayed slow work item - * should now be queued if not cancelled - */ -static void delayed_slow_work_timer(unsigned long data) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - struct slow_work *work = (struct slow_work *) data; - unsigned long flags; - bool queued = false, put = false, first = false; - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { - clear_bit(SLOW_WORK_DELAYED, &work->flags); - - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - /* we discard the reference the timer was holding in - * favour of the one the executor holds */ - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - put = true; - } else { - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - queued = true; - if (work->link.prev == queue) - first = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - if (put) - slow_work_put_ref(work); - if (first) - wake_up(wfo_wq); - if (queued) - wake_up(&slow_work_thread_wq); -} - -/** - * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing - * @dwork: The delayed work item to queue - * @delay: When to start executing the work, in jiffies from now - * - * This is similar to slow_work_enqueue(), but it adds a delay before the work - * is actually queued for processing. - * - * The item can have delayed processing requested on it whilst it is being - * executed. The delay will begin immediately, and if it expires before the - * item finishes executing, the item will be placed back on the queue when it - * has done executing. - */ -int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, - unsigned long delay) -{ - struct slow_work *work = &dwork->work; - unsigned long flags; - int ret; - - if (delay == 0) - return slow_work_enqueue(&dwork->work); - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - goto cancelled; - - /* the timer holds a reference whilst it is pending */ - ret = slow_work_get_ref(work); - if (ret < 0) - goto cant_get_ref; - - if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) - BUG(); - dwork->timer.expires = jiffies + delay; - dwork->timer.data = (unsigned long) work; - dwork->timer.function = delayed_slow_work_timer; - add_timer(&dwork->timer); - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - - return 0; - -cancelled: - ret = -ECANCELED; -cant_get_ref: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(delayed_slow_work_enqueue); - -/* - * Schedule a cull of the thread pool at some time in the near future - */ -static void slow_work_schedule_cull(void) -{ - mod_timer(&slow_work_cull_timer, - round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); -} - -/* - * Worker thread culling algorithm - */ -static bool slow_work_cull_thread(void) -{ - unsigned long flags; - bool do_cull = false; - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (slow_work_cull) { - slow_work_cull = false; - - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) { - slow_work_schedule_cull(); - do_cull = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return do_cull; -} - -/* - * Determine if there is slow work available for dispatch - */ -static inline bool slow_work_available(int vsmax) -{ - return !list_empty(&slow_work_queue) || - (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax); -} - -/* - * Worker thread dispatcher - */ -static int slow_work_thread(void *_data) -{ - int vsmax, id; - - DEFINE_WAIT(wait); - - set_freezable(); - set_user_nice(current, -5); - - /* allocate ourselves an ID */ - spin_lock_irq(&slow_work_queue_lock); - id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); - __set_bit(id, slow_work_ids); - slow_work_set_thread_pid(id, current->pid); - spin_unlock_irq(&slow_work_queue_lock); - - sprintf(current->comm, "kslowd%03u", id); - - for (;;) { - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, - TASK_INTERRUPTIBLE); - if (!freezing(current) && - !slow_work_threads_should_exit && - !slow_work_available(vsmax) && - !slow_work_cull) - schedule(); - finish_wait(&slow_work_thread_wq, &wait); - - try_to_freeze(); - - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - if (slow_work_available(vsmax) && slow_work_execute(id)) { - cond_resched(); - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) - slow_work_schedule_cull(); - continue; - } - - if (slow_work_threads_should_exit) - break; - - if (slow_work_cull && slow_work_cull_thread()) - break; - } - - spin_lock_irq(&slow_work_queue_lock); - slow_work_set_thread_pid(id, 0); - __clear_bit(id, slow_work_ids); - spin_unlock_irq(&slow_work_queue_lock); - - if (atomic_dec_and_test(&slow_work_thread_count)) - complete_and_exit(&slow_work_last_thread_exited, 0); - return 0; -} - -/* - * Handle thread cull timer expiration - */ -static void slow_work_cull_timeout(unsigned long data) -{ - slow_work_cull = true; - wake_up(&slow_work_thread_wq); -} - -/* - * Start a new slow work thread - */ -static void slow_work_new_thread_execute(struct slow_work *work) -{ - struct task_struct *p; - - if (slow_work_threads_should_exit) - return; - - if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) - return; - - if (!mutex_trylock(&slow_work_user_lock)) - return; - - slow_work_may_not_start_new_thread = true; - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) { - printk(KERN_DEBUG "Slow work thread pool: OOM\n"); - if (atomic_dec_and_test(&slow_work_thread_count)) - BUG(); /* we're running on a slow work thread... */ - mod_timer(&slow_work_oom_timer, - round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); - } else { - /* ratelimit the starting of new threads */ - mod_timer(&slow_work_oom_timer, jiffies + 1); - } - - mutex_unlock(&slow_work_user_lock); -} - -static const struct slow_work_ops slow_work_new_thread_ops = { - .owner = THIS_MODULE, - .execute = slow_work_new_thread_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = slow_work_new_thread_desc, -#endif -}; - -/* - * post-OOM new thread start suppression expiration - */ -static void slow_work_oom_timeout(unsigned long data) -{ - slow_work_may_not_start_new_thread = false; -} - -#ifdef CONFIG_SYSCTL -/* - * Handle adjustment of the minimum number of threads - */ -static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to start or stop threads */ - n = atomic_read(&slow_work_thread_count) - - slow_work_min_threads; - - if (n < 0 && !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - else if (n > 0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} - -/* - * Handle adjustment of the maximum number of threads - */ -static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to stop threads */ - n = slow_work_max_threads - - atomic_read(&slow_work_thread_count); - - if (n < 0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} -#endif /* CONFIG_SYSCTL */ - -/** - * slow_work_register_user - Register a user of the facility - * @module: The module about to make use of the facility - * - * Register a user of the facility, starting up the initial threads if there - * aren't any other users at this point. This will return 0 if successful, or - * an error if not. - */ -int slow_work_register_user(struct module *module) -{ - struct task_struct *p; - int loop; - - mutex_lock(&slow_work_user_lock); - - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); - init_completion(&slow_work_last_thread_exited); - - slow_work_threads_should_exit = false; - slow_work_init(&slow_work_new_thread, - &slow_work_new_thread_ops); - slow_work_may_not_start_new_thread = false; - slow_work_cull = false; - - /* start the minimum number of threads */ - for (loop = 0; loop < slow_work_min_threads; loop++) { - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) - goto error; - } - printk(KERN_NOTICE "Slow work thread pool: Ready\n"); - } - - slow_work_user_count++; - mutex_unlock(&slow_work_user_lock); - return 0; - -error: - if (atomic_dec_and_test(&slow_work_thread_count)) - complete(&slow_work_last_thread_exited); - if (loop > 0) { - printk(KERN_ERR "Slow work thread pool:" - " Aborting startup on ENOMEM\n"); - slow_work_threads_should_exit = true; - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_ERR "Slow work thread pool: Aborted\n"); - } - mutex_unlock(&slow_work_user_lock); - return PTR_ERR(p); -} -EXPORT_SYMBOL(slow_work_register_user); - -/* - * wait for all outstanding items from the calling module to complete - * - note that more items may be queued whilst we're waiting - */ -static void slow_work_wait_for_items(struct module *module) -{ -#ifdef CONFIG_MODULES - DECLARE_WAITQUEUE(myself, current); - struct slow_work *work; - int loop; - - mutex_lock(&slow_work_unreg_sync_lock); - add_wait_queue(&slow_work_unreg_wq, &myself); - - for (;;) { - spin_lock_irq(&slow_work_queue_lock); - - /* first of all, we wait for the last queued item in each list - * to be processed */ - list_for_each_entry_reverse(work, &vslow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - list_for_each_entry_reverse(work, &slow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - - /* then we wait for the items being processed to finish */ - slow_work_unreg_module = module; - smp_mb(); - for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { - if (slow_work_thread_processing[loop] == module) - goto do_wait; - } - spin_unlock_irq(&slow_work_queue_lock); - break; /* okay, we're done */ - - do_wait: - spin_unlock_irq(&slow_work_queue_lock); - schedule(); - slow_work_unreg_work_item = NULL; - slow_work_unreg_module = NULL; - } - - remove_wait_queue(&slow_work_unreg_wq, &myself); - mutex_unlock(&slow_work_unreg_sync_lock); -#endif /* CONFIG_MODULES */ -} - -/** - * slow_work_unregister_user - Unregister a user of the facility - * @module: The module whose items should be cleared - * - * Unregister a user of the facility, killing all the threads if this was the - * last one. - * - * This waits for all the work items belonging to the nominated module to go - * away before proceeding. - */ -void slow_work_unregister_user(struct module *module) -{ - /* first of all, wait for all outstanding items from the calling module - * to complete */ - if (module) - slow_work_wait_for_items(module); - - /* then we can actually go about shutting down the facility if need - * be */ - mutex_lock(&slow_work_user_lock); - - BUG_ON(slow_work_user_count <= 0); - - slow_work_user_count--; - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); - slow_work_threads_should_exit = true; - del_timer_sync(&slow_work_cull_timer); - del_timer_sync(&slow_work_oom_timer); - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_NOTICE "Slow work thread pool:" - " Shut down complete\n"); - } - - mutex_unlock(&slow_work_user_lock); -} -EXPORT_SYMBOL(slow_work_unregister_user); - -/* - * Initialise the slow work facility - */ -static int __init init_slow_work(void) -{ - unsigned nr_cpus = num_possible_cpus(); - - if (slow_work_max_threads < nr_cpus) - slow_work_max_threads = nr_cpus; -#ifdef CONFIG_SYSCTL - if (slow_work_max_max_threads < nr_cpus * 2) - slow_work_max_max_threads = nr_cpus * 2; -#endif -#ifdef CONFIG_SLOW_WORK_DEBUG - { - struct dentry *dbdir; - - dbdir = debugfs_create_dir("slow_work", NULL); - if (dbdir && !IS_ERR(dbdir)) - debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, - NULL, &slow_work_runqueue_fops); - } -#endif - return 0; -} - -subsys_initcall(init_slow_work); diff --git a/kernel/slow-work.h b/kernel/slow-work.h deleted file mode 100644 index a29ebd1ef41d..000000000000 --- a/kernel/slow-work.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Slow work private definitions - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of - * things to do */ -#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after - * OOM */ - -#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ - -/* - * slow-work.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern struct slow_work *slow_work_execs[]; -extern pid_t slow_work_pids[]; -extern rwlock_t slow_work_execs_lock; -#endif - -extern struct list_head slow_work_queue; -extern struct list_head vslow_work_queue; -extern spinlock_t slow_work_queue_lock; - -/* - * slow-work-debugfs.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern const struct file_operations slow_work_runqueue_fops; - -extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); -#endif - -/* - * Helper functions - */ -static inline void slow_work_set_thread_pid(int id, pid_t pid) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - slow_work_pids[id] = pid; -#endif -} - -static inline void slow_work_mark_time(struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - work->mark = CURRENT_TIME; -#endif -} - -static inline void slow_work_begin_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - slow_work_execs[id] = work; -#endif -} - -static inline void slow_work_end_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - write_lock(&slow_work_execs_lock); - slow_work_execs[id] = NULL; - write_unlock(&slow_work_execs_lock); -#endif -} diff --git a/kernel/softlockup.c b/kernel/softlockup.c deleted file mode 100644 index 4b493f67dcb5..000000000000 --- a/kernel/softlockup.c +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Detect Soft Lockups - * - * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc. - * - * this code detects soft lockups: incidents in where on a CPU - * the kernel does not reschedule for 10 seconds or more. - */ -#include <linux/mm.h> -#include <linux/cpu.h> -#include <linux/nmi.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/lockdep.h> -#include <linux/notifier.h> -#include <linux/module.h> -#include <linux/sysctl.h> - -#include <asm/irq_regs.h> - -static DEFINE_SPINLOCK(print_lock); - -static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ -static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ -static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); -static DEFINE_PER_CPU(bool, softlock_touch_sync); - -static int __read_mostly did_panic; -int __read_mostly softlockup_thresh = 60; - -/* - * Should we panic (and reboot, if panic_timeout= is set) when a - * soft-lockup occurs: - */ -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - -static int -softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = softlock_panic, -}; - -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. - */ -static unsigned long get_timestamp(int this_cpu) -{ - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void __touch_softlockup_watchdog(void) -{ - int this_cpu = raw_smp_processor_id(); - - __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu); -} - -void touch_softlockup_watchdog(void) -{ - __raw_get_cpu_var(softlockup_touch_ts) = 0; -} -EXPORT_SYMBOL(touch_softlockup_watchdog); - -void touch_softlockup_watchdog_sync(void) -{ - __raw_get_cpu_var(softlock_touch_sync) = true; - __raw_get_cpu_var(softlockup_touch_ts) = 0; -} - -void touch_all_softlockup_watchdogs(void) -{ - int cpu; - - /* Cause each CPU to re-update its timestamp rather than complain */ - for_each_online_cpu(cpu) - per_cpu(softlockup_touch_ts, cpu) = 0; -} -EXPORT_SYMBOL(touch_all_softlockup_watchdogs); - -int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} - -/* - * This callback runs from the timer interrupt, and checks - * whether the watchdog thread has hung or not: - */ -void softlockup_tick(void) -{ - int this_cpu = smp_processor_id(); - unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu); - unsigned long print_ts; - struct pt_regs *regs = get_irq_regs(); - unsigned long now; - - /* Is detection switched off? */ - if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) { - /* Be sure we don't false trigger if switched back on */ - if (touch_ts) - per_cpu(softlockup_touch_ts, this_cpu) = 0; - return; - } - - if (touch_ts == 0) { - if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { - /* - * If the time stamp was touched atomically - * make sure the scheduler tick is up to date. - */ - per_cpu(softlock_touch_sync, this_cpu) = false; - sched_clock_tick(); - } - __touch_softlockup_watchdog(); - return; - } - - print_ts = per_cpu(softlockup_print_ts, this_cpu); - - /* report at most once a second */ - if (print_ts == touch_ts || did_panic) - return; - - /* do not print during early bootup: */ - if (unlikely(system_state != SYSTEM_RUNNING)) { - __touch_softlockup_watchdog(); - return; - } - - now = get_timestamp(this_cpu); - - /* - * Wake up the high-prio watchdog task twice per - * threshold timespan. - */ - if (time_after(now - softlockup_thresh/2, touch_ts)) - wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); - - /* Warn about unreasonable delays: */ - if (time_before_eq(now - softlockup_thresh, touch_ts)) - return; - - per_cpu(softlockup_print_ts, this_cpu) = touch_ts; - - spin_lock(&print_lock); - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", - this_cpu, now - touch_ts, - current->comm, task_pid_nr(current)); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - spin_unlock(&print_lock); - - if (softlockup_panic) - panic("softlockup: hung tasks"); -} - -/* - * The watchdog thread - runs every second and touches the timestamp. - */ -static int watchdog(void *__bind_cpu) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - sched_setscheduler(current, SCHED_FIFO, ¶m); - - /* initialize timestamp */ - __touch_softlockup_watchdog(); - - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in softlockup_tick(). - */ - while (!kthread_should_stop()) { - __touch_softlockup_watchdog(); - schedule(); - - if (kthread_should_stop()) - break; - - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - - return 0; -} - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - BUG_ON(per_cpu(softlockup_watchdog, hotcpu)); - p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); - if (IS_ERR(p)) { - printk(KERN_ERR "watchdog for %i failed\n", hotcpu); - return NOTIFY_BAD; - } - per_cpu(softlockup_touch_ts, hotcpu) = 0; - per_cpu(softlockup_watchdog, hotcpu) = p; - kthread_bind(p, hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(softlockup_watchdog, hotcpu)); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(softlockup_watchdog, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(softlockup_watchdog, hotcpu), - cpumask_any(cpu_online_mask)); - case CPU_DEAD: - case CPU_DEAD_FROZEN: - p = per_cpu(softlockup_watchdog, hotcpu); - per_cpu(softlockup_watchdog, hotcpu) = NULL; - kthread_stop(p); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; - -static int __initdata nosoftlockup; - -static int __init nosoftlockup_setup(char *str) -{ - nosoftlockup = 1; - return 1; -} -__setup("nosoftlockup", nosoftlockup_setup); - -static int __init spawn_softlockup_task(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - if (nosoftlockup) - return 0; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - if (err == NOTIFY_BAD) { - BUG(); - return 1; - } - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); - - return 0; -} -early_initcall(spawn_softlockup_task); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d24f761f4876..6d850bf0a517 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,7 +50,6 @@ #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> -#include <linux/slow-work.h> #include <linux/perf_event.h> #include <linux/kprobes.h> #include <linux/pipe_fs_i.h> @@ -76,6 +75,10 @@ #include <scsi/sg.h> #endif +#ifdef CONFIG_LOCKUP_DETECTOR +#include <linux/nmi.h> +#endif + #if defined(CONFIG_SYSCTL) @@ -106,7 +109,7 @@ extern int blk_iopoll_enabled; #endif /* Constants used for minimum and maximum */ -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; static int neg_one = -1; #endif @@ -562,7 +565,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#ifdef CONFIG_HOTPLUG { .procname = "hotplug", .data = &uevent_helper, @@ -710,7 +713,34 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) +#if defined(CONFIG_LOCKUP_DETECTOR) + { + .procname = "watchdog", + .data = &watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dowatchdog_enabled, + }, + { + .procname = "watchdog_thresh", + .data = &softlockup_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dowatchdog_thresh, + .extra1 = &neg_one, + .extra2 = &sixty, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) { .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, @@ -813,26 +843,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DETECT_SOFTLOCKUP - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - .procname = "softlockup_thresh", - .data = &softlockup_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dosoftlockup_thresh, - .extra1 = &neg_one, - .extra2 = &sixty, - }, -#endif #ifdef CONFIG_DETECT_HUNG_TASK { .procname = "hung_task_panic", @@ -906,13 +916,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SLOW_WORK - { - .procname = "slow-work", - .mode = 0555, - .child = slow_work_sysctls, - }, -#endif #ifdef CONFIG_PERF_EVENTS { .procname = "perf_event_paranoid", diff --git a/kernel/time.c b/kernel/time.c index 848b1c2ab09a..ba9b338d1835 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) } EXPORT_SYMBOL(timespec_trunc); -#ifndef CONFIG_GENERIC_TIME -/* - * Simulate gettimeofday using do_gettimeofday which only allows a timeval - * and therefore only yields usec accuracy - */ -void getnstimeofday(struct timespec *tv) -{ - struct timeval x; - - do_gettimeofday(&x); - tv->tv_sec = x.tv_sec; - tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; -} -EXPORT_SYMBOL_GPL(getnstimeofday); -#endif - /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 95ed42951e0a..f06a8a365648 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -6,7 +6,7 @@ config TICK_ONESHOT config NO_HZ bool "Tickless System (Dynamic Ticks)" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables a tickless system: timer interrupts will @@ -15,7 +15,7 @@ config NO_HZ config HIGH_RES_TIMERS bool "High Resolution Timer Support" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables high resolution timer support. If your diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f08e99c1d561..c18d7efa1b4b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) return max_nsecs - (max_nsecs >> 5); } -#ifdef CONFIG_GENERIC_TIME +#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET /** * clocksource_select - Select the best clocksource available @@ -577,7 +577,7 @@ static void clocksource_select(void) } } -#else /* CONFIG_GENERIC_TIME */ +#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } @@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs) #define MAX_UPDATE_LENGTH 5 /* Seconds */ /** - * __clocksource_register_scale - Used to install new clocksources + * __clocksource_updatefreq_scale - Used update clocksource with new freq * @t: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale * - * Returns -EBUSY if registration fails, zero otherwise. + * This should only be called from the clocksource->enable() method. * * This *SHOULD NOT* be called directly! Please use the - * clocksource_register_hz() or clocksource_register_khz helper functions. + * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. */ -int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { - /* * Ideally we want to use some of the limits used in * clocksource_max_deferment, to provide a more informed @@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) NSEC_PER_SEC/scale, MAX_UPDATE_LENGTH*scale); cs->max_idle_ns = clocksource_max_deferment(cs); +} +EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); + +/** + * __clocksource_register_scale - Used to install new clocksources + * @t: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. + */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + + /* Intialize mult/shift and max_idle_ns */ + __clocksource_updatefreq_scale(cs, scale, freq); + /* Add clocksource to the clcoksource list */ mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_select(); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b3bafd5fc66d..48b2761b5668 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * Setup the next period for devices, which do not have * periodic mode. We read dev->next_event first and add to it - * when the event alrady expired. clockevents_program_event() + * when the event already expired. clockevents_program_event() * sets dev->next_event only when the event is really * programmed to the device. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 813993b5fb61..3e216e01bbd1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { + arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { @@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - if (select_nohz_load_balancer(1)) { - /* - * sched tick not stopped! - */ - cpumask_clear_cpu(cpu, nohz_cpu_mask); - goto out; - } + select_nohz_load_balancer(1); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -780,7 +774,6 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); - u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -790,10 +783,6 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); - offset = ktime_to_ns(tick_period) >> 1; - do_div(offset, num_possible_cpus()); - offset *= smp_processor_id(); - hrtimer_add_expires_ns(&ts->sched_timer, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index caf8d4d4f5c8..e14c839e9faa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); * - wall_to_monotonic is no longer the boot time, getboottime must be * used instead. */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); +static struct timespec xtime __attribute__ ((aligned (16))); +static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static struct timespec total_sleep_time; /* @@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } -#ifdef CONFIG_GENERIC_TIME - /** * timekeeping_forward_now - update clock to the current time * @@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv) timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock) tick_clock_notify(); } -#else /* GENERIC_TIME */ - -static inline void timekeeping_forward_now(void) { } - -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); - -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); - -#endif /* !GENERIC_TIME */ - /** * ktime_get_real - get the real (wall-) time in ktime_t format * @@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev) if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - xtime = timespec_add_safe(xtime, ts); + xtime = timespec_add(xtime, ts); wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); - total_sleep_time = timespec_add_safe(total_sleep_time, ts); + total_sleep_time = timespec_add(total_sleep_time, ts); } /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); @@ -784,10 +738,11 @@ void update_wall_time(void) return; clock = timekeeper.clock; -#ifdef CONFIG_GENERIC_TIME - offset = (clock->read(clock) - clock->cycle_last) & clock->mask; -#else + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = timekeeper.cycle_interval; +#else + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; @@ -856,7 +811,8 @@ void update_wall_time(void) } /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } /** @@ -887,7 +843,7 @@ EXPORT_SYMBOL_GPL(getboottime); */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add_safe(*ts, total_sleep_time); + *ts = timespec_add(*ts, total_sleep_time); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); @@ -902,6 +858,11 @@ struct timespec __current_kernel_time(void) return xtime; } +struct timespec __get_wall_to_monotonic(void) +{ + return wall_to_monotonic; +} + struct timespec current_kernel_time(void) { struct timespec now; diff --git a/kernel/timer.c b/kernel/timer.c index ee305c8d4e18..f1b8afe1ad86 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; /* * Note that all tvec_bases are 2 byte aligned and lower bit of - * base in timer_list is guaranteed to be zero. Use the LSB for - * the new flag to indicate whether the timer is deferrable + * base in timer_list is guaranteed to be zero. Use the LSB to + * indicate whether the timer is deferrable. + * + * A deferrable timer will work normally when the system is busy, but + * will not cause a CPU to come out of idle just to service it; instead, + * the timer will be serviced when the CPU eventually wakes up with a + * subsequent non-deferrable timer. */ #define TBASE_DEFERRABLE_FLAG (0x1) @@ -577,6 +582,19 @@ static void __init_timer(struct timer_list *timer, lockdep_init_map(&timer->lockdep_map, name, key, 0); } +void setup_deferrable_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key, + void (*function)(unsigned long), + unsigned long data) +{ + timer->function = function; + timer->data = data; + init_timer_on_stack_key(timer, name, key); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); + /** * init_timer_key - initialize a timer * @timer: the timer to be initialized @@ -679,12 +697,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - cpu = preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) + cpu = get_nohz_timer_target(); #endif new_base = per_cpu(tvec_bases, cpu); @@ -1289,7 +1303,6 @@ void run_local_timers(void) { hrtimer_run_queues(); raise_softirq(TIMER_SOFTIRQ); - softlockup_tick(); } /* @@ -1750,3 +1763,25 @@ unsigned long msleep_interruptible(unsigned int msecs) } EXPORT_SYMBOL(msleep_interruptible); + +static int __sched do_usleep_range(unsigned long min, unsigned long max) +{ + ktime_t kmin; + unsigned long delta; + + kmin = ktime_set(0, min * NSEC_PER_USEC); + delta = (max - min) * NSEC_PER_USEC; + return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); +} + +/** + * usleep_range - Drop in replacement for udelay where wakeup is flexible + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + */ +void usleep_range(unsigned long min, unsigned long max) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + do_usleep_range(min, max); +} +EXPORT_SYMBOL(usleep_range); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8b1797c4545b..538501c6ea50 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -153,7 +153,7 @@ config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n depends on TRACE_IRQFLAGS_SUPPORT - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET select TRACE_IRQFLAGS select GENERIC_TRACER select TRACER_MAX_TRACE @@ -175,7 +175,7 @@ config IRQSOFF_TRACER config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET depends on PREEMPT select GENERIC_TRACER select TRACER_MAX_TRACE @@ -194,15 +194,6 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) -config SYSPROF_TRACER - bool "Sysprof Tracer" - depends on X86 - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer provides the trace needed by the 'Sysprof' userspace - tool. - config SCHED_TRACER bool "Scheduling Latency Tracer" select GENERIC_TRACER @@ -229,23 +220,6 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. -config BOOT_TRACER - bool "Trace boot initcalls" - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer helps developers to optimize boot times: it records - the timings of the initcalls and traces key events and the identity - of tasks that can cause boot delays, such as context-switches. - - Its aim is to be parsed by the scripts/bootgraph.pl tool to - produce pretty graphics about boot inefficiencies, giving a visual - representation of the delays during initcalls - but the raw - /debug/tracing/trace text output is readable too. - - You must pass in initcall_debug and ftrace=initcall to the kernel - command line to enable this on bootup. - config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER @@ -325,28 +299,6 @@ config BRANCH_TRACER Say N if unsure. -config KSYM_TRACER - bool "Trace read and write access on kernel memory locations" - depends on HAVE_HW_BREAKPOINT - select TRACING - help - This tracer helps find read and write operations on any given kernel - symbol i.e. /proc/kallsyms. - -config PROFILE_KSYM_TRACER - bool "Profile all kernel memory accesses on 'watched' variables" - depends on KSYM_TRACER - help - This tracer profiles kernel accesses on variables watched through the - ksym tracer ftrace plugin. Depending upon the hardware, all read - and write operations on kernel variables can be monitored for - accesses. - - The results will be displayed in: - /debugfs/tracing/profile_ksym - - Say N if unsure. - config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER @@ -371,37 +323,6 @@ config STACK_TRACER Say N if unsure. -config KMEMTRACE - bool "Trace SLAB allocations" - select GENERIC_TRACER - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/trace/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. - -config WORKQUEUE_TRACER - bool "Trace workqueues" - select GENERIC_TRACER - help - The workqueue tracer provides some statistical information - about each cpu workqueue thread such as the number of the - works inserted and executed since their creation. It can help - to evaluate the amount of work each of them has to perform. - For example it can help a developer to decide whether he should - choose a per-cpu workqueue instead of a singlethreaded one. - config BLK_DEV_IO_TRACE bool "Support for tracing block IO actions" depends on SYSFS diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index ffb1a5b0550e..53f338190b26 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o -obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o @@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o -obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) @@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o +ifeq ($(CONFIG_TRACING),y) +obj-$(CONFIG_KGDB_KDB) += trace_kdb.o +endif libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d2cb14f9449..0d88ce9b9fb8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1883,7 +1883,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) struct hlist_head *hhd; struct hlist_node *n; unsigned long key; - int resched; key = hash_long(ip, FTRACE_HASH_BITS); @@ -1897,12 +1896,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) * period. This syncs the hash iteration and freeing of items * on the hash. rcu_read_lock is too dangerous here. */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); hlist_for_each_entry_rcu(entry, n, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_probe_ops __read_mostly = diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c deleted file mode 100644 index bbfc1bb1660b..000000000000 --- a/kernel/trace/kmemtrace.c +++ /dev/null @@ -1,529 +0,0 @@ -/* - * Memory allocator tracing - * - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi> - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - */ - -#include <linux/tracepoint.h> -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/dcache.h> -#include <linux/fs.h> - -#include <linux/kmemtrace.h> - -#include "trace_output.h" -#include "trace.h" - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_KMEM_OPT_MINIMAL 0x1 - -static struct tracer_opt kmem_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, - { } -}; - -static struct tracer_flags kmem_tracer_flags = { - .val = 0, - .opts = kmem_opts -}; - -static struct trace_array *kmemtrace_array; - -/* Trace allocations */ -static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - struct ftrace_event_call *call = &event_kmem_alloc; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_alloc_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_ALLOC; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static inline void kmemtrace_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ftrace_event_call *call = &event_kmem_free; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_free_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_FREE; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static void kmemtrace_kmalloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmem_cache_alloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmalloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void kmemtrace_kmem_cache_alloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void -kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); -} - -static void kmemtrace_kmem_cache_free(void *ignore, - unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); -} - -static int kmemtrace_start_probes(void) -{ - int err; - - err = register_trace_kmalloc(kmemtrace_kmalloc, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - if (err) - return err; - err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - if (err) - return err; - err = register_trace_kfree(kmemtrace_kfree, NULL); - if (err) - return err; - err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); - - return err; -} - -static void kmemtrace_stop_probes(void) -{ - unregister_trace_kmalloc(kmemtrace_kmalloc, NULL); - unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - unregister_trace_kfree(kmemtrace_kfree, NULL); - unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); -} - -static int kmem_trace_init(struct trace_array *tr) -{ - kmemtrace_array = tr; - - tracing_reset_online_cpus(tr); - - kmemtrace_start_probes(); - - return 0; -} - -static void kmem_trace_reset(struct trace_array *tr) -{ - kmemtrace_stop_probes(); -} - -static void kmemtrace_headers(struct seq_file *s) -{ - /* Don't need headers for the original kmemtrace output */ - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return; - - seq_printf(s, "#\n"); - seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " - " POINTER NODE CALLER\n"); - seq_printf(s, "# FREE | | | | " - " | | | |\n"); - seq_printf(s, "# |\n\n"); -} - -/* - * The following functions give the original output from kmemtrace, - * plus the origin CPU, since reordering occurs in-kernel now. - */ - -#define KMEMTRACE_USER_ALLOC 0 -#define KMEMTRACE_USER_FREE 1 - -struct kmemtrace_user_event { - u8 event_id; - u8 type_id; - u16 event_size; - u32 cpu; - u64 timestamp; - unsigned long call_site; - unsigned long ptr; -}; - -struct kmemtrace_user_event_alloc { - size_t bytes_req; - size_t bytes_alloc; - unsigned gfp_flags; - int node; -}; - -static enum print_line_t -kmemtrace_print_alloc(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", - entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, - (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, - (unsigned long)entry->gfp_flags, entry->node); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", - entry->type_id, (void *)entry->call_site, - (unsigned long)entry->ptr); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - struct kmemtrace_user_event *ev; - struct kmemtrace_user_event_alloc *ev_alloc; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_ALLOC; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); - if (!ev_alloc) - return TRACE_TYPE_PARTIAL_LINE; - - ev_alloc->bytes_req = entry->bytes_req; - ev_alloc->bytes_alloc = entry->bytes_alloc; - ev_alloc->gfp_flags = entry->gfp_flags; - ev_alloc->node = entry->node; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - struct kmemtrace_user_event *ev; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_FREE; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - return TRACE_TYPE_HANDLED; -} - -/* The two other following provide a more minimalistic output */ -static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter) -{ - struct kmemtrace_alloc_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Alloc entry */ - ret = trace_seq_printf(s, " + "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Requested */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Allocated */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Flags - * TODO: would be better to see the name of the GFP flag names - */ - ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Node and call site*/ - ret = trace_seq_printf(s, "%4d %pf\n", entry->node, - (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter) -{ - struct kmemtrace_free_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Free entry */ - ret = trace_seq_printf(s, " - "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip requested/allocated/flags */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip node and print call site*/ - ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return TRACE_TYPE_UNHANDLED; - - switch (entry->type) { - case TRACE_KMEM_ALLOC: - return kmemtrace_print_alloc_compress(iter); - case TRACE_KMEM_FREE: - return kmemtrace_print_free_compress(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -static struct trace_event_functions kmem_trace_alloc_funcs = { - .trace = kmemtrace_print_alloc, - .binary = kmemtrace_print_alloc_user, -}; - -static struct trace_event kmem_trace_alloc = { - .type = TRACE_KMEM_ALLOC, - .funcs = &kmem_trace_alloc_funcs, -}; - -static struct trace_event_functions kmem_trace_free_funcs = { - .trace = kmemtrace_print_free, - .binary = kmemtrace_print_free_user, -}; - -static struct trace_event kmem_trace_free = { - .type = TRACE_KMEM_FREE, - .funcs = &kmem_trace_free_funcs, -}; - -static struct tracer kmem_tracer __read_mostly = { - .name = "kmemtrace", - .init = kmem_trace_init, - .reset = kmem_trace_reset, - .print_line = kmemtrace_print_line, - .print_header = kmemtrace_headers, - .flags = &kmem_tracer_flags -}; - -void kmemtrace_init(void) -{ - /* earliest opportunity to start kmem tracing */ -} - -static int __init init_kmem_tracer(void) -{ - if (!register_ftrace_event(&kmem_trace_alloc)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (!register_ftrace_event(&kmem_trace_free)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (register_tracer(&kmem_tracer) != 0) { - pr_warning("Warning: could not register the kmem tracer\n"); - return 1; - } - - return 0; -} -device_initcall(init_kmem_tracer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1da7b6ea8b85..3632ce87674f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) */ struct ring_buffer_per_cpu { int cpu; + atomic_t record_disabled; struct ring_buffer *buffer; spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; @@ -462,7 +463,6 @@ struct ring_buffer_per_cpu { unsigned long read; u64 write_stamp; u64 read_stamp; - atomic_t record_disabled; }; struct ring_buffer { @@ -2242,8 +2242,6 @@ static void trace_recursive_unlock(void) #endif -static DEFINE_PER_CPU(int, rb_need_resched); - /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from @@ -2264,13 +2262,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; /* If we are tracing schedule, we don't want to recurse */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) goto out_nocheck; @@ -2295,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (!event) goto out; - /* - * Need to store resched state on this cpu. - * Only the first needs to. - */ - - if (preempt_count() == 1) - per_cpu(rb_need_resched, cpu) = resched; - return event; out: trace_recursive_unlock(); out_nocheck: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); @@ -2355,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. - */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return 0; } @@ -2469,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. - */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); } EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); @@ -2501,12 +2479,12 @@ int ring_buffer_write(struct ring_buffer *buffer, struct ring_buffer_event *event; void *body; int ret = -EBUSY; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) goto out; @@ -2536,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 086d36316805..ba14a22be4cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void) preempt_enable(); } -static cpumask_var_t __read_mostly tracing_buffer_mask; - -#define for_each_tracing_cpu(cpu) \ - for_each_cpu(cpu, tracing_buffer_mask) +cpumask_var_t __read_mostly tracing_buffer_mask; /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops @@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -428,6 +425,7 @@ static const char *trace_options[] = { "latency-format", "sleep-time", "graph-time", + "record-cmd", NULL }; @@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; @@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } + arch_spin_lock(&ftrace_max_lock); ftrace_disable_cpu(); @@ -729,18 +736,11 @@ __acquires(kernel_lock) return -1; } - if (strlen(type->name) > MAX_TRACER_SIZE) { + if (strlen(type->name) >= MAX_TRACER_SIZE) { pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); return -1; } - /* - * When this gets called we hold the BKL which means that - * preemption is disabled. Various trace selftests however - * need to disable and enable preemption for successful tests. - * So we drop the BKL here and grab it after the tests again. - */ - unlock_kernel(); mutex_lock(&trace_types_lock); tracing_selftest_running = true; @@ -822,7 +822,6 @@ __acquires(kernel_lock) #endif out_unlock: - lock_kernel(); return ret; } @@ -1331,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) #endif /* CONFIG_STACKTRACE */ -static void -ftrace_trace_special(void *__tr, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - int pc) -{ - struct ftrace_event_call *call = &event_special; - struct ring_buffer_event *event; - struct trace_array *tr = __tr; - struct ring_buffer *buffer = tr->buffer; - struct special_entry *entry; - - event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL, - sizeof(*entry), 0, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->arg1 = arg1; - entry->arg2 = arg2; - entry->arg3 = arg3; - - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); -} - -void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - int cpu; - int pc; - - if (tracing_disabled) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (likely(atomic_inc_return(&data->disabled) == 1)) - ftrace_trace_special(tr, arg1, arg2, arg3, pc); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - /** * trace_vbprintk - write binary msg to tracing buffer * @@ -1404,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct bprint_entry *entry; unsigned long flags; int disable; - int resched; int cpu, len = 0, size, pc; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -1414,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pause_graph_tracing(); pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -1452,7 +1395,7 @@ out_unlock: out: atomic_dec_return(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -1539,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(trace_vprintk); -enum trace_file_type { - TRACE_FILE_LAT_FMT = 1, - TRACE_FILE_ANNOTATE = 2, -}; - static void trace_iterator_increment(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ @@ -1641,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, } /* Find the next real entry, and increment the iterator to the next entry */ -static void *find_next_entry_inc(struct trace_iterator *iter) +void *trace_find_next_entry_inc(struct trace_iterator *iter) { iter->ent = __find_next_entry(iter, &iter->cpu, &iter->lost_events, &iter->ts); @@ -1676,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) return NULL; if (iter->idx < 0) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); else ent = iter; while (ent && iter->idx < i) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); iter->pos = *pos; return ent; } -static void tracing_iter_reset(struct trace_iterator *iter, int cpu) +void tracing_iter_reset(struct trace_iterator *iter, int cpu) { struct trace_array *tr = iter->tr; struct ring_buffer_event *event; @@ -2049,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter) } /* Called with trace_event_read_lock() held. */ -static enum print_line_t print_trace_line(struct trace_iterator *iter) +enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; @@ -2394,6 +2332,7 @@ static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .release = seq_release, + .llseek = seq_lseek, }; /* @@ -2487,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = { .open = tracing_open_generic, .read = tracing_cpumask_read, .write = tracing_cpumask_write, + .llseek = generic_file_llseek, }; static int tracing_trace_options_show(struct seq_file *m, void *v) @@ -2562,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) trace_flags |= mask; else trace_flags &= ~mask; + + if (mask == TRACE_ITER_RECORD_CMD) + trace_event_enable_cmd_record(enabled); } static ssize_t @@ -2653,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_readme_fops = { .open = tracing_open_generic, .read = tracing_readme_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2703,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, static const struct file_operations tracing_saved_cmdlines_fops = { .open = tracing_open_generic, .read = tracing_saved_cmdlines_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2798,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size) if (ret < 0) return ret; + if (!current_trace->use_max_tr) + goto out; + ret = ring_buffer_resize(max_tr.buffer, size); if (ret < 0) { int r; @@ -2825,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } + max_tr.entries = size; + out: global_trace.entries = size; return ret; } + /** * tracing_update_buffers - used by tracing facility to expand ring buffers * @@ -2890,12 +2841,26 @@ static int tracing_set_tracer(const char *buf) trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); - + if (current_trace && current_trace->use_max_tr) { + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(max_tr.buffer, 1); + max_tr.entries = 1; + } destroy_trace_option_files(topts); current_trace = t; topts = create_trace_option_files(current_trace); + if (current_trace->use_max_tr) { + ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); + if (ret < 0) + goto out; + max_tr.entries = global_trace.entries; + } if (t->init) { ret = tracer_init(t, tr); @@ -3032,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (iter->trace->pipe_open) iter->trace->pipe_open(iter); + nonseekable_open(inode, filp); out: mutex_unlock(&trace_types_lock); return ret; @@ -3211,7 +3177,7 @@ waitagain: trace_event_read_lock(); trace_access_lock(iter->cpu_file); - while (find_next_entry_inc(iter) != NULL) { + while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3294,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); rem -= count; - if (!find_next_entry_inc(iter)) { + if (!trace_find_next_entry_inc(iter)) { rem = 0; iter->ent = NULL; break; @@ -3350,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (ret <= 0) goto out_err; - if (!iter->ent && !find_next_entry_inc(iter)) { + if (!iter->ent && !trace_find_next_entry_inc(iter)) { ret = -EFAULT; goto out_err; } @@ -3477,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } tracing_start(); - max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); return cnt; @@ -3590,18 +3555,21 @@ static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_ctrl_fops = { .open = tracing_open_generic, .read = tracing_ctrl_read, .write = tracing_ctrl_write, + .llseek = generic_file_llseek, }; static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, .read = tracing_set_trace_read, .write = tracing_set_trace_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_pipe_fops = { @@ -3610,17 +3578,20 @@ static const struct file_operations tracing_pipe_fops = { .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, + .llseek = no_llseek, }; static const struct file_operations tracing_entries_fops = { .open = tracing_open_generic, .read = tracing_entries_read, .write = tracing_entries_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, + .llseek = generic_file_llseek, }; static const struct file_operations trace_clock_fops = { @@ -3926,6 +3897,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_stats_fops = { .open = tracing_open_generic, .read = tracing_stats_read, + .llseek = generic_file_llseek, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -3962,6 +3934,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, static const struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, .read = tracing_read_dyn_info, + .llseek = generic_file_llseek, }; #endif @@ -4115,6 +4088,7 @@ static const struct file_operations trace_options_fops = { .open = tracing_open_generic, .read = trace_options_read, .write = trace_options_write, + .llseek = generic_file_llseek, }; static ssize_t @@ -4166,6 +4140,7 @@ static const struct file_operations trace_options_core_fops = { .open = tracing_open_generic, .read = trace_options_core_read, .write = trace_options_core_write, + .llseek = generic_file_llseek, }; struct dentry *trace_create_file(const char *name, @@ -4355,9 +4330,6 @@ static __init int tracer_init_debugfs(void) trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif -#ifdef CONFIG_SYSPROF_TRACER - init_tracer_sysprof_debugfs(d_tracer); -#endif create_trace_options_dir(); @@ -4414,7 +4386,7 @@ static struct notifier_block trace_die_notifier = { */ #define KERN_TRACE KERN_EMERG -static void +void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ @@ -4429,6 +4401,13 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } +void trace_init_global_iter(struct trace_iterator *iter) +{ + iter->tr = &global_trace; + iter->trace = current_trace; + iter->cpu_file = TRACE_PIPE_ALL_CPU; +} + static void __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) { @@ -4454,8 +4433,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (disable_tracing) ftrace_kill(); + trace_init_global_iter(&iter); + for_each_tracing_cpu(cpu) { - atomic_inc(&global_trace.data[cpu]->disabled); + atomic_inc(&iter.tr->data[cpu]->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -4504,7 +4485,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) iter.iter_flags |= TRACE_FILE_LAT_FMT; iter.pos = -1; - if (find_next_entry_inc(&iter) != NULL) { + if (trace_find_next_entry_inc(&iter) != NULL) { int ret; ret = print_trace_line(&iter); @@ -4526,7 +4507,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&global_trace.data[cpu]->disabled); + atomic_dec(&iter.tr->data[cpu]->disabled); } tracing_on(); } @@ -4575,16 +4556,14 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); ring_buffer_free(global_trace.buffer); goto out_free_cpumask; } - max_tr.entries = ring_buffer_size(max_tr.buffer); - WARN_ON(max_tr.entries != global_trace.entries); + max_tr.entries = 1; #endif /* Allocate the first page for all buffers */ @@ -4597,9 +4576,6 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); current_trace = &nop_trace; -#ifdef CONFIG_BOOT_TRACER - register_tracer(&boot_tracer); -#endif /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cd96399463f..d39b3c5454a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,10 +9,7 @@ #include <linux/mmiotrace.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> -#include <trace/boot.h> -#include <linux/kmemtrace.h> #include <linux/hw_breakpoint.h> - #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -25,30 +22,17 @@ enum trace_type { TRACE_STACK, TRACE_PRINT, TRACE_BPRINT, - TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BRANCH, - TRACE_BOOT_CALL, - TRACE_BOOT_RET, TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_KMEM_ALLOC, - TRACE_KMEM_FREE, TRACE_BLK, - TRACE_KSYM, __TRACE_LAST_TYPE, }; -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; - -extern struct tracer boot_tracer; #undef __field #define __field(type, item) type item; @@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ - IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ - IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ - IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ - TRACE_KMEM_ALLOC); \ - IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ - TRACE_KMEM_FREE); \ - IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) @@ -298,6 +274,7 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; + int use_max_tr; }; @@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name, const struct file_operations *fops); struct dentry *tracing_init_dentry(void); -void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; @@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); +int trace_empty(struct trace_iterator *iter); + +void *trace_find_next_entry_inc(struct trace_iterator *iter); + +void trace_init_global_iter(struct trace_iterator *iter); + +void tracing_iter_reset(struct trace_iterator *iter, int cpu); + void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); @@ -355,11 +339,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, struct task_struct *cur, unsigned long flags, int pc); -void trace_special(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, int pc); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -380,8 +359,15 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, +}; + +extern cpumask_var_t __read_mostly tracing_buffer_mask; -extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); +#define for_each_tracing_cpu(cpu) \ + for_each_cpu(cpu, tracing_buffer_mask) extern unsigned long nsecs_to_usecs(unsigned long nsecs); @@ -452,12 +438,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_sysprof(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_ksym(struct tracer *trace, - struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -471,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); +void trace_printk_seq(struct trace_seq *s); +enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; @@ -617,6 +601,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x20000, TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, + TRACE_ITER_RECORD_CMD = 0x100000, }; /* @@ -628,54 +613,6 @@ enum trace_iterator_flags { extern struct tracer nop_trace; -/** - * ftrace_preempt_disable - disable preemption scheduler safe - * - * When tracing can happen inside the scheduler, there exists - * cases that the tracing might happen before the need_resched - * flag is checked. If this happens and the tracer calls - * preempt_enable (after a disable), a schedule might take place - * causing an infinite recursion. - * - * To prevent this, we read the need_resched flag before - * disabling preemption. When we want to enable preemption we - * check the flag, if it is set, then we call preempt_enable_no_resched. - * Otherwise, we call preempt_enable. - * - * The rational for doing the above is that if need_resched is set - * and we have yet to reschedule, we are either in an atomic location - * (where we do not need to check for scheduling) or we are inside - * the scheduler and do not want to resched. - */ -static inline int ftrace_preempt_disable(void) -{ - int resched; - - resched = need_resched(); - preempt_disable_notrace(); - - return resched; -} - -/** - * ftrace_preempt_enable - enable preemption scheduler safe - * @resched: the return value from ftrace_preempt_disable - * - * This is a scheduler safe way to enable preemption and not miss - * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we are either inside an atomic or - * are inside the scheduler (we would have already scheduled - * otherwise). In this case, we do not want to call normal - * preempt_enable, but preempt_enable_no_resched instead. - */ -static inline void ftrace_preempt_enable(int resched) -{ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); -} - #ifdef CONFIG_BRANCH_TRACER extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); @@ -766,6 +703,8 @@ struct filter_pred { int pop_n; }; +extern struct list_head ftrace_common_fields; + extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, @@ -795,6 +734,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } +extern void trace_event_enable_cmd_record(bool enable); + extern struct mutex event_mutex; extern struct list_head ftrace_events; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c deleted file mode 100644 index c21d5f3956ad..000000000000 --- a/kernel/trace/trace_boot.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - * ring buffer based initcalls tracer - * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - -#include <linux/init.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/kallsyms.h> -#include <linux/time.h> - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *boot_trace; -static bool pre_initcalls_finished; - -/* Tells the boot tracer that the pre_smp_initcalls are finished. - * So we are ready . - * It doesn't enable sched events tracing however. - * You have to call enable_boot_trace to do so. - */ -void start_boot_trace(void) -{ - pre_initcalls_finished = true; -} - -void enable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_start_sched_switch_record(); -} - -void disable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_stop_sched_switch_record(); -} - -static int boot_trace_init(struct trace_array *tr) -{ - boot_trace = tr; - - if (!tr) - return 0; - - tracing_reset_online_cpus(tr); - - tracing_sched_switch_assign_trace(tr); - return 0; -} - -static enum print_line_t -initcall_call_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_call *field; - struct boot_trace_call *call; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - call = &field->boot_call; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", - (unsigned long)ts, nsec_rem, call->func, call->caller); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -initcall_ret_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_ret *field; - struct boot_trace_ret *init_ret; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - init_ret = &field->boot_ret; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " - "returned %d after %llu msecs\n", - (unsigned long) ts, - nsec_rem, - init_ret->func, init_ret->result, init_ret->duration); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t initcall_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - switch (entry->type) { - case TRACE_BOOT_CALL: - return initcall_call_print_line(iter); - case TRACE_BOOT_RET: - return initcall_ret_print_line(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -struct tracer boot_tracer __read_mostly = -{ - .name = "initcall", - .init = boot_trace_init, - .reset = tracing_reset_online_cpus, - .print_line = initcall_print_line, -}; - -void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_call; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_call *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - /* Get its name now since this function could - * disappear because it is in the .init section. - */ - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_call = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_ret; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_ret *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_ret = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 9d589d8dcd1a..685a67d55db0 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -32,16 +32,15 @@ u64 notrace trace_clock_local(void) { u64 clock; - int resched; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); clock = sched_clock(); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return clock; } @@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void) */ u64 notrace trace_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index dc008c1240da..e3dfecaf13e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, ); /* - * Special (free-form) trace entry: - */ -FTRACE_ENTRY(special, special_entry, - - TRACE_SPECIAL, - - F_STRUCT( - __field( unsigned long, arg1 ) - __field( unsigned long, arg2 ) - __field( unsigned long, arg3 ) - ), - - F_printk("(%08lx) (%08lx) (%08lx)", - __entry->arg1, __entry->arg2, __entry->arg3) -); - -/* * Stack-trace entry: */ @@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, __entry->map_id, __entry->opcode) ); -FTRACE_ENTRY(boot_call, trace_boot_call, - - TRACE_BOOT_CALL, - - F_STRUCT( - __field_struct( struct boot_trace_call, boot_call ) - __field_desc( pid_t, boot_call, caller ) - __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) - ), - - F_printk("%d %s", __entry->caller, __entry->func) -); - -FTRACE_ENTRY(boot_ret, trace_boot_ret, - - TRACE_BOOT_RET, - - F_STRUCT( - __field_struct( struct boot_trace_ret, boot_ret ) - __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) - __field_desc( int, boot_ret, result ) - __field_desc( unsigned long, boot_ret, duration ) - ), - - F_printk("%s %d %lx", - __entry->func, __entry->result, __entry->duration) -); #define TRACE_FUNC_SIZE 30 #define TRACE_FILE_SIZE 20 @@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, - - TRACE_KMEM_ALLOC, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" - " flags:%x node:%d", - __entry->type_id, __entry->call_site, __entry->ptr, - __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags, __entry->node) -); - -FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, - - TRACE_KMEM_FREE, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - F_printk("type:%u call_site:%lx ptr:%p", - __entry->type_id, __entry->call_site, __entry->ptr) -); - -FTRACE_ENTRY(ksym_trace, ksym_trace_entry, - - TRACE_KSYM, - - F_STRUCT( - __field( unsigned long, ip ) - __field( unsigned char, type ) - __array( char , cmd, TASK_COMM_LEN ) - __field( unsigned long, addr ) - ), - - F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", - (void *)__entry->ip, (unsigned int)__entry->type, - (void *)__entry->addr, __entry->cmd) -); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 8a2b73f7c068..000e6e85b445 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,8 +9,6 @@ #include <linux/kprobes.h> #include "trace.h" -EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); - static char *perf_trace_buf[4]; /* @@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, } } - if (tp_event->class->reg) - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); - else - ret = tracepoint_probe_register(tp_event->name, - tp_event->class->perf_probe, - tp_event); - + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); if (ret) goto fail; @@ -96,9 +88,7 @@ int perf_trace_init(struct perf_event *p_event) mutex_lock(&event_mutex); list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && - tp_event->class && - (tp_event->class->perf_probe || - tp_event->class->reg) && + tp_event->class && tp_event->class->reg && try_module_get(tp_event->mod)) { ret = perf_trace_event_init(tp_event, p_event); break; @@ -138,18 +128,13 @@ void perf_trace_destroy(struct perf_event *p_event) if (--tp_event->perf_refcount > 0) goto out; - if (tp_event->class->reg) - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - else - tracepoint_probe_unregister(tp_event->name, - tp_event->class->perf_probe, - tp_event); + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); /* - * Ensure our callback won't be called anymore. See - * tracepoint_probe_unregister() and __DO_TRACE(). + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. */ - synchronize_sched(); + tracepoint_synchronize_unregister(); free_percpu(tp_event->perf_events); tp_event->perf_events = NULL; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53cffc0b0801..09b4fa6e4d3b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -28,6 +28,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +LIST_HEAD(ftrace_common_fields); struct list_head * trace_get_fields(struct ftrace_event_call *event_call) @@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call) return event_call->class->get_fields(event_call); } -int trace_define_field(struct ftrace_event_call *call, const char *type, - const char *name, int offset, int size, int is_signed, - int filter_type) +static int __trace_define_field(struct list_head *head, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) { struct ftrace_event_field *field; - struct list_head *head; - - if (WARN_ON(!call->class)) - return 0; field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) @@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, field->size = size; field->is_signed = is_signed; - head = trace_get_fields(call); list_add(&field->link, head); return 0; @@ -80,17 +76,32 @@ err: return -ENOMEM; } + +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type); +} EXPORT_SYMBOL_GPL(trace_define_field); #define __common_field(type, item) \ - ret = trace_define_field(call, #type, "common_" #item, \ - offsetof(typeof(ent), item), \ - sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER); \ + ret = __trace_define_field(&ftrace_common_fields, #type, \ + "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; -static int trace_define_common_fields(struct ftrace_event_call *call) +static int trace_define_common_fields(void) { int ret; struct trace_entry ent; @@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); +int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return tracepoint_probe_register(call->name, + call->class->probe, + call); + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->probe, + call); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return tracepoint_probe_register(call->name, + call->class->perf_probe, + call); + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->perf_probe, + call); + return 0; +#endif + } + return 0; +} +EXPORT_SYMBOL_GPL(ftrace_event_reg); + +void trace_event_enable_cmd_record(bool enable) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } else { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + } + mutex_unlock(&event_mutex); +} + static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, case 0: if (call->flags & TRACE_EVENT_FL_ENABLED) { call->flags &= ~TRACE_EVENT_FL_ENABLED; - tracing_stop_cmdline_record(); - if (call->class->reg) - call->class->reg(call, TRACE_REG_UNREGISTER); - else - tracepoint_probe_unregister(call->name, - call->class->probe, - call); + if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + call->class->reg(call, TRACE_REG_UNREGISTER); } break; case 1: if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { - tracing_start_cmdline_record(); - if (call->class->reg) - ret = call->class->reg(call, TRACE_REG_REGISTER); - else - ret = tracepoint_probe_register(call->name, - call->class->probe, - call); + if (trace_flags & TRACE_ITER_RECORD_CMD) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } + ret = call->class->reg(call, TRACE_REG_REGISTER); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " @@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || - (!call->class->probe && !call->class->reg)) + if (!call->name || !call->class || !call->class->reg) continue; if (match && @@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && (call->class->probe || call->class->reg)) + if (call->class && call->class->reg) return call; } @@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || - (!call->class->probe && !call->class->reg)) + if (!call->name || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system) != 0) @@ -544,32 +598,10 @@ out: return ret; } -static ssize_t -event_format_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +static void print_event_fields(struct trace_seq *s, struct list_head *head) { - struct ftrace_event_call *call = filp->private_data; struct ftrace_event_field *field; - struct list_head *head; - struct trace_seq *s; - int common_field_count = 5; - char *buf; - int r = 0; - - if (*ppos) - return 0; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - trace_seq_init(s); - - trace_seq_printf(s, "name: %s\n", call->name); - trace_seq_printf(s, "ID: %d\n", call->event.type); - trace_seq_printf(s, "format:\n"); - head = trace_get_fields(call); list_for_each_entry_reverse(field, head, link) { /* * Smartly shows the array type(except dynamic array). @@ -584,29 +616,54 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, array_descriptor = NULL; if (!array_descriptor) { - r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" + trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" "\tsize:%u;\tsigned:%d;\n", field->type, field->name, field->offset, field->size, !!field->is_signed); } else { - r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" + trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" "\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, array_descriptor, field->offset, field->size, !!field->is_signed); } + } +} - if (--common_field_count == 0) - r = trace_seq_printf(s, "\n"); +static ssize_t +event_format_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct list_head *head; + struct trace_seq *s; + char *buf; + int r; - if (!r) - break; - } + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + trace_seq_printf(s, "name: %s\n", call->name); + trace_seq_printf(s, "ID: %d\n", call->event.type); + trace_seq_printf(s, "format:\n"); + + /* print common fields */ + print_event_fields(s, &ftrace_common_fields); - if (r) - r = trace_seq_printf(s, "\nprint fmt: %s\n", - call->print_fmt); + trace_seq_putc(s, '\n'); + + /* print event specific fields */ + head = trace_get_fields(call); + print_event_fields(s, head); + + r = trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt); if (!r) { /* @@ -963,35 +1020,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return -1; } - if (call->class->probe || call->class->reg) + if (call->class->reg) trace_create_file("enable", 0644, call->dir, call, enable); #ifdef CONFIG_PERF_EVENTS - if (call->event.type && (call->class->perf_probe || call->class->reg)) + if (call->event.type && call->class->reg) trace_create_file("id", 0444, call->dir, call, id); #endif - if (call->class->define_fields) { - /* - * Other events may have the same class. Only update - * the fields if they are not already defined. - */ - head = trace_get_fields(call); - if (list_empty(head)) { - ret = trace_define_common_fields(call); - if (!ret) - ret = call->class->define_fields(call); - if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; - } + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + ret = call->class->define_fields(call); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; } - trace_create_file("filter", 0644, call->dir, call, - filter); } + trace_create_file("filter", 0644, call->dir, call, + filter); trace_create_file("format", 0444, call->dir, call, format); @@ -999,11 +1052,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return 0; } -static int __trace_add_event_call(struct ftrace_event_call *call) +static int +__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) { struct dentry *d_events; int ret; + /* The linker may leave blanks */ if (!call->name) return -EINVAL; @@ -1011,8 +1070,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call) ret = call->class->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "events/%s\n", call->name); + pr_warning("Could not initialize trace events/%s\n", + call->name); return ret; } } @@ -1021,11 +1080,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call) if (!d_events) return -ENOENT; - ret = event_create_dir(call, d_events, &ftrace_event_id_fops, - &ftrace_enable_fops, &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(call, d_events, id, enable, filter, format); if (!ret) list_add(&call->list, &ftrace_events); + call->mod = mod; return ret; } @@ -1035,7 +1093,10 @@ int trace_add_event_call(struct ftrace_event_call *call) { int ret; mutex_lock(&event_mutex); - ret = __trace_add_event_call(call); + ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); mutex_unlock(&event_mutex); return ret; } @@ -1152,8 +1213,6 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; - struct dentry *d_events; - int ret; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1161,38 +1220,14 @@ static void trace_module_add_events(struct module *mod) if (start == end) return; - d_events = event_trace_events_dir(); - if (!d_events) + file_ops = trace_create_file_ops(mod); + if (!file_ops) return; for_each_event(call, start, end) { - /* The linker may leave blanks */ - if (!call->name) - continue; - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "point events/%s\n", call->name); - continue; - } - } - /* - * This module has events, create file ops for this module - * if not already done. - */ - if (!file_ops) { - file_ops = trace_create_file_ops(mod); - if (!file_ops) - return; - } - call->mod = mod; - ret = event_create_dir(call, d_events, + __trace_add_event_call(call, mod, &file_ops->id, &file_ops->enable, &file_ops->filter, &file_ops->format); - if (!ret) - list_add(&call->list, &ftrace_events); } } @@ -1319,25 +1354,14 @@ static __init int event_trace_init(void) trace_create_file("enable", 0644, d_events, NULL, &ftrace_system_enable_fops); + if (trace_define_common_fields()) + pr_warning("tracing: Failed to allocate common fields"); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - /* The linker may leave blanks */ - if (!call->name) - continue; - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "point events/%s\n", call->name); - continue; - } - } - ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + __trace_add_event_call(call, NULL, &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); - if (!ret) - list_add(&call->list, &ftrace_events); } while (true) { @@ -1524,12 +1548,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) struct ftrace_entry *entry; unsigned long flags; long disabled; - int resched; int cpu; int pc; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); @@ -1551,7 +1574,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __initdata = diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 57bb1bb32999..36d40104b17f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system, } static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) +__find_event_field(struct list_head *head, char *name) { struct ftrace_event_field *field; - struct list_head *head; - head = trace_get_fields(call); list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; @@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } +static struct ftrace_event_field * +find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + static void filter_free_pred(struct filter_pred *pred) { if (!pred) @@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system) int err; list_for_each_entry(call, &ftrace_events, list) { - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; @@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; @@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system, list_for_each_entry(call, &ftrace_events, list) { struct event_filter *filter = call->filter; - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 8536e2a65969..4ba44deaac25 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #include "trace_entries.h" -static int ftrace_raw_init_event(struct ftrace_event_call *call) -{ - INIT_LIST_HEAD(&call->class->fields); - return 0; -} - #undef __entry #define __entry REC @@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ - .raw_init = ftrace_raw_init_event, \ + .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ }; \ \ struct ftrace_event_call __used \ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b3f3776b0cd6..16aee4d44e8f 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int cpu, resched; + int cpu; int pc; if (unlikely(!ftrace_function_enabled)) return; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) trace_function(tr, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static void diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 79f4bac99a94..6bff23625781 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -641,7 +641,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { - snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); + snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", + nsecs_rem); ret = trace_seq_printf(s, ".%s", nsecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 6fd486e0cef4..73a6b0601f2e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptirqsoff(trace) register_tracer(&trace) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c new file mode 100644 index 000000000000..7b8ecd751d93 --- /dev/null +++ b/kernel/trace/trace_kdb.c @@ -0,0 +1,136 @@ +/* + * kdb helper for dumping the ftrace buffer + * + * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com> + * + * ftrace_dump_buf based on ftrace_dump: + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + */ +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/ftrace.h> + +#include "../debug/kdb/kdb_private.h" +#include "trace.h" +#include "trace_output.h" + +static void ftrace_dump_buf(int skip_lines, long cpu_file) +{ + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + unsigned int old_userobj; + int cnt = 0, cpu; + + trace_init_global_iter(&iter); + + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + old_userobj = trace_flags; + + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + kdb_printf("Dumping ftrace buffer:\n"); + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) { + iter.buffer_iter[cpu] = + ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_start(iter.buffer_iter[cpu]); + tracing_iter_reset(&iter, cpu); + } + } else { + iter.cpu_file = cpu_file; + iter.buffer_iter[cpu_file] = + ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_start(iter.buffer_iter[cpu_file]); + tracing_iter_reset(&iter, cpu_file); + } + if (!trace_empty(&iter)) + trace_find_next_entry_inc(&iter); + while (!trace_empty(&iter)) { + if (!cnt) + kdb_printf("---------------------------------\n"); + cnt++; + + if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) + print_trace_line(&iter); + if (!skip_lines) + trace_printk_seq(&iter.seq); + else + skip_lines--; + if (KDB_FLAG(CMD_INTERRUPT)) + goto out; + } + + if (!cnt) + kdb_printf(" (ftrace buffer empty)\n"); + else + kdb_printf("---------------------------------\n"); + +out: + trace_flags = old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + + for_each_tracing_cpu(cpu) + if (iter.buffer_iter[cpu]) + ring_buffer_read_finish(iter.buffer_iter[cpu]); +} + +/* + * kdb_ftdump - Dump the ftrace log buffer + */ +static int kdb_ftdump(int argc, const char **argv) +{ + int skip_lines = 0; + long cpu_file; + char *cp; + + if (argc > 2) + return KDB_ARGCOUNT; + + if (argc) { + skip_lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + skip_lines = 0; + } + + if (argc == 2) { + cpu_file = simple_strtol(argv[2], &cp, 0); + if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || + !cpu_online(cpu_file)) + return KDB_BADINT; + } else { + cpu_file = TRACE_PIPE_ALL_CPU; + } + + kdb_trap_printk++; + ftrace_dump_buf(skip_lines, cpu_file); + kdb_trap_printk--; + + return 0; +} + +static __init int kdb_ftrace_register(void) +{ + kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + "Dump ftrace log", 0, KDB_REPEAT_NONE); + return 0; +} + +late_initcall(kdb_ftrace_register); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f52b5f50299d..8b27c9849b42 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,6 +30,8 @@ #include <linux/ptrace.h> #include <linux/perf_event.h> #include <linux/stringify.h> +#include <linux/limits.h> +#include <linux/uaccess.h> #include <asm/bitsperlong.h> #include "trace.h" @@ -38,6 +40,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 #define MAX_EVENT_NAME_LEN 64 +#define MAX_STRING_SIZE PATH_MAX #define KPROBE_EVENT_SYSTEM "kprobes" /* Reserved field names */ @@ -58,14 +61,16 @@ const char *reserved_field_names[] = { }; /* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, + void *); #define PRINT_TYPE_FUNC_NAME(type) print_type_##type #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type /* Printing in basic type function template */ #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, void *data)\ + const char *name, \ + void *data, void *ent)\ { \ return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ } \ @@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) +/* data_rloc: data relative location, compatible with u32 */ +#define make_data_rloc(len, roffs) \ + (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) +#define get_rloc_len(dl) ((u32)(dl) >> 16) +#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) + +static inline void *get_rloc_data(u32 *dl) +{ + return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_rloc_offs(*dl); +} + +/* + * Convert data_rloc to data_loc: + * data_rloc stores the offset from data_rloc itself, but data_loc + * stores the offset from event entry. + */ +#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) + +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +/* Print type function for string type */ +static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, + const char *name, + void *data, void *ent) +{ + int len = *(u32 *)data >> 16; + + if (!len) + return trace_seq_printf(s, " %s=(fault)", name); + else + return trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); +} +static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; + /* Data fetch function type */ typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); @@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm, return fprm->fn(regs, fprm->data, dest); } -#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type /* * Define macro for basic types - we don't need to define s* types, because * we have to care only about bitwidth at recording time. */ -#define DEFINE_BASIC_FETCH_FUNCS(kind) \ -DEFINE_FETCH_##kind(u8) \ -DEFINE_FETCH_##kind(u16) \ -DEFINE_FETCH_##kind(u32) \ -DEFINE_FETCH_##kind(u64) - -#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ - ((FETCH_FUNC_NAME(kind, u8) == fn) || \ - (FETCH_FUNC_NAME(kind, u16) == fn) || \ - (FETCH_FUNC_NAME(kind, u32) == fn) || \ - (FETCH_FUNC_NAME(kind, u64) == fn)) +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +#define CHECK_FETCH_FUNCS(method, fn) \ + (((FETCH_FUNC_NAME(method, u8) == fn) || \ + (FETCH_FUNC_NAME(method, u16) == fn) || \ + (FETCH_FUNC_NAME(method, u32) == fn) || \ + (FETCH_FUNC_NAME(method, u64) == fn) || \ + (FETCH_FUNC_NAME(method, string) == fn) || \ + (FETCH_FUNC_NAME(method, string_size) == fn)) \ + && (fn != NULL)) /* Data fetch function templates */ #define DEFINE_FETCH_reg(type) \ static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ + void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_register(regs, \ (unsigned int)((unsigned long)offset)); \ } DEFINE_BASIC_FETCH_FUNCS(reg) +/* No string on the register */ +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL #define DEFINE_FETCH_stack(type) \ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ @@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ (unsigned int)((unsigned long)offset)); \ } DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL #define DEFINE_FETCH_retval(type) \ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ @@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ *(type *)dest = (type)regs_return_value(regs); \ } DEFINE_BASIC_FETCH_FUNCS(retval) +/* No string on the retval */ +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL #define DEFINE_FETCH_memory(type) \ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ @@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ *(type *)dest = retval; \ } DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + u8 *src = addr; + mm_segment_t old_fs = get_fs(); + if (!maxlen) + return; + /* + * Try to get string again, since the string can be changed while + * probing. + */ + set_fs(KERNEL_DS); + pagefault_disable(); + do + ret = __copy_from_user_inatomic(dst++, src++, 1); + while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else + *(u32 *)dest = make_data_rloc(src - (u8 *)addr, + get_rloc_offs(*(u32 *)dest)); +} +/* Return the length of string -- including null terminal byte */ +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + int ret, len = 0; + u8 c; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} /* Memory fetching by symbol */ struct symbol_cache { @@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ } DEFINE_BASIC_FETCH_FUNCS(symbol) +DEFINE_FETCH_symbol(string) +DEFINE_FETCH_symbol(string_size) /* Dereference memory access function */ struct deref_fetch_param { @@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ } DEFINE_BASIC_FETCH_FUNCS(deref) +DEFINE_FETCH_deref(string) +DEFINE_FETCH_deref(string_size) static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { - if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) free_deref_fetch_param(data->orig.data); - else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) free_symbol_cache(data->orig.data); kfree(data); } @@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) -#define ASSIGN_FETCH_FUNC(kind, type) \ - .kind = FETCH_FUNC_NAME(kind, type) - -#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - {.name = #ptype, \ - .size = sizeof(ftype), \ - .is_signed = sign, \ - .print = PRINT_TYPE_FUNC_NAME(ptype), \ - .fmt = PRINT_TYPE_FMT_NAME(ptype), \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ +/* Fetch types */ +enum { + FETCH_MTD_reg = 0, + FETCH_MTD_stack, + FETCH_MTD_retval, + FETCH_MTD_memory, + FETCH_MTD_symbol, + FETCH_MTD_deref, + FETCH_MTD_END, +}; + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } \ } +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + /* Fetch type information table */ static const struct fetch_type { const char *name; /* Name of type */ @@ -264,14 +404,16 @@ static const struct fetch_type { int is_signed; /* Signed flag */ print_type_func_t print; /* Print functions */ const char *fmt; /* Fromat string */ + const char *fmttype; /* Name in format file */ /* Fetch functions */ - fetch_func_t reg; - fetch_func_t stack; - fetch_func_t retval; - fetch_func_t memory; - fetch_func_t symbol; - fetch_func_t deref; + fetch_func_t fetch[FETCH_MTD_END]; } fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ ASSIGN_FETCH_TYPE(u8, u8, 0), ASSIGN_FETCH_TYPE(u16, u16, 0), ASSIGN_FETCH_TYPE(u32, u32, 0), @@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs, *(unsigned long *)dest = kernel_stack_pointer(regs); } +static fetch_func_t get_fetch_size_function(const struct fetch_type *type, + fetch_func_t orig_fn) +{ + int i; + + if (type != &fetch_type_table[FETCH_TYPE_STRING]) + return NULL; /* Only string type needs size function */ + for (i = 0; i < FETCH_MTD_END; i++) + if (type->fetch[i] == orig_fn) + return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; + + WARN_ON(1); /* This should not happen */ + return NULL; +} + /** * Kprobe event core functions */ struct probe_arg { struct fetch_param fetch; + struct fetch_param fetch_size; unsigned int offset; /* Offset from argument entry */ const char *name; /* Name of this argument */ const char *comm; /* Command of this argument */ @@ -429,9 +587,9 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) + if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) free_deref_fetch_param(arg->fetch.data); - else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); kfree(arg->name); kfree(arg->comm); @@ -548,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, if (strcmp(arg, "retval") == 0) { if (is_return) - f->fn = t->retval; + f->fn = t->fetch[FETCH_MTD_retval]; else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { @@ -562,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { - f->fn = t->stack; + f->fn = t->fetch[FETCH_MTD_stack]; f->data = (void *)param; } } else @@ -588,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - f->fn = t->reg; + f->fn = t->fetch[FETCH_MTD_reg]; f->data = (void *)(unsigned long)ret; ret = 0; } @@ -598,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, ret = strict_strtoul(arg + 1, 0, ¶m); if (ret) break; - f->fn = t->memory; + f->fn = t->fetch[FETCH_MTD_memory]; f->data = (void *)param; } else { ret = split_symbol_offset(arg + 1, &offset); @@ -606,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, break; f->data = alloc_symbol_cache(arg + 1, offset); if (f->data) - f->fn = t->symbol; + f->fn = t->fetch[FETCH_MTD_symbol]; } break; case '+': /* deref memory */ @@ -636,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, if (ret) kfree(dprm); else { - f->fn = t->deref; + f->fn = t->fetch[FETCH_MTD_deref]; f->data = (void *)dprm; } } break; } - if (!ret && !f->fn) + if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ + pr_info("%s type has no corresponding fetch method.\n", + t->name); ret = -EINVAL; + } return ret; } @@ -652,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, struct probe_arg *parg, int is_return) { const char *t; + int ret; if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); @@ -674,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, } parg->offset = tp->size; tp->size += parg->type->size; - return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0) { + parg->fetch_size.fn = get_fetch_size_function(parg->type, + parg->fetch.fn); + parg->fetch_size.data = parg->fetch.data; + } + return ret; } /* Return 1 if name is reserved or already used by another argument */ @@ -757,14 +925,17 @@ static int create_trace_probe(int argc, char **argv) pr_info("Delete command needs an event name.\n"); return -EINVAL; } + mutex_lock(&probe_lock); tp = find_probe_event(event, group); if (!tp) { + mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); return -ENOENT; } /* delete an event */ unregister_trace_probe(tp); free_trace_probe(tp); + mutex_unlock(&probe_lock); return 0; } @@ -1043,6 +1214,54 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; +/* Sum up total data length for dynamic arraies (strings) */ +static __kprobes int __get_data_size(struct trace_probe *tp, + struct pt_regs *regs) +{ + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) + if (unlikely(tp->args[i].fetch_size.fn)) { + call_fetch(&tp->args[i].fetch_size, regs, &len); + ret += len; + } + + return ret; +} + +/* Store the value of each argument */ +static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, + struct pt_regs *regs, + u8 *data, int maxlen) +{ + int i; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + + for (i = 0; i < tp->nr_args; i++) { + if (unlikely(tp->args[i].fetch_size.fn)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + tp->args[i].offset); + *dl = make_data_rloc(maxlen, end - tp->args[i].offset); + /* Then try to fetch string or dynamic array data */ + call_fetch(&tp->args[i].fetch, regs, dl); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, + ent_size + tp->args[i].offset); + } else + /* Just fetching data normally */ + call_fetch(&tp->args[i].fetch, regs, + data + tp->args[i].offset); + } +} + /* Kprobe handler */ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { @@ -1050,8 +1269,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - u8 *data; - int size, i, pc; + int size, dsize, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -1060,7 +1278,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) local_save_flags(irq_flags); pc = preempt_count(); - size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); @@ -1069,9 +1288,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) entry = ring_buffer_event_data(event); entry->ip = (unsigned long)kp->addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1085,15 +1302,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - u8 *data; - int size, i, pc; + int size, pc, dsize; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; local_save_flags(irq_flags); pc = preempt_count(); - size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); @@ -1103,9 +1320,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, entry = ring_buffer_event_data(event); entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1137,7 +1352,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset)) + data + tp->args[i].offset, field)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1179,7 +1394,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset)) + data + tp->args[i].offset, field)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1214,11 +1429,6 @@ static void probe_event_disable(struct ftrace_event_call *call) } } -static int probe_event_raw_init(struct ftrace_event_call *event_call) -{ - return 0; -} - #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ do { \ @@ -1239,7 +1449,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); /* Set argument names as fields */ for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->name, + ret = trace_define_field(event_call, tp->args[i].type->fmttype, tp->args[i].name, sizeof(field) + tp->args[i].offset, tp->args[i].type->size, @@ -1261,7 +1471,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); /* Set argument names as fields */ for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->name, + ret = trace_define_field(event_call, tp->args[i].type->fmttype, tp->args[i].name, sizeof(field) + tp->args[i].offset, tp->args[i].type->size, @@ -1301,8 +1511,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); + if (strcmp(tp->args[i].type->name, "string") == 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", + tp->args[i].name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); } #undef LEN_OR_ZERO @@ -1339,11 +1554,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry_head *entry; struct hlist_head *head; - u8 *data; - int size, __size, i; + int size, __size, dsize; int rctx; - __size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1355,9 +1570,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, return; entry->ip = (unsigned long)kp->addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + memset(&entry[1], 0, dsize); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); @@ -1371,11 +1585,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; - u8 *data; - int size, __size, i; + int size, __size, dsize; int rctx; - __size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1388,9 +1602,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); @@ -1486,15 +1698,12 @@ static int register_probe_event(struct trace_probe *tp) int ret; /* Initialize ftrace_event_call */ + INIT_LIST_HEAD(&call->class->fields); if (probe_is_return(tp)) { - INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &kretprobe_funcs; - call->class->raw_init = probe_event_raw_init; call->class->define_fields = kretprobe_event_define_fields; } else { - INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &kprobe_funcs; - call->class->raw_init = probe_event_raw_init; call->class->define_fields = kprobe_event_define_fields; } if (set_print_fmt(tp) < 0) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c deleted file mode 100644 index 8eaf00749b65..000000000000 --- a/kernel/trace/trace_ksym.c +++ /dev/null @@ -1,508 +0,0 @@ -/* - * trace_ksym.c - Kernel Symbol Tracer - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2009 - */ - -#include <linux/kallsyms.h> -#include <linux/uaccess.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/fs.h> - -#include "trace_output.h" -#include "trace.h" - -#include <linux/hw_breakpoint.h> -#include <asm/hw_breakpoint.h> - -#include <asm/atomic.h> - -#define KSYM_TRACER_OP_LEN 3 /* rw- */ - -struct trace_ksym { - struct perf_event **ksym_hbp; - struct perf_event_attr attr; -#ifdef CONFIG_PROFILE_KSYM_TRACER - atomic64_t counter; -#endif - struct hlist_node ksym_hlist; -}; - -static struct trace_array *ksym_trace_array; - -static unsigned int ksym_tracing_enabled; - -static HLIST_HEAD(ksym_filter_head); - -static DEFINE_MUTEX(ksym_tracer_mutex); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - -#define MAX_UL_INT 0xffffffff - -void ksym_collect_stats(unsigned long hbp_hit_addr) -{ - struct hlist_node *node; - struct trace_ksym *entry; - - rcu_read_lock(); - hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - if (entry->attr.bp_addr == hbp_hit_addr) { - atomic64_inc(&entry->counter); - break; - } - } - rcu_read_unlock(); -} -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - -void ksym_hbp_handler(struct perf_event *hbp, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct ring_buffer_event *event; - struct ksym_trace_entry *entry; - struct ring_buffer *buffer; - int pc; - - if (!ksym_tracing_enabled) - return; - - buffer = ksym_trace_array->buffer; - - pc = preempt_count(); - - event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, - sizeof(*entry), 0, pc); - if (!event) - return; - - entry = ring_buffer_event_data(event); - entry->ip = instruction_pointer(regs); - entry->type = hw_breakpoint_type(hbp); - entry->addr = hw_breakpoint_addr(hbp); - strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - ksym_collect_stats(hw_breakpoint_addr(hbp)); -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -/* Valid access types are represented as - * - * rw- : Set Read/Write Access Breakpoint - * -w- : Set Write Access Breakpoint - * --- : Clear Breakpoints - * --x : Set Execution Break points (Not available yet) - * - */ -static int ksym_trace_get_access_type(char *str) -{ - int access = 0; - - if (str[0] == 'r') - access |= HW_BREAKPOINT_R; - - if (str[1] == 'w') - access |= HW_BREAKPOINT_W; - - if (str[2] == 'x') - access |= HW_BREAKPOINT_X; - - switch (access) { - case HW_BREAKPOINT_R: - case HW_BREAKPOINT_W: - case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - return access; - default: - return -EINVAL; - } -} - -/* - * There can be several possible malformed requests and we attempt to capture - * all of them. We enumerate some of the rules - * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. - * i.e. multiple ':' symbols disallowed. Possible uses are of the form - * <module>:<ksym_name>:<op>. - * 2. No delimiter symbol ':' in the input string - * 3. Spurious operator symbols or symbols not in their respective positions - * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file - * 5. Kernel symbol not a part of /proc/kallsyms - * 6. Duplicate requests - */ -static int parse_ksym_trace_str(char *input_string, char **ksymname, - unsigned long *addr) -{ - int ret; - - *ksymname = strsep(&input_string, ":"); - *addr = kallsyms_lookup_name(*ksymname); - - /* Check for malformed request: (2), (1) and (5) */ - if ((!input_string) || - (strlen(input_string) != KSYM_TRACER_OP_LEN) || - (*addr == 0)) - return -EINVAL;; - - ret = ksym_trace_get_access_type(input_string); - - return ret; -} - -int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) -{ - struct trace_ksym *entry; - int ret = -ENOMEM; - - entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - hw_breakpoint_init(&entry->attr); - - entry->attr.bp_type = op; - entry->attr.bp_addr = addr; - entry->attr.bp_len = HW_BREAKPOINT_LEN_4; - - entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, - ksym_hbp_handler); - - if (IS_ERR(entry->ksym_hbp)) { - ret = PTR_ERR(entry->ksym_hbp); - if (ret == -ENOSPC) { - printk(KERN_ERR "ksym_tracer: Maximum limit reached." - " No new requests for tracing can be accepted now.\n"); - } else { - printk(KERN_INFO "ksym_tracer request failed. Try again" - " later!!\n"); - } - goto err; - } - - hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); - - return 0; - -err: - kfree(entry); - - return ret; -} - -static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - struct trace_ksym *entry; - struct hlist_node *node; - struct trace_seq *s; - ssize_t cnt = 0; - int ret; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - trace_seq_init(s); - - mutex_lock(&ksym_tracer_mutex); - - hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - ret = trace_seq_printf(s, "%pS:", - (void *)(unsigned long)entry->attr.bp_addr); - if (entry->attr.bp_type == HW_BREAKPOINT_R) - ret = trace_seq_puts(s, "r--\n"); - else if (entry->attr.bp_type == HW_BREAKPOINT_W) - ret = trace_seq_puts(s, "-w-\n"); - else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) - ret = trace_seq_puts(s, "rw-\n"); - WARN_ON_ONCE(!ret); - } - - cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); - - mutex_unlock(&ksym_tracer_mutex); - - kfree(s); - - return cnt; -} - -static void __ksym_trace_reset(void) -{ - struct trace_ksym *entry; - struct hlist_node *node, *node1; - - mutex_lock(&ksym_tracer_mutex); - hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, - ksym_hlist) { - unregister_wide_hw_breakpoint(entry->ksym_hbp); - hlist_del_rcu(&(entry->ksym_hlist)); - synchronize_rcu(); - kfree(entry); - } - mutex_unlock(&ksym_tracer_mutex); -} - -static ssize_t ksym_trace_filter_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct trace_ksym *entry; - struct hlist_node *node; - char *buf, *input_string, *ksymname = NULL; - unsigned long ksym_addr = 0; - int ret, op, changed = 0; - - buf = kzalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(buf, buffer, count)) - goto out; - - buf[count] = '\0'; - input_string = strstrip(buf); - - /* - * Clear all breakpoints if: - * 1: echo > ksym_trace_filter - * 2: echo 0 > ksym_trace_filter - * 3: echo "*:---" > ksym_trace_filter - */ - if (!input_string[0] || !strcmp(input_string, "0") || - !strcmp(input_string, "*:---")) { - __ksym_trace_reset(); - ret = 0; - goto out; - } - - ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); - if (ret < 0) - goto out; - - mutex_lock(&ksym_tracer_mutex); - - ret = -EINVAL; - hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - if (entry->attr.bp_addr == ksym_addr) { - /* Check for malformed request: (6) */ - if (entry->attr.bp_type != op) - changed = 1; - else - goto out_unlock; - break; - } - } - if (changed) { - unregister_wide_hw_breakpoint(entry->ksym_hbp); - entry->attr.bp_type = op; - ret = 0; - if (op > 0) { - entry->ksym_hbp = - register_wide_hw_breakpoint(&entry->attr, - ksym_hbp_handler); - if (IS_ERR(entry->ksym_hbp)) - ret = PTR_ERR(entry->ksym_hbp); - else - goto out_unlock; - } - /* Error or "symbol:---" case: drop it */ - hlist_del_rcu(&(entry->ksym_hlist)); - synchronize_rcu(); - kfree(entry); - goto out_unlock; - } else { - /* Check for malformed request: (4) */ - if (op) - ret = process_new_ksym_entry(ksymname, op, ksym_addr); - } -out_unlock: - mutex_unlock(&ksym_tracer_mutex); -out: - kfree(buf); - return !ret ? count : ret; -} - -static const struct file_operations ksym_tracing_fops = { - .open = tracing_open_generic, - .read = ksym_trace_filter_read, - .write = ksym_trace_filter_write, -}; - -static void ksym_trace_reset(struct trace_array *tr) -{ - ksym_tracing_enabled = 0; - __ksym_trace_reset(); -} - -static int ksym_trace_init(struct trace_array *tr) -{ - int cpu, ret = 0; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - ksym_tracing_enabled = 1; - ksym_trace_array = tr; - - return ret; -} - -static void ksym_trace_print_header(struct seq_file *m) -{ - seq_puts(m, - "# TASK-PID CPU# Symbol " - "Type Function\n"); - seq_puts(m, - "# | | | " - " | |\n"); -} - -static enum print_line_t ksym_trace_output(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct ksym_trace_entry *field; - char str[KSYM_SYMBOL_LEN]; - int ret; - - if (entry->type != TRACE_KSYM) - return TRACE_TYPE_UNHANDLED; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, - entry->pid, iter->cpu, (char *)field->addr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - switch (field->type) { - case HW_BREAKPOINT_R: - ret = trace_seq_printf(s, " R "); - break; - case HW_BREAKPOINT_W: - ret = trace_seq_printf(s, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - ret = trace_seq_printf(s, " RW "); - break; - default: - return TRACE_TYPE_PARTIAL_LINE; - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - sprint_symbol(str, field->ip); - ret = trace_seq_printf(s, "%s\n", str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -struct tracer ksym_tracer __read_mostly = -{ - .name = "ksym_tracer", - .init = ksym_trace_init, - .reset = ksym_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_ksym, -#endif - .print_header = ksym_trace_print_header, - .print_line = ksym_trace_output -}; - -#ifdef CONFIG_PROFILE_KSYM_TRACER -static int ksym_profile_show(struct seq_file *m, void *v) -{ - struct hlist_node *node; - struct trace_ksym *entry; - int access_type = 0; - char fn_name[KSYM_NAME_LEN]; - - seq_puts(m, " Access Type "); - seq_puts(m, " Symbol Counter\n"); - seq_puts(m, " ----------- "); - seq_puts(m, " ------ -------\n"); - - rcu_read_lock(); - hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - - access_type = entry->attr.bp_type; - - switch (access_type) { - case HW_BREAKPOINT_R: - seq_puts(m, " R "); - break; - case HW_BREAKPOINT_W: - seq_puts(m, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - seq_puts(m, " RW "); - break; - default: - seq_puts(m, " NA "); - } - - if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) - seq_printf(m, " %-36s", fn_name); - else - seq_printf(m, " %-36s", "<NA>"); - seq_printf(m, " %15llu\n", - (unsigned long long)atomic64_read(&entry->counter)); - } - rcu_read_unlock(); - - return 0; -} - -static int ksym_profile_open(struct inode *node, struct file *file) -{ - return single_open(file, ksym_profile_show, NULL); -} - -static const struct file_operations ksym_profile_fops = { - .open = ksym_profile_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - -__init static int init_ksym_trace(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - - trace_create_file("ksym_trace_filter", 0644, d_tracer, - NULL, &ksym_tracing_fops); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - trace_create_file("ksym_profile", 0444, d_tracer, - NULL, &ksym_profile_fops); -#endif - - return register_tracer(&ksym_tracer); -} -device_initcall(init_ksym_trace); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57c1b4596470..02272baa2206 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -16,9 +16,6 @@ DECLARE_RWSEM(trace_event_mutex); -DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); -EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); - static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; @@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = { .funcs = &trace_wake_funcs, }; -/* TRACE_SPECIAL */ -static enum print_line_t trace_special_print(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_hex(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_HEX_FIELD_RET(s, field->arg1); - SEQ_PUT_HEX_FIELD_RET(s, field->arg2); - SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_bin(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_FIELD_RET(s, field->arg1); - SEQ_PUT_FIELD_RET(s, field->arg2); - SEQ_PUT_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static struct trace_event_functions trace_special_funcs = { - .trace = trace_special_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, -}; - -static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, - .funcs = &trace_special_funcs, -}; - /* TRACE_STACK */ static enum print_line_t trace_stack_print(struct trace_iterator *iter, @@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, static struct trace_event_functions trace_stack_funcs = { .trace = trace_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, }; static struct trace_event trace_stack_event = { @@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, static struct trace_event_functions trace_user_stack_funcs = { .trace = trace_user_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, }; static struct trace_event trace_user_stack_event = { @@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, &trace_wake_event, - &trace_special_event, &trace_stack_event, &trace_user_stack_event, &trace_bprint_event, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0e73bc2ef8c5..4086eae6e81b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int resched; int cpu; int pc; @@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); if (cpu != wakeup_current_cpu) @@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&data->disabled); out_enable: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = @@ -383,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .use_max_tr = 1, }; static struct tracer wakeup_rt_tracer __read_mostly = @@ -397,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .use_max_tr = 1, }; __init static int init_wakeup_tracer(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 250e7f9bd2f0..155a415b3209 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_WAKE: case TRACE_STACK: case TRACE_PRINT: - case TRACE_SPECIAL: case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: - case TRACE_KSYM: return 1; } return 0; @@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ -#ifdef CONFIG_SYSPROF_TRACER -int -trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_SYSPROF_TRACER */ - #ifdef CONFIG_BRANCH_TRACER int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) @@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_BRANCH_TRACER */ -#ifdef CONFIG_KSYM_TRACER -static int ksym_selftest_dummy; - -int -trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - ksym_selftest_dummy = 0; - /* Register the read-write tracing request */ - - ret = process_new_ksym_entry("ksym_selftest_dummy", - HW_BREAKPOINT_R | HW_BREAKPOINT_W, - (unsigned long)(&ksym_selftest_dummy)); - - if (ret < 0) { - printk(KERN_CONT "ksym_trace read-write startup test failed\n"); - goto ret_path; - } - /* Perform a read and a write operation over the dummy variable to - * trigger the tracer - */ - if (ksym_selftest_dummy == 0) - ksym_selftest_dummy++; - - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - /* read & write operations - one each is performed on the dummy variable - * triggering two entries in the trace buffer - */ - if (!ret && count != 2) { - printk(KERN_CONT "Ksym tracer startup test failed"); - ret = -1; - } - -ret_path: - return ret; -} -#endif /* CONFIG_KSYM_TRACER */ - diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index f4bc9b27de5f..056468eae7cf 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -110,12 +110,12 @@ static inline void check_stack(void) static void stack_trace_call(unsigned long ip, unsigned long parent_ip) { - int cpu, resched; + int cpu; if (unlikely(!ftrace_enabled || stack_trace_disabled)) return; - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ @@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) out: per_cpu(trace_active, cpu)--; /* prevent recursion in schedule */ - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 34e35804304b..bac752f0cfb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event, static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); +/* All syscall exit events have the same fields */ +static LIST_HEAD(syscall_exit_fields); + static struct list_head * syscall_get_enter_fields(struct ftrace_event_call *call) { @@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call) static struct list_head * syscall_get_exit_fields(struct ftrace_event_call *call) { - struct syscall_metadata *entry = call->data; - - return &entry->exit_fields; + return &syscall_exit_fields; } struct trace_event_functions enter_syscall_print_funcs = { diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c deleted file mode 100644 index a7974a552ca9..000000000000 --- a/kernel/trace/trace_sysprof.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * trace stack traces - * - * Copyright (C) 2004-2008, Soeren Sandmann - * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> - * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> - */ -#include <linux/kallsyms.h> -#include <linux/debugfs.h> -#include <linux/hrtimer.h> -#include <linux/uaccess.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/irq.h> -#include <linux/fs.h> - -#include <asm/stacktrace.h> - -#include "trace.h" - -static struct trace_array *sysprof_trace; -static int __read_mostly tracer_enabled; - -/* - * 1 msec sample interval by default: - */ -static unsigned long sample_period = 1000000; -static const unsigned int sample_max_depth = 512; - -static DEFINE_MUTEX(sample_timer_lock); -/* - * Per CPU hrtimers that do the profiling: - */ -static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); - -struct stack_frame { - const void __user *next_fp; - unsigned long return_address; -}; - -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) -{ - int ret; - - if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) - return 0; - - ret = 1; - pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) - ret = 0; - pagefault_enable(); - - return ret; -} - -struct backtrace_info { - struct trace_array_cpu *data; - struct trace_array *tr; - int pos; -}; - -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - -static int backtrace_stack(void *data, char *name) -{ - /* Don't bother with IRQ stacks for now */ - return -1; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct backtrace_info *info = data; - - if (info->pos < sample_max_depth && reliable) { - __trace_special(info->tr, info->data, 1, addr, 0); - - info->pos++; - } -} - -static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack, -}; - -static int -trace_kernel(struct pt_regs *regs, struct trace_array *tr, - struct trace_array_cpu *data) -{ - struct backtrace_info info; - unsigned long bp; - char *stack; - - info.tr = tr; - info.data = data; - info.pos = 1; - - __trace_special(info.tr, info.data, 1, regs->ip, 0); - - stack = ((char *)regs + sizeof(struct pt_regs)); -#ifdef CONFIG_FRAME_POINTER - bp = regs->bp; -#else - bp = 0; -#endif - - dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); - - return info.pos; -} - -static void timer_notify(struct pt_regs *regs, int cpu) -{ - struct trace_array_cpu *data; - struct stack_frame frame; - struct trace_array *tr; - const void __user *fp; - int is_user; - int i; - - if (!regs) - return; - - tr = sysprof_trace; - data = tr->data[cpu]; - is_user = user_mode(regs); - - if (!current || current->pid == 0) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - __trace_special(tr, data, 0, 0, current->pid); - - if (!is_user) - i = trace_kernel(regs, tr, data); - else - i = 0; - - /* - * Trace user stack if we are not a kernel thread - */ - if (current->mm && i < sample_max_depth) { - regs = (struct pt_regs *)current->thread.sp0 - 1; - - fp = (void __user *)regs->bp; - - __trace_special(tr, data, 2, regs->ip, 0); - - while (i < sample_max_depth) { - frame.next_fp = NULL; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; - - __trace_special(tr, data, 2, frame.return_address, - (unsigned long)fp); - fp = frame.next_fp; - - i++; - } - - } - - /* - * Special trace entry if we overflow the max depth: - */ - if (i == sample_max_depth) - __trace_special(tr, data, -1, -1, -1); - - __trace_special(tr, data, 3, current->pid, i); -} - -static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) -{ - /* trace here */ - timer_notify(get_irq_regs(), smp_processor_id()); - - hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - - return HRTIMER_RESTART; -} - -static void start_stack_timer(void *unused) -{ - struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); - - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = stack_trace_timer_fn; - - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); -} - -static void start_stack_timers(void) -{ - on_each_cpu(start_stack_timer, NULL, 1); -} - -static void stop_stack_timer(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); - - hrtimer_cancel(hrtimer); -} - -static void stop_stack_timers(void) -{ - int cpu; - - for_each_online_cpu(cpu) - stop_stack_timer(cpu); -} - -static void stop_stack_trace(struct trace_array *tr) -{ - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - tracer_enabled = 0; - mutex_unlock(&sample_timer_lock); -} - -static int stack_trace_init(struct trace_array *tr) -{ - sysprof_trace = tr; - - tracing_start_cmdline_record(); - - mutex_lock(&sample_timer_lock); - start_stack_timers(); - tracer_enabled = 1; - mutex_unlock(&sample_timer_lock); - return 0; -} - -static void stack_trace_reset(struct trace_array *tr) -{ - tracing_stop_cmdline_record(); - stop_stack_trace(tr); -} - -static struct tracer stack_trace __read_mostly = -{ - .name = "sysprof", - .init = stack_trace_init, - .reset = stack_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sysprof, -#endif -}; - -__init static int init_stack_trace(void) -{ - return register_tracer(&stack_trace); -} -device_initcall(init_stack_trace); - -#define MAX_LONG_DIGITS 22 - -static ssize_t -sysprof_sample_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - int r; - - r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -sysprof_sample_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - unsigned long val; - - if (cnt > MAX_LONG_DIGITS-1) - cnt = MAX_LONG_DIGITS-1; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - val = simple_strtoul(buf, NULL, 10); - /* - * Enforce a minimum sample period of 100 usecs: - */ - if (val < 100) - val = 100; - - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - sample_period = val * 1000; - start_stack_timers(); - mutex_unlock(&sample_timer_lock); - - return cnt; -} - -static const struct file_operations sysprof_sample_fops = { - .read = sysprof_sample_read, - .write = sysprof_sample_write, -}; - -void init_tracer_sysprof_debugfs(struct dentry *d_tracer) -{ - - trace_create_file("sysprof_sample_period", 0644, - d_tracer, NULL, &sysprof_sample_fops); -} diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b2d70d38dff4..25915832291a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/highuid.h> #include <linux/cred.h> /* @@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref) schedule_work(&ns->destroyer); } EXPORT_SYMBOL(free_user_ns); + +uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return uid; + + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (uid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowuid; +} + +gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return gid; + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (gid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowgid; +} diff --git a/kernel/watchdog.c b/kernel/watchdog.c new file mode 100644 index 000000000000..613bc1f04610 --- /dev/null +++ b/kernel/watchdog.c @@ -0,0 +1,567 @@ +/* + * Detect hard and soft lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * this code detects hard lockups: incidents in where on a CPU + * the kernel does not respond to anything except NMI. + * + * Note: Most of this code is borrowed heavily from softlockup.c, + * so thanks to Ingo for the initial implementation. + * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * to those contributors as well. + */ + +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/lockdep.h> +#include <linux/notifier.h> +#include <linux/module.h> +#include <linux/sysctl.h> + +#include <asm/irq_regs.h> +#include <linux/perf_event.h> + +int watchdog_enabled; +int __read_mostly softlockup_thresh = 60; + +static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); +static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(bool, softlockup_touch_sync); +static DEFINE_PER_CPU(bool, soft_watchdog_warn); +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +#endif + +static int __read_mostly did_panic; +static int __initdata no_watchdog; + + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static int hardlockup_panic; + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); +#endif + +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + +static int __init softlockup_panic_setup(char *str) +{ + softlockup_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("softlockup_panic=", softlockup_panic_setup); + +static int __init nowatchdog_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nowatchdog", nowatchdog_setup); + +/* deprecated */ +static int __init nosoftlockup_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); +/* */ + + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(int this_cpu) +{ + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static unsigned long get_sample_period(void) +{ + /* + * convert softlockup_thresh from seconds to ns + * the divide by 5 is to give hrtimer 5 chances to + * increment before the hardlockup detector generates + * a warning + */ + return softlockup_thresh / 5 * NSEC_PER_SEC; +} + +/* Commands for resetting the watchdog */ +static void __touch_watchdog(void) +{ + int this_cpu = smp_processor_id(); + + __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); +} + +void touch_softlockup_watchdog(void) +{ + __get_cpu_var(watchdog_touch_ts) = 0; +} +EXPORT_SYMBOL(touch_softlockup_watchdog); + +void touch_all_softlockup_watchdogs(void) +{ + int cpu; + + /* + * this is done lockless + * do we care if a 0 races with a timestamp? + * all it means is the softlock check starts one cycle later + */ + for_each_online_cpu(cpu) + per_cpu(watchdog_touch_ts, cpu) = 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +void touch_nmi_watchdog(void) +{ + __get_cpu_var(watchdog_nmi_touch) = true; + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +#endif + +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlockup_touch_sync) = true; + __raw_get_cpu_var(watchdog_touch_ts) = 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* watchdog detector functions */ +static int is_hardlockup(void) +{ + unsigned long hrint = __get_cpu_var(hrtimer_interrupts); + + if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) + return 1; + + __get_cpu_var(hrtimer_interrupts_saved) = hrint; + return 0; +} +#endif + +static int is_softlockup(unsigned long touch_ts) +{ + unsigned long now = get_timestamp(smp_processor_id()); + + /* Warn about unreasonable delays: */ + if (time_after(now, touch_ts + softlockup_thresh)) + return now - touch_ts; + + return 0; +} + +static int +watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = watchdog_panic, +}; + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +void watchdog_overflow_callback(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + if (__get_cpu_var(watchdog_nmi_touch) == true) { + __get_cpu_var(watchdog_nmi_touch) = false; + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__get_cpu_var(hard_watchdog_warn) == true) + return; + + if (hardlockup_panic) + panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + else + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + + __get_cpu_var(hard_watchdog_warn) = true; + return; + } + + __get_cpu_var(hard_watchdog_warn) = false; + return; +} +static void watchdog_interrupt_count(void) +{ + __get_cpu_var(hrtimer_interrupts)++; +} +#else +static inline void watchdog_interrupt_count(void) { return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +/* watchdog kicker functions */ +static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +{ + unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); + struct pt_regs *regs = get_irq_regs(); + int duration; + + /* kick the hardlockup detector */ + watchdog_interrupt_count(); + + /* kick the softlockup detector */ + wake_up_process(__get_cpu_var(softlockup_watchdog)); + + /* .. and repeat */ + hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + + if (touch_ts == 0) { + if (unlikely(__get_cpu_var(softlockup_touch_sync))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + __get_cpu_var(softlockup_touch_sync) = false; + sched_clock_tick(); + } + __touch_watchdog(); + return HRTIMER_RESTART; + } + + /* check for a softlockup + * This is done by making sure a high priority task is + * being scheduled. The task touches the watchdog to + * indicate it is getting cpu time. If it hasn't then + * this is a good indication some task is hogging the cpu + */ + duration = is_softlockup(touch_ts); + if (unlikely(duration)) { + /* only warn once */ + if (__get_cpu_var(soft_watchdog_warn) == true) + return HRTIMER_RESTART; + + printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + smp_processor_id(), duration, + current->comm, task_pid_nr(current)); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (softlockup_panic) + panic("softlockup: hung tasks"); + __get_cpu_var(soft_watchdog_warn) = true; + } else + __get_cpu_var(soft_watchdog_warn) = false; + + return HRTIMER_RESTART; +} + + +/* + * The watchdog thread - touches the timestamp. + */ +static int watchdog(void *unused) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + + /* initialize timestamp */ + __touch_watchdog(); + + /* kick off the timer for the hardlockup detector */ + /* done here because hrtimer_start can only pin to smp_processor_id() */ + hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + HRTIMER_MODE_REL_PINNED); + + set_current_state(TASK_INTERRUPTIBLE); + /* + * Run briefly once per second to reset the softlockup timestamp. + * If this gets delayed for more than 60 seconds then the + * debug-printout triggers in watchdog_timer_fn(). + */ + while (!kthread_should_stop()) { + __touch_watchdog(); + schedule(); + + if (kthread_should_stop()) + break; + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static int watchdog_nmi_enable(int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + /* Try to register using hardware perf events */ + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(); + event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); + if (!IS_ERR(event)) { + printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + goto out_save; + } + + printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); + return -1; + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +static void watchdog_nmi_disable(int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + return; +} +#else +static int watchdog_nmi_enable(int cpu) { return 0; } +static void watchdog_nmi_disable(int cpu) { return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +/* prepare/enable/disable routines */ +static int watchdog_prepare_cpu(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + WARN_ON(per_cpu(softlockup_watchdog, cpu)); + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + + return 0; +} + +static int watchdog_enable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + + /* enable the perf event */ + if (watchdog_nmi_enable(cpu) != 0) + return -1; + + /* create the watchdog thread */ + if (!p) { + p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); + if (IS_ERR(p)) { + printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + return -1; + } + kthread_bind(p, cpu); + per_cpu(watchdog_touch_ts, cpu) = 0; + per_cpu(softlockup_watchdog, cpu) = p; + wake_up_process(p); + } + + return 0; +} + +static void watchdog_disable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + /* + * cancel the timer first to stop incrementing the stats + * and waking up the kthread + */ + hrtimer_cancel(hrtimer); + + /* disable the perf event */ + watchdog_nmi_disable(cpu); + + /* stop the watchdog thread */ + if (p) { + per_cpu(softlockup_watchdog, cpu) = NULL; + kthread_stop(p); + } + + /* if any cpu succeeds, watchdog is considered enabled for the system */ + watchdog_enabled = 1; +} + +static void watchdog_enable_all_cpus(void) +{ + int cpu; + int result = 0; + + for_each_online_cpu(cpu) + result += watchdog_enable(cpu); + + if (result) + printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + +} + +static void watchdog_disable_all_cpus(void) +{ + int cpu; + + for_each_online_cpu(cpu) + watchdog_disable(cpu); + + /* if all watchdogs are disabled, then they are disabled for the system */ + watchdog_enabled = 0; +} + + +/* sysctl functions */ +#ifdef CONFIG_SYSCTL +/* + * proc handler for /proc/sys/kernel/nmi_watchdog + */ + +int proc_dowatchdog_enabled(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, buffer, length, ppos); + + if (watchdog_enabled) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + return 0; +} + +int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif /* CONFIG_SYSCTL */ + + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __cpuinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (watchdog_prepare_cpu(hotcpu)) + return NOTIFY_BAD; + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (watchdog_enable(hotcpu)) + return NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + watchdog_disable(hotcpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + watchdog_disable(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +static int __init spawn_watchdog_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + if (no_watchdog) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + WARN_ON(err == NOTIFY_BAD); + + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + return 0; +} +early_initcall(spawn_watchdog_task); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 327d2deb4451..9ca34cddaf6d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,41 +33,287 @@ #include <linux/kallsyms.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> -#define CREATE_TRACE_POINTS -#include <trace/events/workqueue.h> +#include <linux/idr.h> + +#include "workqueue_sched.h" + +enum { + /* global_cwq flags */ + GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ + GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ + GCWQ_FREEZING = 1 << 3, /* freeze in progress */ + GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ + + /* worker flags */ + WORKER_STARTED = 1 << 0, /* started */ + WORKER_DIE = 1 << 1, /* die die die */ + WORKER_IDLE = 1 << 2, /* is idle */ + WORKER_PREP = 1 << 3, /* preparing to run works */ + WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ + WORKER_REBIND = 1 << 5, /* mom is home, come back */ + WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ + WORKER_UNBOUND = 1 << 7, /* worker is unbound */ + + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | + WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + + /* gcwq->trustee_state */ + TRUSTEE_START = 0, /* start */ + TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ + TRUSTEE_BUTCHER = 2, /* butcher workers */ + TRUSTEE_RELEASE = 3, /* release workers */ + TRUSTEE_DONE = 4, /* trustee is done */ + + BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ + BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, + BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, + + MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ + IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ + + MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ + MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ + CREATE_COOLDOWN = HZ, /* time to breath after fail */ + TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ + + /* + * Rescue workers are used only on emergencies and shared by + * all cpus. Give -20. + */ + RESCUER_NICE_LEVEL = -20, +}; /* - * The per-CPU workqueue (if single thread, we always use the first - * possible cpu). + * Structure fields follow one of the following exclusion rules. + * + * I: Set during initialization and read-only afterwards. + * + * P: Preemption protected. Disabling preemption is enough and should + * only be modified and accessed from the local cpu. + * + * L: gcwq->lock protected. Access with gcwq->lock held. + * + * X: During normal operation, modification requires gcwq->lock and + * should be done only from local cpu. Either disabling preemption + * on local cpu or grabbing gcwq->lock is enough for read access. + * If GCWQ_DISASSOCIATED is set, it's identical to L. + * + * F: wq->flush_mutex protected. + * + * W: workqueue_lock protected. */ -struct cpu_workqueue_struct { - spinlock_t lock; +struct global_cwq; - struct list_head worklist; - wait_queue_head_t more_work; - struct work_struct *current_work; +/* + * The poor guys doing the actual heavy lifting. All on-duty workers + * are either serving the manager role, on idle list or on busy hash. + */ +struct worker { + /* on idle list while idle, on busy hash table while busy */ + union { + struct list_head entry; /* L: while idle */ + struct hlist_node hentry; /* L: while busy */ + }; - struct workqueue_struct *wq; - struct task_struct *thread; -} ____cacheline_aligned; + struct work_struct *current_work; /* L: work being processed */ + struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ + struct list_head scheduled; /* L: scheduled works */ + struct task_struct *task; /* I: worker task */ + struct global_cwq *gcwq; /* I: the associated gcwq */ + /* 64 bytes boundary on 64bit, 32 on 32bit */ + unsigned long last_active; /* L: last active timestamp */ + unsigned int flags; /* X: flags */ + int id; /* I: worker id */ + struct work_struct rebind_work; /* L: rebind worker to cpu */ +}; + +/* + * Global per-cpu workqueue. There's one and only one for each cpu + * and all works are queued and processed here regardless of their + * target workqueues. + */ +struct global_cwq { + spinlock_t lock; /* the gcwq lock */ + struct list_head worklist; /* L: list of pending works */ + unsigned int cpu; /* I: the associated cpu */ + unsigned int flags; /* L: GCWQ_* flags */ + + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle ones */ + + /* workers are chained either in the idle_list or busy_hash */ + struct list_head idle_list; /* X: list of idle workers */ + struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; + /* L: hash of busy workers */ + + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for dworkers */ + + struct ida worker_ida; /* L: for worker IDs */ + + struct task_struct *trustee; /* L: for gcwq shutdown */ + unsigned int trustee_state; /* L: trustee state */ + wait_queue_head_t trustee_wait; /* trustee wait */ + struct worker *first_idle; /* L: first idle worker */ +} ____cacheline_aligned_in_smp; + +/* + * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of + * work_struct->data are used for flags and thus cwqs need to be + * aligned at two's power of the number of flag bits. + */ +struct cpu_workqueue_struct { + struct global_cwq *gcwq; /* I: the associated gcwq */ + struct workqueue_struct *wq; /* I: the owning workqueue */ + int work_color; /* L: current color */ + int flush_color; /* L: flushing color */ + int nr_in_flight[WORK_NR_COLORS]; + /* L: nr of in_flight works */ + int nr_active; /* L: nr of active works */ + int max_active; /* L: max active works */ + struct list_head delayed_works; /* L: delayed works */ +}; + +/* + * Structure used to wait for workqueue flush. + */ +struct wq_flusher { + struct list_head list; /* F: list of flushers */ + int flush_color; /* F: flush color waiting for */ + struct completion done; /* flush completion */ +}; + +/* + * All cpumasks are assumed to be always set on UP and thus can't be + * used to determine whether there's something to be done. + */ +#ifdef CONFIG_SMP +typedef cpumask_var_t mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) \ + cpumask_test_and_set_cpu((cpu), (mask)) +#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) +#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) +#define alloc_mayday_mask(maskp, gfp) alloc_cpumask_var((maskp), (gfp)) +#define free_mayday_mask(mask) free_cpumask_var((mask)) +#else +typedef unsigned long mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) +#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) +#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) +#define alloc_mayday_mask(maskp, gfp) true +#define free_mayday_mask(mask) do { } while (0) +#endif /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct *cpu_wq; - struct list_head list; - const char *name; - int singlethread; - int freezeable; /* Freeze threads during suspend */ - int rt; + unsigned int flags; /* I: WQ_* flags */ + union { + struct cpu_workqueue_struct __percpu *pcpu; + struct cpu_workqueue_struct *single; + unsigned long v; + } cpu_wq; /* I: cwq's */ + struct list_head list; /* W: list of all workqueues */ + + struct mutex flush_mutex; /* protects wq flushing */ + int work_color; /* F: current work color */ + int flush_color; /* F: current flush color */ + atomic_t nr_cwqs_to_flush; /* flush in progress */ + struct wq_flusher *first_flusher; /* F: first flusher */ + struct list_head flusher_queue; /* F: flush waiters */ + struct list_head flusher_overflow; /* F: flush overflow list */ + + mayday_mask_t mayday_mask; /* cpus requesting rescue */ + struct worker *rescuer; /* I: rescue worker */ + + int saved_max_active; /* W: saved cwq max_active */ + const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; + struct lockdep_map lockdep_map; #endif }; +struct workqueue_struct *system_wq __read_mostly; +struct workqueue_struct *system_long_wq __read_mostly; +struct workqueue_struct *system_nrt_wq __read_mostly; +struct workqueue_struct *system_unbound_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL_GPL(system_long_wq); +EXPORT_SYMBOL_GPL(system_nrt_wq); +EXPORT_SYMBOL_GPL(system_unbound_wq); + +#define for_each_busy_worker(worker, i, pos, gcwq) \ + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ + hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) + +static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, + unsigned int sw) +{ + if (cpu < nr_cpu_ids) { + if (sw & 1) { + cpu = cpumask_next(cpu, mask); + if (cpu < nr_cpu_ids) + return cpu; + } + if (sw & 2) + return WORK_CPU_UNBOUND; + } + return WORK_CPU_NONE; +} + +static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, + struct workqueue_struct *wq) +{ + return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); +} + +/* + * CPU iterators + * + * An extra gcwq is defined for an invalid cpu number + * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any + * specific CPU. The following iterators are similar to + * for_each_*_cpu() iterators but also considers the unbound gcwq. + * + * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND + * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND + * for_each_cwq_cpu() : possible CPUs for bound workqueues, + * WORK_CPU_UNBOUND for unbound workqueues + */ +#define for_each_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) + +#define for_each_online_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) + +#define for_each_cwq_cpu(cpu, wq) \ + for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) + +#ifdef CONFIG_LOCKDEP +/** + * in_workqueue_context() - in context of specified workqueue? + * @wq: the workqueue of interest + * + * Checks lockdep state to see if the current task is executing from + * within a workqueue item. This function exists only if lockdep is + * enabled. + */ +int in_workqueue_context(struct workqueue_struct *wq) +{ + return lock_is_held(&wq->lockdep_map); +} +#endif + #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; @@ -107,7 +353,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) * statically initialized. We just make sure that it * is tracked in the object tracker. */ - if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { + if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { debug_object_init(work, &work_debug_descr); debug_object_activate(work, &work_debug_descr); return 0; @@ -181,94 +427,575 @@ static inline void debug_work_deactivate(struct work_struct *work) { } /* Serializes the accesses to the list of workqueues. */ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); +static bool workqueue_freezing; /* W: have wqs started freezing? */ + +/* + * The almighty global cpu workqueues. nr_running is the only field + * which is expected to be used frequently by other cpus via + * try_to_wake_up(). Put it in a separate cacheline. + */ +static DEFINE_PER_CPU(struct global_cwq, global_cwq); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); -static int singlethread_cpu __read_mostly; -static const struct cpumask *cpu_singlethread_map __read_mostly; /* - * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD - * flushes cwq->worklist. This means that flush_workqueue/wait_on_work - * which comes in between can't use for_each_online_cpu(). We could - * use cpu_possible_map, the cpumask below is more a documentation - * than optimization. + * Global cpu workqueue and nr_running counter for unbound gcwq. The + * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its + * workers have WORKER_UNBOUND set. */ -static cpumask_var_t cpu_populated_map __read_mostly; +static struct global_cwq unbound_global_cwq; +static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ -/* If it's single threaded, it isn't in the list of workqueues. */ -static inline int is_wq_single_threaded(struct workqueue_struct *wq) +static int worker_thread(void *__worker); + +static struct global_cwq *get_gcwq(unsigned int cpu) { - return wq->singlethread; + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(global_cwq, cpu); + else + return &unbound_global_cwq; } -static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) +static atomic_t *get_gcwq_nr_running(unsigned int cpu) { - return is_wq_single_threaded(wq) - ? cpu_singlethread_map : cpu_populated_map; + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(gcwq_nr_running, cpu); + else + return &unbound_gcwq_nr_running; } -static -struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, + struct workqueue_struct *wq) { - if (unlikely(is_wq_single_threaded(wq))) - cpu = singlethread_cpu; - return per_cpu_ptr(wq->cpu_wq, cpu); + if (!(wq->flags & WQ_UNBOUND)) { + if (likely(cpu < nr_cpu_ids)) { +#ifdef CONFIG_SMP + return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); +#else + return wq->cpu_wq.single; +#endif + } + } else if (likely(cpu == WORK_CPU_UNBOUND)) + return wq->cpu_wq.single; + return NULL; +} + +static unsigned int work_color_to_flags(int color) +{ + return color << WORK_STRUCT_COLOR_SHIFT; +} + +static int get_work_color(struct work_struct *work) +{ + return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + ((1 << WORK_STRUCT_COLOR_BITS) - 1); +} + +static int work_next_color(int color) +{ + return (color + 1) % WORK_NR_COLORS; } /* - * Set the workqueue on which a work item is to be run - * - Must *only* be called if the pending flag is set + * A work's data points to the cwq with WORK_STRUCT_CWQ set while the + * work is on queue. Once execution starts, WORK_STRUCT_CWQ is + * cleared and the work data contains the cpu number it was last on. + * + * set_work_{cwq|cpu}() and clear_work_data() can be used to set the + * cwq, cpu or clear work->data. These functions should only be + * called while the work is owned - ie. while the PENDING bit is set. + * + * get_work_[g]cwq() can be used to obtain the gcwq or cwq + * corresponding to a work. gcwq is available once the work has been + * queued anywhere after initialization. cwq is available only from + * queueing until execution starts. */ -static inline void set_wq_data(struct work_struct *work, - struct cpu_workqueue_struct *cwq) +static inline void set_work_data(struct work_struct *work, unsigned long data, + unsigned long flags) { - unsigned long new; - BUG_ON(!work_pending(work)); + atomic_long_set(&work->data, data | flags | work_static(work)); +} + +static void set_work_cwq(struct work_struct *work, + struct cpu_workqueue_struct *cwq, + unsigned long extra_flags) +{ + set_work_data(work, (unsigned long)cwq, + WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); +} + +static void set_work_cpu(struct work_struct *work, unsigned int cpu) +{ + set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); +} + +static void clear_work_data(struct work_struct *work) +{ + set_work_data(work, WORK_STRUCT_NO_CPU, 0); +} + +static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + + if (data & WORK_STRUCT_CWQ) + return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); + else + return NULL; +} + +static struct global_cwq *get_work_gcwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + unsigned int cpu; + + if (data & WORK_STRUCT_CWQ) + return ((struct cpu_workqueue_struct *) + (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; + + cpu = data >> WORK_STRUCT_FLAG_BITS; + if (cpu == WORK_CPU_NONE) + return NULL; + + BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); + return get_gcwq(cpu); +} + +/* + * Policy functions. These define the policies on how the global + * worker pool is managed. Unless noted otherwise, these functions + * assume that they're being called with gcwq->lock held. + */ + +static bool __need_more_worker(struct global_cwq *gcwq) +{ + return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || + gcwq->flags & GCWQ_HIGHPRI_PENDING; +} + +/* + * Need to wake up a worker? Called from anything but currently + * running workers. + */ +static bool need_more_worker(struct global_cwq *gcwq) +{ + return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); +} + +/* Can I start working? Called from busy but !running workers. */ +static bool may_start_working(struct global_cwq *gcwq) +{ + return gcwq->nr_idle; +} + +/* Do I need to keep working? Called from currently running workers. */ +static bool keep_working(struct global_cwq *gcwq) +{ + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; +} + +/* Do we need a new worker? Called from manager. */ +static bool need_to_create_worker(struct global_cwq *gcwq) +{ + return need_more_worker(gcwq) && !may_start_working(gcwq); +} + +/* Do I need to be the manager? */ +static bool need_to_manage_workers(struct global_cwq *gcwq) +{ + return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; +} + +/* Do we have too many workers and should some go away? */ +static bool too_many_workers(struct global_cwq *gcwq) +{ + bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; + int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ + int nr_busy = gcwq->nr_workers - nr_idle; - new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); - new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); - atomic_long_set(&work->data, new); + return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } /* - * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. + * Wake up functions. */ -static inline void clear_wq_data(struct work_struct *work) + +/* Return the first worker. Safe with preemption disabled */ +static struct worker *first_worker(struct global_cwq *gcwq) { - unsigned long flags = *work_data_bits(work) & - (1UL << WORK_STRUCT_STATIC); - atomic_long_set(&work->data, flags); + if (unlikely(list_empty(&gcwq->idle_list))) + return NULL; + + return list_first_entry(&gcwq->idle_list, struct worker, entry); } -static inline -struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) +/** + * wake_up_worker - wake up an idle worker + * @gcwq: gcwq to wake worker for + * + * Wake up the first idle worker of @gcwq. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void wake_up_worker(struct global_cwq *gcwq) { - return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); + struct worker *worker = first_worker(gcwq); + + if (likely(worker)) + wake_up_process(worker->task); +} + +/** + * wq_worker_waking_up - a worker is waking up + * @task: task waking up + * @cpu: CPU @task is waking up to + * + * This function is called during try_to_wake_up() when a worker is + * being awoken. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +{ + struct worker *worker = kthread_data(task); + + if (likely(!(worker->flags & WORKER_NOT_RUNNING))) + atomic_inc(get_gcwq_nr_running(cpu)); +} + +/** + * wq_worker_sleeping - a worker is going to sleep + * @task: task going to sleep + * @cpu: CPU in question, must be the current CPU number + * + * This function is called during schedule() when a busy worker is + * going to sleep. Worker on the same cpu can be woken up by + * returning pointer to its task. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + * + * RETURNS: + * Worker task on @cpu to wake up, %NULL if none. + */ +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu) +{ + struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct global_cwq *gcwq = get_gcwq(cpu); + atomic_t *nr_running = get_gcwq_nr_running(cpu); + + if (unlikely(worker->flags & WORKER_NOT_RUNNING)) + return NULL; + + /* this can only happen on the local cpu */ + BUG_ON(cpu != raw_smp_processor_id()); + + /* + * The counterpart of the following dec_and_test, implied mb, + * worklist not empty test sequence is in insert_work(). + * Please read comment there. + * + * NOT_RUNNING is clear. This means that trustee is not in + * charge and we're running on the local cpu w/ rq lock held + * and preemption disabled, which in turn means that none else + * could be manipulating idle_list, so dereferencing idle_list + * without gcwq lock is safe. + */ + if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) + to_wakeup = first_worker(gcwq); + return to_wakeup ? to_wakeup->task : NULL; +} + +/** + * worker_set_flags - set worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to set + * @wakeup: wakeup an idle worker if necessary + * + * Set @flags in @worker->flags and adjust nr_running accordingly. If + * nr_running becomes zero and @wakeup is %true, an idle worker is + * woken up. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_set_flags(struct worker *worker, unsigned int flags, + bool wakeup) +{ + struct global_cwq *gcwq = worker->gcwq; + + WARN_ON_ONCE(worker->task != current); + + /* + * If transitioning into NOT_RUNNING, adjust nr_running and + * wake up an idle worker as necessary if requested by + * @wakeup. + */ + if ((flags & WORKER_NOT_RUNNING) && + !(worker->flags & WORKER_NOT_RUNNING)) { + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + if (wakeup) { + if (atomic_dec_and_test(nr_running) && + !list_empty(&gcwq->worklist)) + wake_up_worker(gcwq); + } else + atomic_dec(nr_running); + } + + worker->flags |= flags; } +/** + * worker_clr_flags - clear worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to clear + * + * Clear @flags in @worker->flags and adjust nr_running accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_clr_flags(struct worker *worker, unsigned int flags) +{ + struct global_cwq *gcwq = worker->gcwq; + unsigned int oflags = worker->flags; + + WARN_ON_ONCE(worker->task != current); + + worker->flags &= ~flags; + + /* if transitioning out of NOT_RUNNING, increment nr_running */ + if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) + atomic_inc(get_gcwq_nr_running(gcwq->cpu)); +} + +/** + * busy_worker_head - return the busy hash head for a work + * @gcwq: gcwq of interest + * @work: work to be hashed + * + * Return hash head of @gcwq for @work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to the hash head. + */ +static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, + struct work_struct *work) +{ + const int base_shift = ilog2(sizeof(struct work_struct)); + unsigned long v = (unsigned long)work; + + /* simple shift and fold hash, do we need something better? */ + v >>= base_shift; + v += v >> BUSY_WORKER_HASH_ORDER; + v &= BUSY_WORKER_HASH_MASK; + + return &gcwq->busy_hash[v]; +} + +/** + * __find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @bwh: hash head as returned by busy_worker_head() + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. @bwh should be + * the hash head obtained by calling busy_worker_head() with the same + * work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. + */ +static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, + struct hlist_head *bwh, + struct work_struct *work) +{ + struct worker *worker; + struct hlist_node *tmp; + + hlist_for_each_entry(worker, tmp, bwh, hentry) + if (worker->current_work == work) + return worker; + return NULL; +} + +/** + * find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. This function is + * identical to __find_worker_executing_work() except that this + * function calculates @bwh itself. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. + */ +static struct worker *find_worker_executing_work(struct global_cwq *gcwq, + struct work_struct *work) +{ + return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), + work); +} + +/** + * gcwq_determine_ins_pos - find insertion position + * @gcwq: gcwq of interest + * @cwq: cwq a work is being queued for + * + * A work for @cwq is about to be queued on @gcwq, determine insertion + * position for the work. If @cwq is for HIGHPRI wq, the work is + * queued at the head of the queue but in FIFO order with respect to + * other HIGHPRI works; otherwise, at the end of the queue. This + * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that + * there are HIGHPRI works pending. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to inserstion position. + */ +static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, + struct cpu_workqueue_struct *cwq) +{ + struct work_struct *twork; + + if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) + return &gcwq->worklist; + + list_for_each_entry(twork, &gcwq->worklist, entry) { + struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); + + if (!(tcwq->wq->flags & WQ_HIGHPRI)) + break; + } + + gcwq->flags |= GCWQ_HIGHPRI_PENDING; + return &twork->entry; +} + +/** + * insert_work - insert a work into gcwq + * @cwq: cwq @work belongs to + * @work: work to insert + * @head: insertion point + * @extra_flags: extra WORK_STRUCT_* flags to set + * + * Insert @work which belongs to @cwq into @gcwq after @head. + * @extra_flags is or'd to work_struct flags. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head) + struct work_struct *work, struct list_head *head, + unsigned int extra_flags) { - trace_workqueue_insertion(cwq->thread, work); + struct global_cwq *gcwq = cwq->gcwq; + + /* we own @work, set data and link */ + set_work_cwq(work, cwq, extra_flags); - set_wq_data(work, cwq); /* * Ensure that we get the right work->data if we see the * result of list_add() below, see try_to_grab_pending(). */ smp_wmb(); + list_add_tail(&work->entry, head); - wake_up(&cwq->more_work); + + /* + * Ensure either worker_sched_deactivated() sees the above + * list_add_tail() or we see zero nr_running to avoid workers + * lying around lazily while there are works to be processed. + */ + smp_mb(); + + if (__need_more_worker(gcwq)) + wake_up_worker(gcwq); } -static void __queue_work(struct cpu_workqueue_struct *cwq, +static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { + struct global_cwq *gcwq; + struct cpu_workqueue_struct *cwq; + struct list_head *worklist; unsigned long flags; debug_work_activate(work); - spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, &cwq->worklist); - spin_unlock_irqrestore(&cwq->lock, flags); + + /* determine gcwq to use */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *last_gcwq; + + if (unlikely(cpu == WORK_CPU_UNBOUND)) + cpu = raw_smp_processor_id(); + + /* + * It's multi cpu. If @wq is non-reentrant and @work + * was previously on a different cpu, it might still + * be running there, in which case the work needs to + * be queued on that cpu to guarantee non-reentrance. + */ + gcwq = get_gcwq(cpu); + if (wq->flags & WQ_NON_REENTRANT && + (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { + struct worker *worker; + + spin_lock_irqsave(&last_gcwq->lock, flags); + + worker = find_worker_executing_work(last_gcwq, work); + + if (worker && worker->current_cwq->wq == wq) + gcwq = last_gcwq; + else { + /* meh... not running there, queue here */ + spin_unlock_irqrestore(&last_gcwq->lock, flags); + spin_lock_irqsave(&gcwq->lock, flags); + } + } else + spin_lock_irqsave(&gcwq->lock, flags); + } else { + gcwq = get_gcwq(WORK_CPU_UNBOUND); + spin_lock_irqsave(&gcwq->lock, flags); + } + + /* gcwq determined, get cwq and queue */ + cwq = get_cwq(gcwq->cpu, wq); + + BUG_ON(!list_empty(&work->entry)); + + cwq->nr_in_flight[cwq->work_color]++; + + if (likely(cwq->nr_active < cwq->max_active)) { + cwq->nr_active++; + worklist = gcwq_determine_ins_pos(gcwq, cwq); + } else + worklist = &cwq->delayed_works; + + insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color)); + + spin_unlock_irqrestore(&gcwq->lock, flags); } /** @@ -308,9 +1035,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { int ret = 0; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, cpu), work); + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); ret = 1; } return ret; @@ -320,10 +1046,9 @@ EXPORT_SYMBOL_GPL(queue_work_on); static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); - struct workqueue_struct *wq = cwq->wq; + struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); + __queue_work(smp_processor_id(), cwq->wq, &dwork->work); } /** @@ -360,14 +1085,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + unsigned int lcpu; + BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); timer_stats_timer_set_start_info(&dwork->timer); - /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); + /* + * This stores cwq for the moment, for the timer_fn. + * Note that the work's gcwq is preserved to allow + * reentrance detection for delayed works. + */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *gcwq = get_work_gcwq(work); + + if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) + lcpu = gcwq->cpu; + else + lcpu = raw_smp_processor_id(); + } else + lcpu = WORK_CPU_UNBOUND; + + set_work_cwq(work, get_cwq(lcpu, wq), 0); + timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -382,80 +1124,872 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -static void run_workqueue(struct cpu_workqueue_struct *cwq) +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * spin_lock_irq(gcwq->lock). + */ +static void worker_enter_idle(struct worker *worker) { - spin_lock_irq(&cwq->lock); - while (!list_empty(&cwq->worklist)) { - struct work_struct *work = list_entry(cwq->worklist.next, - struct work_struct, entry); - work_func_t f = work->func; -#ifdef CONFIG_LOCKDEP + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(worker->flags & WORKER_IDLE); + BUG_ON(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev)); + + /* can't use worker_set_flags(), also called from start_worker() */ + worker->flags |= WORKER_IDLE; + gcwq->nr_idle++; + worker->last_active = jiffies; + + /* idle_list is LIFO */ + list_add(&worker->entry, &gcwq->idle_list); + + if (likely(!(worker->flags & WORKER_ROGUE))) { + if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) + mod_timer(&gcwq->idle_timer, + jiffies + IDLE_WORKER_TIMEOUT); + } else + wake_up_all(&gcwq->trustee_wait); + + /* sanity check nr_running */ + WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && + atomic_read(get_gcwq_nr_running(gcwq->cpu))); +} + +/** + * worker_leave_idle - leave idle state + * @worker: worker which is leaving idle state + * + * @worker is leaving idle state. Update stats. + * + * LOCKING: + * spin_lock_irq(gcwq->lock). + */ +static void worker_leave_idle(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(!(worker->flags & WORKER_IDLE)); + worker_clr_flags(worker, WORKER_IDLE); + gcwq->nr_idle--; + list_del_init(&worker->entry); +} + +/** + * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq + * @worker: self + * + * Works which are scheduled while the cpu is online must at least be + * scheduled to a worker which is bound to the cpu so that if they are + * flushed from cpu callbacks while cpu is going down, they are + * guaranteed to execute on the cpu. + * + * This function is to be used by rogue workers and rescuers to bind + * themselves to the target cpu and may race with cpu going down or + * coming online. kthread_bind() can't be used because it may put the + * worker to already dead cpu and set_cpus_allowed_ptr() can't be used + * verbatim as it's best effort and blocking and gcwq may be + * [dis]associated in the meantime. + * + * This function tries set_cpus_allowed() and locks gcwq and verifies + * the binding against GCWQ_DISASSOCIATED which is set during + * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters + * idle state or fetches works without dropping lock, it can guarantee + * the scheduling requirement described in the first paragraph. + * + * CONTEXT: + * Might sleep. Called without any lock but returns with gcwq->lock + * held. + * + * RETURNS: + * %true if the associated gcwq is online (@worker is successfully + * bound), %false if offline. + */ +static bool worker_maybe_bind_and_lock(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + struct task_struct *task = worker->task; + + while (true) { /* - * It is permissible to free the struct work_struct - * from inside the function that is called from it, - * this we need to take into account for lockdep too. - * To avoid bogus "held lock freed" warnings as well - * as problems when looking into work->lockdep_map, - * make a copy and use that here. + * The following call may fail, succeed or succeed + * without actually migrating the task to the cpu if + * it races with cpu hotunplug operation. Verify + * against GCWQ_DISASSOCIATED. */ - struct lockdep_map lockdep_map = work->lockdep_map; -#endif - trace_workqueue_execution(cwq->thread, work); - debug_work_deactivate(work); - cwq->current_work = work; - list_del_init(cwq->worklist.next); - spin_unlock_irq(&cwq->lock); - - BUG_ON(get_wq_data(work) != cwq); - work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_acquire(&lockdep_map); - f(work); - lock_map_release(&lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), - task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); - debug_show_held_locks(current); - dump_stack(); + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) + set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); + + spin_lock_irq(&gcwq->lock); + if (gcwq->flags & GCWQ_DISASSOCIATED) + return false; + if (task_cpu(task) == gcwq->cpu && + cpumask_equal(¤t->cpus_allowed, + get_cpu_mask(gcwq->cpu))) + return true; + spin_unlock_irq(&gcwq->lock); + + /* CPU has come up inbetween, retry migration */ + cpu_relax(); + } +} + +/* + * Function for worker->rebind_work used to rebind rogue busy workers + * to the associated cpu which is coming back online. This is + * scheduled by cpu up but can race with other cpu hotplug operations + * and may be executed twice without intervening cpu down. + */ +static void worker_rebind_fn(struct work_struct *work) +{ + struct worker *worker = container_of(work, struct worker, rebind_work); + struct global_cwq *gcwq = worker->gcwq; + + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_REBIND); + + spin_unlock_irq(&gcwq->lock); +} + +static struct worker *alloc_worker(void) +{ + struct worker *worker; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL); + if (worker) { + INIT_LIST_HEAD(&worker->entry); + INIT_LIST_HEAD(&worker->scheduled); + INIT_WORK(&worker->rebind_work, worker_rebind_fn); + /* on creation a worker is in !idle && prep state */ + worker->flags = WORKER_PREP; + } + return worker; +} + +/** + * create_worker - create a new workqueue worker + * @gcwq: gcwq the new worker will belong to + * @bind: whether to set affinity to @cpu or not + * + * Create a new worker which is bound to @gcwq. The returned worker + * can be started by calling start_worker() or destroyed using + * destroy_worker(). + * + * CONTEXT: + * Might sleep. Does GFP_KERNEL allocations. + * + * RETURNS: + * Pointer to the newly created worker. + */ +static struct worker *create_worker(struct global_cwq *gcwq, bool bind) +{ + bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + struct worker *worker = NULL; + int id = -1; + + spin_lock_irq(&gcwq->lock); + while (ida_get_new(&gcwq->worker_ida, &id)) { + spin_unlock_irq(&gcwq->lock); + if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) + goto fail; + spin_lock_irq(&gcwq->lock); + } + spin_unlock_irq(&gcwq->lock); + + worker = alloc_worker(); + if (!worker) + goto fail; + + worker->gcwq = gcwq; + worker->id = id; + + if (!on_unbound_cpu) + worker->task = kthread_create(worker_thread, worker, + "kworker/%u:%d", gcwq->cpu, id); + else + worker->task = kthread_create(worker_thread, worker, + "kworker/u:%d", id); + if (IS_ERR(worker->task)) + goto fail; + + /* + * A rogue worker will become a regular one if CPU comes + * online later on. Make sure every worker has + * PF_THREAD_BOUND set. + */ + if (bind && !on_unbound_cpu) + kthread_bind(worker->task, gcwq->cpu); + else { + worker->task->flags |= PF_THREAD_BOUND; + if (on_unbound_cpu) + worker->flags |= WORKER_UNBOUND; + } + + return worker; +fail: + if (id >= 0) { + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); + spin_unlock_irq(&gcwq->lock); + } + kfree(worker); + return NULL; +} + +/** + * start_worker - start a newly created worker + * @worker: worker to start + * + * Make the gcwq aware of @worker and start it. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void start_worker(struct worker *worker) +{ + worker->flags |= WORKER_STARTED; + worker->gcwq->nr_workers++; + worker_enter_idle(worker); + wake_up_process(worker->task); +} + +/** + * destroy_worker - destroy a workqueue worker + * @worker: worker to be destroyed + * + * Destroy @worker and adjust @gcwq stats accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which is released and regrabbed. + */ +static void destroy_worker(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + int id = worker->id; + + /* sanity check frenzy */ + BUG_ON(worker->current_work); + BUG_ON(!list_empty(&worker->scheduled)); + + if (worker->flags & WORKER_STARTED) + gcwq->nr_workers--; + if (worker->flags & WORKER_IDLE) + gcwq->nr_idle--; + + list_del_init(&worker->entry); + worker->flags |= WORKER_DIE; + + spin_unlock_irq(&gcwq->lock); + + kthread_stop(worker->task); + kfree(worker); + + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); +} + +static void idle_worker_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + + spin_lock_irq(&gcwq->lock); + + if (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; + + /* idle_list is kept in LIFO order, check the last one */ + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; + + if (time_before(jiffies, expires)) + mod_timer(&gcwq->idle_timer, expires); + else { + /* it's been idle for too long, wake up manager */ + gcwq->flags |= GCWQ_MANAGE_WORKERS; + wake_up_worker(gcwq); + } + } + + spin_unlock_irq(&gcwq->lock); +} + +static bool send_mayday(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct workqueue_struct *wq = cwq->wq; + unsigned int cpu; + + if (!(wq->flags & WQ_RESCUER)) + return false; + + /* mayday mayday mayday */ + cpu = cwq->gcwq->cpu; + /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ + if (cpu == WORK_CPU_UNBOUND) + cpu = 0; + if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) + wake_up_process(wq->rescuer->task); + return true; +} + +static void gcwq_mayday_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + struct work_struct *work; + + spin_lock_irq(&gcwq->lock); + + if (need_to_create_worker(gcwq)) { + /* + * We've been trying to create a new worker but + * haven't been successful. We might be hitting an + * allocation deadlock. Send distress signals to + * rescuers. + */ + list_for_each_entry(work, &gcwq->worklist, entry) + send_mayday(work); + } + + spin_unlock_irq(&gcwq->lock); + + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); +} + +/** + * maybe_create_worker - create a new worker if necessary + * @gcwq: gcwq to create a new worker for + * + * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to + * have at least one idle worker on return from this function. If + * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is + * sent to all rescuers with works scheduled on @gcwq to resolve + * possible allocation deadlock. + * + * On return, need_to_create_worker() is guaranteed to be false and + * may_start_working() true. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. Called only from + * manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_create_worker(struct global_cwq *gcwq) +{ + if (!need_to_create_worker(gcwq)) + return false; +restart: + spin_unlock_irq(&gcwq->lock); + + /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + + while (true) { + struct worker *worker; + + worker = create_worker(gcwq, true); + if (worker) { + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + BUG_ON(need_to_create_worker(gcwq)); + return true; } - spin_lock_irq(&cwq->lock); - cwq->current_work = NULL; + if (!need_to_create_worker(gcwq)) + break; + + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(CREATE_COOLDOWN); + + if (!need_to_create_worker(gcwq)) + break; } - spin_unlock_irq(&cwq->lock); + + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + if (need_to_create_worker(gcwq)) + goto restart; + return true; } -static int worker_thread(void *__cwq) +/** + * maybe_destroy_worker - destroy workers which have been idle for a while + * @gcwq: gcwq to destroy workers for + * + * Destroy @gcwq workers which have been idle for longer than + * IDLE_WORKER_TIMEOUT. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Called only from manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_destroy_workers(struct global_cwq *gcwq) { - struct cpu_workqueue_struct *cwq = __cwq; - DEFINE_WAIT(wait); + bool ret = false; - if (cwq->wq->freezeable) - set_freezable(); + while (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; - for (;;) { - prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); - if (!freezing(current) && - !kthread_should_stop() && - list_empty(&cwq->worklist)) - schedule(); - finish_wait(&cwq->more_work, &wait); + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; - try_to_freeze(); + if (time_before(jiffies, expires)) { + mod_timer(&gcwq->idle_timer, expires); + break; + } - if (kthread_should_stop()) + destroy_worker(worker); + ret = true; + } + + return ret; +} + +/** + * manage_workers - manage worker pool + * @worker: self + * + * Assume the manager role and manage gcwq worker pool @worker belongs + * to. At any given time, there can be only zero or one manager per + * gcwq. The exclusion is handled automatically by this function. + * + * The caller can safely start processing works on false return. On + * true return, it's guaranteed that need_to_create_worker() is false + * and may_start_working() is true. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true if + * some action was taken. + */ +static bool manage_workers(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + bool ret = false; + + if (gcwq->flags & GCWQ_MANAGING_WORKERS) + return ret; + + gcwq->flags &= ~GCWQ_MANAGE_WORKERS; + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + /* + * Destroy and then create so that may_start_working() is true + * on return. + */ + ret |= maybe_destroy_workers(gcwq); + ret |= maybe_create_worker(gcwq); + + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* + * The trustee might be waiting to take over the manager + * position, tell it we're done. + */ + if (unlikely(gcwq->trustee)) + wake_up_all(&gcwq->trustee_wait); + + return ret; +} + +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out paramter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ + struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct, entry); + struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); + + move_linked_works(work, pos, NULL); + cwq->nr_active++; +} - run_workqueue(cwq); +/** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest + * @color: color of work which left the queue + * + * A work either has completed or is removed from pending queue, + * decrement nr_in_flight of its cwq and handle workqueue flushing. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +{ + /* ignore uncolored works */ + if (color == WORK_NO_COLOR) + return; + + cwq->nr_in_flight[color]--; + cwq->nr_active--; + + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); } - return 0; + /* is flush in progress and are we at the flushing tip? */ + if (likely(cwq->flush_color != color)) + return; + + /* are there still in-flight works? */ + if (cwq->nr_in_flight[color]) + return; + + /* this cwq is done, clear flush_color */ + cwq->flush_color = -1; + + /* + * If this was the last cwq, wake up the first flusher. It + * will handle the rest. + */ + if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) + complete(&cwq->wq->first_flusher->done); +} + +/** + * process_one_work - process single work + * @worker: self + * @work: work to process + * + * Process @work. This function contains all the logics necessary to + * process a single work including synchronization against and + * interaction with other workers on the same cpu, queueing and + * flushing. As long as context requirement is met, any worker can + * call this function to process a work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which is released and regrabbed. + */ +static void process_one_work(struct worker *worker, struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct global_cwq *gcwq = cwq->gcwq; + struct hlist_head *bwh = busy_worker_head(gcwq, work); + bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; + work_func_t f = work->func; + int work_color; + struct worker *collision; +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the struct work_struct from + * inside the function that is called from it, this we need to + * take into account for lockdep too. To avoid bogus "held + * lock freed" warnings as well as problems when looking into + * work->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map = work->lockdep_map; +#endif + /* + * A single work shouldn't be executed concurrently by + * multiple workers on a single cpu. Check whether anyone is + * already processing the work. If so, defer the work to the + * currently executing one. + */ + collision = __find_worker_executing_work(gcwq, bwh, work); + if (unlikely(collision)) { + move_linked_works(work, &collision->scheduled, NULL); + return; + } + + /* claim and process */ + debug_work_deactivate(work); + hlist_add_head(&worker->hentry, bwh); + worker->current_work = work; + worker->current_cwq = cwq; + work_color = get_work_color(work); + + /* record the current cpu number in the work data and dequeue */ + set_work_cpu(work, gcwq->cpu); + list_del_init(&work->entry); + + /* + * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, + * wake up another worker; otherwise, clear HIGHPRI_PENDING. + */ + if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { + struct work_struct *nwork = list_first_entry(&gcwq->worklist, + struct work_struct, entry); + + if (!list_empty(&gcwq->worklist) && + get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) + wake_up_worker(gcwq); + else + gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; + } + + /* + * CPU intensive works don't participate in concurrency + * management. They're the scheduler's responsibility. + */ + if (unlikely(cpu_intensive)) + worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + + spin_unlock_irq(&gcwq->lock); + + work_clear_pending(work); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); + f(work); + lock_map_release(&lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), task_pid_nr(current)); + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)f); + debug_show_held_locks(current); + dump_stack(); + } + + spin_lock_irq(&gcwq->lock); + + /* clear cpu intensive status */ + if (unlikely(cpu_intensive)) + worker_clr_flags(worker, WORKER_CPU_INTENSIVE); + + /* we're done with it, release */ + hlist_del_init(&worker->hentry); + worker->current_work = NULL; + worker->current_cwq = NULL; + cwq_dec_nr_in_flight(cwq, work_color); +} + +/** + * process_scheduled_works - process scheduled works + * @worker: self + * + * Process all scheduled works. Please note that the scheduled list + * may change while processing a work, so this function repeatedly + * fetches a work from the top and executes it. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. + */ +static void process_scheduled_works(struct worker *worker) +{ + while (!list_empty(&worker->scheduled)) { + struct work_struct *work = list_first_entry(&worker->scheduled, + struct work_struct, entry); + process_one_work(worker, work); + } +} + +/** + * worker_thread - the worker thread function + * @__worker: self + * + * The gcwq worker thread function. There's a single dynamic pool of + * these per each cpu. These workers process all works regardless of + * their specific target workqueue. The only exception is works which + * belong to workqueues with a rescuer which will be explained in + * rescuer_thread(). + */ +static int worker_thread(void *__worker) +{ + struct worker *worker = __worker; + struct global_cwq *gcwq = worker->gcwq; + + /* tell the scheduler that this is a workqueue worker */ + worker->task->flags |= PF_WQ_WORKER; +woke_up: + spin_lock_irq(&gcwq->lock); + + /* DIE can be set only while we're idle, checking here is enough */ + if (worker->flags & WORKER_DIE) { + spin_unlock_irq(&gcwq->lock); + worker->task->flags &= ~PF_WQ_WORKER; + return 0; + } + + worker_leave_idle(worker); +recheck: + /* no more worker necessary? */ + if (!need_more_worker(gcwq)) + goto sleep; + + /* do we need to manage? */ + if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) + goto recheck; + + /* + * ->scheduled list can only be filled while a worker is + * preparing to process a work or actually processing it. + * Make sure nobody diddled with it while I was sleeping. + */ + BUG_ON(!list_empty(&worker->scheduled)); + + /* + * When control reaches this point, we're guaranteed to have + * at least one idle worker or that someone else has already + * assumed the manager role. + */ + worker_clr_flags(worker, WORKER_PREP); + + do { + struct work_struct *work = + list_first_entry(&gcwq->worklist, + struct work_struct, entry); + + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { + /* optimization path, not strictly necessary */ + process_one_work(worker, work); + if (unlikely(!list_empty(&worker->scheduled))) + process_scheduled_works(worker); + } else { + move_linked_works(work, &worker->scheduled, NULL); + process_scheduled_works(worker); + } + } while (keep_working(gcwq)); + + worker_set_flags(worker, WORKER_PREP, false); +sleep: + if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) + goto recheck; + + /* + * gcwq->lock is held and there's no work to process and no + * need to manage, sleep. Workers are woken up only while + * holding gcwq->lock or from local cpu, so setting the + * current state before releasing gcwq->lock is enough to + * prevent losing any event. + */ + worker_enter_idle(worker); + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&gcwq->lock); + schedule(); + goto woke_up; +} + +/** + * rescuer_thread - the rescuer thread function + * @__wq: the associated workqueue + * + * Workqueue rescuer thread function. There's one rescuer for each + * workqueue which has WQ_RESCUER set. + * + * Regular work processing on a gcwq may block trying to create a new + * worker which uses GFP_KERNEL allocation which has slight chance of + * developing into deadlock if some works currently on the same queue + * need to be processed to satisfy the GFP_KERNEL allocation. This is + * the problem rescuer solves. + * + * When such condition is possible, the gcwq summons rescuers of all + * workqueues which have works queued on the gcwq and let them process + * those works so that forward progress can be guaranteed. + * + * This should happen rarely. + */ +static int rescuer_thread(void *__wq) +{ + struct workqueue_struct *wq = __wq; + struct worker *rescuer = wq->rescuer; + struct list_head *scheduled = &rescuer->scheduled; + bool is_unbound = wq->flags & WQ_UNBOUND; + unsigned int cpu; + + set_user_nice(current, RESCUER_NICE_LEVEL); +repeat: + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + /* + * See whether any cpu is asking for help. Unbounded + * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. + */ + for_each_mayday_cpu(cpu, wq->mayday_mask) { + unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; + struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); + struct global_cwq *gcwq = cwq->gcwq; + struct work_struct *work, *n; + + __set_current_state(TASK_RUNNING); + mayday_clear_cpu(cpu, wq->mayday_mask); + + /* migrate to the target cpu if possible */ + rescuer->gcwq = gcwq; + worker_maybe_bind_and_lock(rescuer); + + /* + * Slurp in all works issued via this workqueue and + * process'em. + */ + BUG_ON(!list_empty(&rescuer->scheduled)); + list_for_each_entry_safe(work, n, &gcwq->worklist, entry) + if (get_work_cwq(work) == cwq) + move_linked_works(work, scheduled, &n); + + process_scheduled_works(rescuer); + spin_unlock_irq(&gcwq->lock); + } + + schedule(); + goto repeat; } struct wq_barrier { @@ -469,44 +2003,137 @@ static void wq_barrier_func(struct work_struct *work) complete(&barr->done); } +/** + * insert_wq_barrier - insert a barrier work + * @cwq: cwq to insert barrier into + * @barr: wq_barrier to insert + * @target: target work to attach @barr to + * @worker: worker currently executing @target, NULL if @target is not executing + * + * @barr is linked to @target such that @barr is completed only after + * @target finishes execution. Please note that the ordering + * guarantee is observed only with respect to @target and on the local + * cpu. + * + * Currently, a queued barrier can't be canceled. This is because + * try_to_grab_pending() can't determine whether the work to be + * grabbed is at the head of the queue and thus can't clear LINKED + * flag of the previous work while there must be a valid next work + * after a work with LINKED flag set. + * + * Note that when @worker is non-NULL, @target may be modified + * underneath us, so we can't reliably determine cwq from @target. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, struct list_head *head) + struct wq_barrier *barr, + struct work_struct *target, struct worker *worker) { + struct list_head *head; + unsigned int linked = 0; + /* - * debugobject calls are safe here even with cwq->lock locked + * debugobject calls are safe here even with gcwq->lock locked * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. */ INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); - __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); - + __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); + /* + * If @target is currently being executed, schedule the + * barrier to the worker; otherwise, put it after @target. + */ + if (worker) + head = worker->scheduled.next; + else { + unsigned long *bits = work_data_bits(target); + + head = target->entry.next; + /* there can already be other linked works, inherit and set */ + linked = *bits & WORK_STRUCT_LINKED; + __set_bit(WORK_STRUCT_LINKED_BIT, bits); + } + debug_work_activate(&barr->work); - insert_work(cwq, &barr->work, head); + insert_work(cwq, &barr->work, head, + work_color_to_flags(WORK_NO_COLOR) | linked); } -static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) +/** + * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing + * @wq: workqueue being flushed + * @flush_color: new flush color, < 0 for no-op + * @work_color: new work color, < 0 for no-op + * + * Prepare cwqs for workqueue flushing. + * + * If @flush_color is non-negative, flush_color on all cwqs should be + * -1. If no cwq has in-flight commands at the specified color, all + * cwq->flush_color's stay at -1 and %false is returned. If any cwq + * has in flight commands, its cwq->flush_color is set to + * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq + * wakeup logic is armed and %true is returned. + * + * The caller should have initialized @wq->first_flusher prior to + * calling this function with non-negative @flush_color. If + * @flush_color is negative, no flush color update is done and %false + * is returned. + * + * If @work_color is non-negative, all cwqs should have the same + * work_color which is previous to @work_color and all will be + * advanced to @work_color. + * + * CONTEXT: + * mutex_lock(wq->flush_mutex). + * + * RETURNS: + * %true if @flush_color >= 0 and there's something to flush. %false + * otherwise. + */ +static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, + int flush_color, int work_color) { - int active = 0; - struct wq_barrier barr; + bool wait = false; + unsigned int cpu; - WARN_ON(cwq->thread == current); - - spin_lock_irq(&cwq->lock); - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, &cwq->worklist); - active = 1; + if (flush_color >= 0) { + BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); + atomic_set(&wq->nr_cwqs_to_flush, 1); } - spin_unlock_irq(&cwq->lock); - if (active) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = cwq->gcwq; + + spin_lock_irq(&gcwq->lock); + + if (flush_color >= 0) { + BUG_ON(cwq->flush_color != -1); + + if (cwq->nr_in_flight[flush_color]) { + cwq->flush_color = flush_color; + atomic_inc(&wq->nr_cwqs_to_flush); + wait = true; + } + } + + if (work_color >= 0) { + BUG_ON(work_color != work_next_color(cwq->work_color)); + cwq->work_color = work_color; + } + + spin_unlock_irq(&gcwq->lock); } - return active; + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) + complete(&wq->first_flusher->done); + + return wait; } /** @@ -518,20 +2145,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * * We sleep until all works which were queued on entry have been handled, * but we are not livelocked by new incoming ones. - * - * This function used to run the workqueues itself. Now we just wait for the - * helper threads to do it. */ void flush_workqueue(struct workqueue_struct *wq) { - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; + struct wq_flusher this_flusher = { + .list = LIST_HEAD_INIT(this_flusher.list), + .flush_color = -1, + .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), + }; + int next_color; - might_sleep(); lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - for_each_cpu(cpu, cpu_map) - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); + + mutex_lock(&wq->flush_mutex); + + /* + * Start-to-wait phase + */ + next_color = work_next_color(wq->work_color); + + if (next_color != wq->flush_color) { + /* + * Color space is not full. The current work_color + * becomes our flush_color and work_color is advanced + * by one. + */ + BUG_ON(!list_empty(&wq->flusher_overflow)); + this_flusher.flush_color = wq->work_color; + wq->work_color = next_color; + + if (!wq->first_flusher) { + /* no flush in progress, become the first flusher */ + BUG_ON(wq->flush_color != this_flusher.flush_color); + + wq->first_flusher = &this_flusher; + + if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, + wq->work_color)) { + /* nothing to flush, done */ + wq->flush_color = next_color; + wq->first_flusher = NULL; + goto out_unlock; + } + } else { + /* wait in queue */ + BUG_ON(wq->flush_color == this_flusher.flush_color); + list_add_tail(&this_flusher.list, &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + } else { + /* + * Oops, color space is full, wait on overflow queue. + * The next flush completion will assign us + * flush_color and transfer to flusher_queue. + */ + list_add_tail(&this_flusher.list, &wq->flusher_overflow); + } + + mutex_unlock(&wq->flush_mutex); + + wait_for_completion(&this_flusher.done); + + /* + * Wake-up-and-cascade phase + * + * First flushers are responsible for cascading flushes and + * handling overflow. Non-first flushers can simply return. + */ + if (wq->first_flusher != &this_flusher) + return; + + mutex_lock(&wq->flush_mutex); + + /* we might have raced, check again with mutex held */ + if (wq->first_flusher != &this_flusher) + goto out_unlock; + + wq->first_flusher = NULL; + + BUG_ON(!list_empty(&this_flusher.list)); + BUG_ON(wq->flush_color != this_flusher.flush_color); + + while (true) { + struct wq_flusher *next, *tmp; + + /* complete all the flushers sharing the current flush color */ + list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { + if (next->flush_color != wq->flush_color) + break; + list_del_init(&next->list); + complete(&next->done); + } + + BUG_ON(!list_empty(&wq->flusher_overflow) && + wq->flush_color != work_next_color(wq->work_color)); + + /* this flush_color is finished, advance by one */ + wq->flush_color = work_next_color(wq->flush_color); + + /* one color has been freed, handle overflow queue */ + if (!list_empty(&wq->flusher_overflow)) { + /* + * Assign the same color to all overflowed + * flushers, advance work_color and append to + * flusher_queue. This is the start-to-wait + * phase for these overflowed flushers. + */ + list_for_each_entry(tmp, &wq->flusher_overflow, list) + tmp->flush_color = wq->work_color; + + wq->work_color = work_next_color(wq->work_color); + + list_splice_tail_init(&wq->flusher_overflow, + &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + + if (list_empty(&wq->flusher_queue)) { + BUG_ON(wq->flush_color != wq->work_color); + break; + } + + /* + * Need to flush more colors. Make the next flusher + * the new first flusher and arm cwqs. + */ + BUG_ON(wq->flush_color == wq->work_color); + BUG_ON(wq->flush_color != next->flush_color); + + list_del_init(&next->list); + wq->first_flusher = next; + + if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) + break; + + /* + * Meh... this color is already done, clear first + * flusher and repeat cascading. + */ + wq->first_flusher = NULL; + } + +out_unlock: + mutex_unlock(&wq->flush_mutex); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -547,43 +2304,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue); */ int flush_work(struct work_struct *work) { + struct worker *worker = NULL; + struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; - struct list_head *prev; struct wq_barrier barr; might_sleep(); - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return 0; - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - prev = NULL; - spin_lock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued under us we are not going to wait. + * If it was re-queued to a different gcwq under us, we + * are not going to wait. */ smp_rmb(); - if (unlikely(cwq != get_wq_data(work))) - goto out; - prev = &work->entry; + cwq = get_work_cwq(work); + if (unlikely(!cwq || gcwq != cwq->gcwq)) + goto already_gone; } else { - if (cwq->current_work != work) - goto out; - prev = &cwq->worklist; + worker = find_worker_executing_work(gcwq, work); + if (!worker) + goto already_gone; + cwq = worker->current_cwq; } - insert_wq_barrier(cwq, &barr, prev->next); -out: - spin_unlock_irq(&cwq->lock); - if (!prev) - return 0; + + insert_wq_barrier(cwq, &barr, work, worker); + spin_unlock_irq(&gcwq->lock); + + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); return 1; +already_gone: + spin_unlock_irq(&gcwq->lock); + return 0; } EXPORT_SYMBOL_GPL(flush_work); @@ -593,54 +2353,55 @@ EXPORT_SYMBOL_GPL(flush_work); */ static int try_to_grab_pending(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; + struct global_cwq *gcwq; int ret = -1; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ - - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return ret; - spin_lock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* - * This work is queued, but perhaps we locked the wrong cwq. + * This work is queued, but perhaps we locked the wrong gcwq. * In that case we must see the new value after rmb(), see * insert_work()->wmb(). */ smp_rmb(); - if (cwq == get_wq_data(work)) { + if (gcwq == get_work_gcwq(work)) { debug_work_deactivate(work); list_del_init(&work->entry); + cwq_dec_nr_in_flight(get_work_cwq(work), + get_work_color(work)); ret = 1; } } - spin_unlock_irq(&cwq->lock); + spin_unlock_irq(&gcwq->lock); return ret; } -static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) +static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) { struct wq_barrier barr; - int running = 0; + struct worker *worker; - spin_lock_irq(&cwq->lock); - if (unlikely(cwq->current_work == work)) { - insert_wq_barrier(cwq, &barr, cwq->worklist.next); - running = 1; - } - spin_unlock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); + + worker = find_worker_executing_work(gcwq, work); + if (unlikely(worker)) + insert_wq_barrier(worker->current_cwq, &barr, work, worker); - if (unlikely(running)) { + spin_unlock_irq(&gcwq->lock); + + if (unlikely(worker)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); } @@ -648,9 +2409,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, static void wait_on_work(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - const struct cpumask *cpu_map; int cpu; might_sleep(); @@ -658,15 +2416,8 @@ static void wait_on_work(struct work_struct *work) lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - cwq = get_wq_data(work); - if (!cwq) - return; - - wq = cwq->wq; - cpu_map = wq_cpu_map(wq); - - for_each_cpu(cpu, cpu_map) - wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + for_each_gcwq_cpu(cpu) + wait_on_cpu_work(get_gcwq(cpu), work); } static int __cancel_work_timer(struct work_struct *work, @@ -681,7 +2432,7 @@ static int __cancel_work_timer(struct work_struct *work, wait_on_work(work); } while (unlikely(ret < 0)); - clear_wq_data(work); + clear_work_data(work); return ret; } @@ -727,8 +2478,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); -static struct workqueue_struct *keventd_wq __read_mostly; - /** * schedule_work - put work task in global workqueue * @work: job to be done @@ -742,7 +2491,7 @@ static struct workqueue_struct *keventd_wq __read_mostly; */ int schedule_work(struct work_struct *work) { - return queue_work(keventd_wq, work); + return queue_work(system_wq, work); } EXPORT_SYMBOL(schedule_work); @@ -755,7 +2504,7 @@ EXPORT_SYMBOL(schedule_work); */ int schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, keventd_wq, work); + return queue_work_on(cpu, system_wq, work); } EXPORT_SYMBOL(schedule_work_on); @@ -770,7 +2519,7 @@ EXPORT_SYMBOL(schedule_work_on); int schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(keventd_wq, dwork, delay); + return queue_delayed_work(system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); @@ -783,9 +2532,8 @@ EXPORT_SYMBOL(schedule_delayed_work); void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { - struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); - __queue_work(cwq, &dwork->work); + __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, + &dwork->work); put_cpu(); } flush_work(&dwork->work); @@ -804,7 +2552,7 @@ EXPORT_SYMBOL(flush_delayed_work); int schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); @@ -820,7 +2568,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; - int orig = -1; struct work_struct *works; works = alloc_percpu(struct work_struct); @@ -829,23 +2576,12 @@ int schedule_on_each_cpu(work_func_t func) get_online_cpus(); - /* - * When running in keventd don't schedule a work item on - * itself. Can just call directly because the work queue is - * already bound. This also is faster. - */ - if (current_is_keventd()) - orig = raw_smp_processor_id(); - for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); - if (cpu != orig) - schedule_work_on(cpu, work); + schedule_work_on(cpu, work); } - if (orig >= 0) - func(per_cpu_ptr(works, orig)); for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); @@ -881,7 +2617,7 @@ int schedule_on_each_cpu(work_func_t func) */ void flush_scheduled_work(void) { - flush_workqueue(keventd_wq); + flush_workqueue(system_wq); } EXPORT_SYMBOL(flush_scheduled_work); @@ -913,170 +2649,170 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); int keventd_up(void) { - return keventd_wq != NULL; + return system_wq != NULL; } -int current_is_keventd(void) +static int alloc_cwqs(struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq; - int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ - int ret = 0; - - BUG_ON(!keventd_wq); + /* + * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. + * Make sure that the alignment isn't lower than that of + * unsigned long long. + */ + const size_t size = sizeof(struct cpu_workqueue_struct); + const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, + __alignof__(unsigned long long)); +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif - cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); - if (current == cwq->thread) - ret = 1; + if (percpu) + wq->cpu_wq.pcpu = __alloc_percpu(size, align); + else { + void *ptr; - return ret; + /* + * Allocate enough room to align cwq and put an extra + * pointer at the end pointing back to the originally + * allocated pointer which will be used for free. + */ + ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); + if (ptr) { + wq->cpu_wq.single = PTR_ALIGN(ptr, align); + *(void **)(wq->cpu_wq.single + 1) = ptr; + } + } + /* just in case, make sure it's actually aligned */ + BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); + return wq->cpu_wq.v ? 0 : -ENOMEM; } -static struct cpu_workqueue_struct * -init_cpu_workqueue(struct workqueue_struct *wq, int cpu) +static void free_cwqs(struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - cwq->wq = wq; - spin_lock_init(&cwq->lock); - INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif - return cwq; + if (percpu) + free_percpu(wq->cpu_wq.pcpu); + else if (wq->cpu_wq.single) { + /* the pointer to free is stored right after the cwq */ + kfree(*(void **)(wq->cpu_wq.single + 1)); + } } -static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +static int wq_clamp_max_active(int max_active, unsigned int flags, + const char *name) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - struct workqueue_struct *wq = cwq->wq; - const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; - struct task_struct *p; + int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; - p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); - /* - * Nobody can add the work_struct to this cwq, - * if (caller is __create_workqueue) - * nobody should see this wq - * else // caller is CPU_UP_PREPARE - * cpu is not on cpu_online_map - * so we can abort safely. - */ - if (IS_ERR(p)) - return PTR_ERR(p); - if (cwq->wq->rt) - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - cwq->thread = p; + if (max_active < 1 || max_active > lim) + printk(KERN_WARNING "workqueue: max_active %d requested for %s " + "is out of range, clamping between %d and %d\n", + max_active, name, 1, lim); - trace_workqueue_creation(cwq->thread, cpu); - - return 0; + return clamp_val(max_active, 1, lim); } -static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +struct workqueue_struct *__alloc_workqueue_key(const char *name, + unsigned int flags, + int max_active, + struct lock_class_key *key, + const char *lock_name) { - struct task_struct *p = cwq->thread; + struct workqueue_struct *wq; + unsigned int cpu; - if (p != NULL) { - if (cpu >= 0) - kthread_bind(p, cpu); - wake_up_process(p); - } -} + /* + * Unbound workqueues aren't concurrency managed and should be + * dispatched to workers immediately. + */ + if (flags & WQ_UNBOUND) + flags |= WQ_HIGHPRI; -struct workqueue_struct *__create_workqueue_key(const char *name, - int singlethread, - int freezeable, - int rt, - struct lock_class_key *key, - const char *lock_name) -{ - struct workqueue_struct *wq; - struct cpu_workqueue_struct *cwq; - int err = 0, cpu; + max_active = max_active ?: WQ_DFL_ACTIVE; + max_active = wq_clamp_max_active(max_active, flags, name); wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) - return NULL; + goto err; - wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } + wq->flags = flags; + wq->saved_max_active = max_active; + mutex_init(&wq->flush_mutex); + atomic_set(&wq->nr_cwqs_to_flush, 0); + INIT_LIST_HEAD(&wq->flusher_queue); + INIT_LIST_HEAD(&wq->flusher_overflow); wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); - wq->singlethread = singlethread; - wq->freezeable = freezeable; - wq->rt = rt; INIT_LIST_HEAD(&wq->list); - if (singlethread) { - cwq = init_cpu_workqueue(wq, singlethread_cpu); - err = create_workqueue_thread(cwq, singlethread_cpu); - start_workqueue_thread(cwq, -1); - } else { - cpu_maps_update_begin(); - /* - * We must place this wq on list even if the code below fails. - * cpu_down(cpu) can remove cpu from cpu_populated_map before - * destroy_workqueue() takes the lock, in that case we leak - * cwq[cpu]->thread. - */ - spin_lock(&workqueue_lock); - list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); - /* - * We must initialize cwqs for each possible cpu even if we - * are going to call destroy_workqueue() finally. Otherwise - * cpu_up() can hit the uninitialized cwq once we drop the - * lock. - */ - for_each_possible_cpu(cpu) { - cwq = init_cpu_workqueue(wq, cpu); - if (err || !cpu_online(cpu)) - continue; - err = create_workqueue_thread(cwq, cpu); - start_workqueue_thread(cwq, cpu); - } - cpu_maps_update_done(); + if (alloc_cwqs(wq) < 0) + goto err; + + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = get_gcwq(cpu); + + BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); + cwq->gcwq = gcwq; + cwq->wq = wq; + cwq->flush_color = -1; + cwq->max_active = max_active; + INIT_LIST_HEAD(&cwq->delayed_works); } - if (err) { - destroy_workqueue(wq); - wq = NULL; + if (flags & WQ_RESCUER) { + struct worker *rescuer; + + if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) + goto err; + + wq->rescuer = rescuer = alloc_worker(); + if (!rescuer) + goto err; + + rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + if (IS_ERR(rescuer->task)) + goto err; + + wq->rescuer = rescuer; + rescuer->task->flags |= PF_THREAD_BOUND; + wake_up_process(rescuer->task); } - return wq; -} -EXPORT_SYMBOL_GPL(__create_workqueue_key); -static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) -{ /* - * Our caller is either destroy_workqueue() or CPU_POST_DEAD, - * cpu_add_remove_lock protects cwq->thread. + * workqueue_lock protects global freeze state and workqueues + * list. Grab it, set max_active accordingly and add the new + * workqueue to workqueues list. */ - if (cwq->thread == NULL) - return; + spin_lock(&workqueue_lock); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); + if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) + for_each_cwq_cpu(cpu, wq) + get_cwq(cpu, wq)->max_active = 0; - flush_cpu_workqueue(cwq); - /* - * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, - * a concurrent flush_workqueue() can insert a barrier after us. - * However, in that case run_workqueue() won't return and check - * kthread_should_stop() until it flushes all work_struct's. - * When ->worklist becomes empty it is safe to exit because no - * more work_structs can be queued on this cwq: flush_workqueue - * checks list_empty(), and a "normal" queue_work() can't use - * a dead CPU. - */ - trace_workqueue_destruction(cwq->thread); - kthread_stop(cwq->thread); - cwq->thread = NULL; + list_add(&wq->list, &workqueues); + + spin_unlock(&workqueue_lock); + + return wq; +err: + if (wq) { + free_cwqs(wq); + free_mayday_mask(wq->mayday_mask); + kfree(wq->rescuer); + kfree(wq); + } + return NULL; } +EXPORT_SYMBOL_GPL(__alloc_workqueue_key); /** * destroy_workqueue - safely terminate a workqueue @@ -1086,72 +2822,516 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) */ void destroy_workqueue(struct workqueue_struct *wq) { - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; + unsigned int cpu; - cpu_maps_update_begin(); + flush_workqueue(wq); + + /* + * wq list is used to freeze wq, remove from list after + * flushing is complete in case freeze races us. + */ spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); - for_each_cpu(cpu, cpu_map) - cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); - cpu_maps_update_done(); + /* sanity check */ + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) + BUG_ON(cwq->nr_in_flight[i]); + BUG_ON(cwq->nr_active); + BUG_ON(!list_empty(&cwq->delayed_works)); + } + + if (wq->flags & WQ_RESCUER) { + kthread_stop(wq->rescuer->task); + free_mayday_mask(wq->mayday_mask); + } - free_percpu(wq->cpu_wq); + free_cwqs(wq); kfree(wq); } EXPORT_SYMBOL_GPL(destroy_workqueue); +/** + * workqueue_set_max_active - adjust max_active of a workqueue + * @wq: target workqueue + * @max_active: new max_active value. + * + * Set max_active of @wq to @max_active. + * + * CONTEXT: + * Don't call from IRQ context. + */ +void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) +{ + unsigned int cpu; + + max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); + + spin_lock(&workqueue_lock); + + wq->saved_max_active = max_active; + + for_each_cwq_cpu(cpu, wq) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_irq(&gcwq->lock); + + if (!(wq->flags & WQ_FREEZEABLE) || + !(gcwq->flags & GCWQ_FREEZING)) + get_cwq(gcwq->cpu, wq)->max_active = max_active; + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} +EXPORT_SYMBOL_GPL(workqueue_set_max_active); + +/** + * workqueue_congested - test whether a workqueue is congested + * @cpu: CPU in question + * @wq: target workqueue + * + * Test whether @wq's cpu workqueue for @cpu is congested. There is + * no synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * + * RETURNS: + * %true if congested, %false otherwise. + */ +bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +{ + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + return !list_empty(&cwq->delayed_works); +} +EXPORT_SYMBOL_GPL(workqueue_congested); + +/** + * work_cpu - return the last known associated cpu for @work + * @work: the work of interest + * + * RETURNS: + * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. + */ +unsigned int work_cpu(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + + return gcwq ? gcwq->cpu : WORK_CPU_NONE; +} +EXPORT_SYMBOL_GPL(work_cpu); + +/** + * work_busy - test whether a work is currently pending or running + * @work: the work to be tested + * + * Test whether @work is currently pending or running. There is no + * synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * Especially for reentrant wqs, the pending state might hide the + * running state. + * + * RETURNS: + * OR'd bitmask of WORK_BUSY_* bits. + */ +unsigned int work_busy(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned long flags; + unsigned int ret = 0; + + if (!gcwq) + return false; + + spin_lock_irqsave(&gcwq->lock, flags); + + if (work_pending(work)) + ret |= WORK_BUSY_PENDING; + if (find_worker_executing_work(gcwq, work)) + ret |= WORK_BUSY_RUNNING; + + spin_unlock_irqrestore(&gcwq->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(work_busy); + +/* + * CPU hotplug. + * + * There are two challenges in supporting CPU hotplug. Firstly, there + * are a lot of assumptions on strong associations among work, cwq and + * gcwq which make migrating pending and scheduled works very + * difficult to implement without impacting hot paths. Secondly, + * gcwqs serve mix of short, long and very long running works making + * blocked draining impractical. + * + * This is solved by allowing a gcwq to be detached from CPU, running + * it with unbound (rogue) workers and allowing it to be reattached + * later if the cpu comes back online. A separate thread is created + * to govern a gcwq in such state and is called the trustee of the + * gcwq. + * + * Trustee states and their descriptions. + * + * START Command state used on startup. On CPU_DOWN_PREPARE, a + * new trustee is started with this state. + * + * IN_CHARGE Once started, trustee will enter this state after + * assuming the manager role and making all existing + * workers rogue. DOWN_PREPARE waits for trustee to + * enter this state. After reaching IN_CHARGE, trustee + * tries to execute the pending worklist until it's empty + * and the state is set to BUTCHER, or the state is set + * to RELEASE. + * + * BUTCHER Command state which is set by the cpu callback after + * the cpu has went down. Once this state is set trustee + * knows that there will be no new works on the worklist + * and once the worklist is empty it can proceed to + * killing idle workers. + * + * RELEASE Command state which is set by the cpu callback if the + * cpu down has been canceled or it has come online + * again. After recognizing this state, trustee stops + * trying to drain or butcher and clears ROGUE, rebinds + * all remaining workers back to the cpu and releases + * manager role. + * + * DONE Trustee will enter this state after BUTCHER or RELEASE + * is complete. + * + * trustee CPU draining + * took over down complete + * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE + * | | ^ + * | CPU is back online v return workers | + * ----------------> RELEASE -------------- + */ + +/** + * trustee_wait_event_timeout - timed event wait for trustee + * @cond: condition to wait for + * @timeout: timeout in jiffies + * + * wait_event_timeout() for trustee to use. Handles locking and + * checks for RELEASE request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * Positive indicating left time if @cond is satisfied, 0 if timed + * out, -1 if canceled. + */ +#define trustee_wait_event_timeout(cond, timeout) ({ \ + long __ret = (timeout); \ + while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ + __ret) { \ + spin_unlock_irq(&gcwq->lock); \ + __wait_event_timeout(gcwq->trustee_wait, (cond) || \ + (gcwq->trustee_state == TRUSTEE_RELEASE), \ + __ret); \ + spin_lock_irq(&gcwq->lock); \ + } \ + gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ +}) + +/** + * trustee_wait_event - event wait for trustee + * @cond: condition to wait for + * + * wait_event() for trustee to use. Automatically handles locking and + * checks for CANCEL request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * 0 if @cond is satisfied, -1 if canceled. + */ +#define trustee_wait_event(cond) ({ \ + long __ret1; \ + __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ + __ret1 < 0 ? -1 : 0; \ +}) + +static int __cpuinit trustee_thread(void *__gcwq) +{ + struct global_cwq *gcwq = __gcwq; + struct worker *worker; + struct work_struct *work; + struct hlist_node *pos; + long rc; + int i; + + BUG_ON(gcwq->cpu != smp_processor_id()); + + spin_lock_irq(&gcwq->lock); + /* + * Claim the manager position and make all workers rogue. + * Trustee must be bound to the target cpu and can't be + * cancelled. + */ + BUG_ON(gcwq->cpu != smp_processor_id()); + rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); + BUG_ON(rc < 0); + + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + list_for_each_entry(worker, &gcwq->idle_list, entry) + worker->flags |= WORKER_ROGUE; + + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_ROGUE; + + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the rogue flag. This is + * necessary as scheduler callbacks may be invoked from other + * cpus. + */ + spin_unlock_irq(&gcwq->lock); + schedule(); + spin_lock_irq(&gcwq->lock); + + /* + * Sched callbacks are disabled now. Zap nr_running. After + * this, nr_running stays zero and need_more_worker() and + * keep_working() are always true as long as the worklist is + * not empty. + */ + atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); + + spin_unlock_irq(&gcwq->lock); + del_timer_sync(&gcwq->idle_timer); + spin_lock_irq(&gcwq->lock); + + /* + * We're now in charge. Notify and proceed to drain. We need + * to keep the gcwq running during the whole CPU down + * procedure as other cpu hotunplug callbacks may need to + * flush currently running tasks. + */ + gcwq->trustee_state = TRUSTEE_IN_CHARGE; + wake_up_all(&gcwq->trustee_wait); + + /* + * The original cpu is in the process of dying and may go away + * anytime now. When that happens, we and all workers would + * be migrated to other cpus. Try draining any left work. We + * want to get it over with ASAP - spam rescuers, wake up as + * many idlers as necessary and create new ones till the + * worklist is empty. Note that if the gcwq is frozen, there + * may be frozen works in freezeable cwqs. Don't declare + * completion while frozen. + */ + while (gcwq->nr_workers != gcwq->nr_idle || + gcwq->flags & GCWQ_FREEZING || + gcwq->trustee_state == TRUSTEE_IN_CHARGE) { + int nr_works = 0; + + list_for_each_entry(work, &gcwq->worklist, entry) { + send_mayday(work); + nr_works++; + } + + list_for_each_entry(worker, &gcwq->idle_list, entry) { + if (!nr_works--) + break; + wake_up_process(worker->task); + } + + if (need_to_create_worker(gcwq)) { + spin_unlock_irq(&gcwq->lock); + worker = create_worker(gcwq, false); + spin_lock_irq(&gcwq->lock); + if (worker) { + worker->flags |= WORKER_ROGUE; + start_worker(worker); + } + } + + /* give a breather */ + if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) + break; + } + + /* + * Either all works have been scheduled and cpu is down, or + * cpu down has already been canceled. Wait for and butcher + * all workers till we're canceled. + */ + do { + rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); + while (!list_empty(&gcwq->idle_list)) + destroy_worker(list_first_entry(&gcwq->idle_list, + struct worker, entry)); + } while (gcwq->nr_workers && rc >= 0); + + /* + * At this point, either draining has completed and no worker + * is left, or cpu down has been canceled or the cpu is being + * brought back up. There shouldn't be any idle one left. + * Tell the remaining busy ones to rebind once it finishes the + * currently scheduled works by scheduling the rebind_work. + */ + WARN_ON(!list_empty(&gcwq->idle_list)); + + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* + * Rebind_work may race with future cpu hotplug + * operations. Use a separate flag to mark that + * rebinding is scheduled. + */ + worker->flags |= WORKER_REBIND; + worker->flags &= ~WORKER_ROGUE; + + /* queue rebind_work, wq doesn't matter, use the default one */ + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } + + /* relinquish manager role */ + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* notify completion */ + gcwq->trustee = NULL; + gcwq->trustee_state = TRUSTEE_DONE; + wake_up_all(&gcwq->trustee_wait); + spin_unlock_irq(&gcwq->lock); + return 0; +} + +/** + * wait_trustee_state - wait for trustee to enter the specified state + * @gcwq: gcwq the trustee of interest belongs to + * @state: target state to wait for + * + * Wait for the trustee to reach @state. DONE is already matched. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by cpu_callback. + */ +static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) +{ + if (!(gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE)) { + spin_unlock_irq(&gcwq->lock); + __wait_event(gcwq->trustee_wait, + gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE); + spin_lock_irq(&gcwq->lock); + } +} + static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - int err = 0; + struct global_cwq *gcwq = get_gcwq(cpu); + struct task_struct *new_trustee = NULL; + struct worker *uninitialized_var(new_worker); + unsigned long flags; action &= ~CPU_TASKS_FROZEN; switch (action) { + case CPU_DOWN_PREPARE: + new_trustee = kthread_create(trustee_thread, gcwq, + "workqueue_trustee/%d\n", cpu); + if (IS_ERR(new_trustee)) + return notifier_from_errno(PTR_ERR(new_trustee)); + kthread_bind(new_trustee, cpu); + /* fall through */ case CPU_UP_PREPARE: - cpumask_set_cpu(cpu, cpu_populated_map); - } -undo: - list_for_each_entry(wq, &workqueues, list) { - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - switch (action) { - case CPU_UP_PREPARE: - err = create_workqueue_thread(cwq, cpu); - if (!err) - break; - printk(KERN_ERR "workqueue [%s] for %i failed\n", - wq->name, cpu); - action = CPU_UP_CANCELED; - err = -ENOMEM; - goto undo; - - case CPU_ONLINE: - start_workqueue_thread(cwq, cpu); - break; - - case CPU_UP_CANCELED: - start_workqueue_thread(cwq, -1); - case CPU_POST_DEAD: - cleanup_workqueue_thread(cwq); - break; + BUG_ON(gcwq->first_idle); + new_worker = create_worker(gcwq, false); + if (!new_worker) { + if (new_trustee) + kthread_stop(new_trustee); + return NOTIFY_BAD; } } + /* some are called w/ irq disabled, don't disturb irq status */ + spin_lock_irqsave(&gcwq->lock, flags); + switch (action) { - case CPU_UP_CANCELED: + case CPU_DOWN_PREPARE: + /* initialize trustee and tell it to acquire the gcwq */ + BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); + gcwq->trustee = new_trustee; + gcwq->trustee_state = TRUSTEE_START; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + gcwq->first_idle = new_worker; + break; + + case CPU_DYING: + /* + * Before this, the trustee and all workers except for + * the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they'll all be diasporas. + */ + gcwq->flags |= GCWQ_DISASSOCIATED; + break; + case CPU_POST_DEAD: - cpumask_clear_cpu(cpu, cpu_populated_map); + gcwq->trustee_state = TRUSTEE_BUTCHER; + /* fall through */ + case CPU_UP_CANCELED: + destroy_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + + case CPU_DOWN_FAILED: + case CPU_ONLINE: + gcwq->flags &= ~GCWQ_DISASSOCIATED; + if (gcwq->trustee_state != TRUSTEE_DONE) { + gcwq->trustee_state = TRUSTEE_RELEASE; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_DONE); + } + + /* + * Trustee is done and there might be no worker left. + * Put the first_idle in and request a real manager to + * take a look. + */ + spin_unlock_irq(&gcwq->lock); + kthread_bind(gcwq->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + gcwq->flags |= GCWQ_MANAGE_WORKERS; + start_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; } - return notifier_from_errno(err); + spin_unlock_irqrestore(&gcwq->lock, flags); + + return notifier_from_errno(0); } #ifdef CONFIG_SMP @@ -1201,14 +3381,199 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) EXPORT_SYMBOL_GPL(work_on_cpu); #endif /* CONFIG_SMP */ -void __init init_workqueues(void) +#ifdef CONFIG_FREEZER + +/** + * freeze_workqueues_begin - begin freezing workqueues + * + * Start freezing workqueues. After this function returns, all + * freezeable workqueues will queue new works to their frozen_works + * list instead of gcwq->worklist. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void freeze_workqueues_begin(void) { - alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); + unsigned int cpu; - cpumask_copy(cpu_populated_map, cpu_online_mask); - singlethread_cpu = cpumask_first(cpu_possible_mask); - cpu_singlethread_map = cpumask_of(singlethread_cpu); - hotcpu_notifier(workqueue_cpu_callback, 0); - keventd_wq = create_workqueue("events"); - BUG_ON(!keventd_wq); + spin_lock(&workqueue_lock); + + BUG_ON(workqueue_freezing); + workqueue_freezing = true; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(gcwq->flags & GCWQ_FREEZING); + gcwq->flags |= GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (cwq && wq->flags & WQ_FREEZEABLE) + cwq->max_active = 0; + } + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} + +/** + * freeze_workqueues_busy - are freezeable workqueues still busy? + * + * Check whether freezing is complete. This function must be called + * between freeze_workqueues_begin() and thaw_workqueues(). + * + * CONTEXT: + * Grabs and releases workqueue_lock. + * + * RETURNS: + * %true if some freezeable workqueues are still busy. %false if + * freezing is complete. + */ +bool freeze_workqueues_busy(void) +{ + unsigned int cpu; + bool busy = false; + + spin_lock(&workqueue_lock); + + BUG_ON(!workqueue_freezing); + + for_each_gcwq_cpu(cpu) { + struct workqueue_struct *wq; + /* + * nr_active is monotonically decreasing. It's safe + * to peek without lock. + */ + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + continue; + + BUG_ON(cwq->nr_active < 0); + if (cwq->nr_active) { + busy = true; + goto out_unlock; + } + } + } +out_unlock: + spin_unlock(&workqueue_lock); + return busy; +} + +/** + * thaw_workqueues - thaw workqueues + * + * Thaw workqueues. Normal queueing is restored and all collected + * frozen works are transferred to their respective gcwq worklists. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void thaw_workqueues(void) +{ + unsigned int cpu; + + spin_lock(&workqueue_lock); + + if (!workqueue_freezing) + goto out_unlock; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); + gcwq->flags &= ~GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + continue; + + /* restore max_active and repopulate worklist */ + cwq->max_active = wq->saved_max_active; + + while (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + + wake_up_worker(gcwq); + + spin_unlock_irq(&gcwq->lock); + } + + workqueue_freezing = false; +out_unlock: + spin_unlock(&workqueue_lock); +} +#endif /* CONFIG_FREEZER */ + +static int __init init_workqueues(void) +{ + unsigned int cpu; + int i; + + hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + + /* initialize gcwqs */ + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_init(&gcwq->lock); + INIT_LIST_HEAD(&gcwq->worklist); + gcwq->cpu = cpu; + if (cpu == WORK_CPU_UNBOUND) + gcwq->flags |= GCWQ_DISASSOCIATED; + + INIT_LIST_HEAD(&gcwq->idle_list); + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) + INIT_HLIST_HEAD(&gcwq->busy_hash[i]); + + init_timer_deferrable(&gcwq->idle_timer); + gcwq->idle_timer.function = idle_worker_timeout; + gcwq->idle_timer.data = (unsigned long)gcwq; + + setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, + (unsigned long)gcwq); + + ida_init(&gcwq->worker_ida); + + gcwq->trustee_state = TRUSTEE_DONE; + init_waitqueue_head(&gcwq->trustee_wait); + } + + /* create the initial worker */ + for_each_online_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct worker *worker; + + worker = create_worker(gcwq, true); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } + + system_wq = alloc_workqueue("events", 0, 0); + system_long_wq = alloc_workqueue("events_long", 0, 0); + system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); + system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); + return 0; } +early_initcall(init_workqueues); diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 000000000000..2d10fc98dc79 --- /dev/null +++ b/kernel/workqueue_sched.h @@ -0,0 +1,9 @@ +/* + * kernel/workqueue_sched.h + * + * Scheduler hooks for concurrency managed workqueue. Only to be + * included from sched.c and workqueue.c. + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu); |