diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 4 | ||||
-rw-r--r-- | kernel/cpu.c | 1 | ||||
-rw-r--r-- | kernel/cpuset.c | 4 | ||||
-rw-r--r-- | kernel/exec_domain.c | 2 | ||||
-rw-r--r-- | kernel/exit.c | 451 | ||||
-rw-r--r-- | kernel/fork.c | 6 | ||||
-rw-r--r-- | kernel/hrtimer.c | 2 | ||||
-rw-r--r-- | kernel/irq/manage.c | 39 | ||||
-rw-r--r-- | kernel/kmod.c | 2 | ||||
-rw-r--r-- | kernel/kthread.c | 2 | ||||
-rw-r--r-- | kernel/module.c | 336 | ||||
-rw-r--r-- | kernel/pid.c | 1 | ||||
-rw-r--r-- | kernel/power/Kconfig | 2 | ||||
-rw-r--r-- | kernel/power/disk.c | 50 | ||||
-rw-r--r-- | kernel/power/main.c | 16 | ||||
-rw-r--r-- | kernel/power/process.c | 97 | ||||
-rw-r--r-- | kernel/power/user.c | 71 | ||||
-rw-r--r-- | kernel/profile.c | 6 | ||||
-rw-r--r-- | kernel/ptrace.c | 37 | ||||
-rw-r--r-- | kernel/rcuclassic.c | 34 | ||||
-rw-r--r-- | kernel/rcupdate.c | 71 | ||||
-rw-r--r-- | kernel/rcupreempt.c | 418 | ||||
-rw-r--r-- | kernel/rcupreempt_trace.c | 1 | ||||
-rw-r--r-- | kernel/rcutorture.c | 174 | ||||
-rw-r--r-- | kernel/rtmutex-tester.c | 7 | ||||
-rw-r--r-- | kernel/sched.c | 8 | ||||
-rw-r--r-- | kernel/smp.c | 383 | ||||
-rw-r--r-- | kernel/softirq.c | 4 | ||||
-rw-r--r-- | kernel/sys_ni.c | 1 | ||||
-rw-r--r-- | kernel/sysctl.c | 17 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 8 | ||||
-rw-r--r-- | kernel/time/tick-broadcast.c | 2 |
32 files changed, 1721 insertions, 536 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f6328e16dfdd..985ddb7da4d0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,8 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o -CFLAGS_REMOVE_sched.o = -mno-spe - ifdef CONFIG_FTRACE # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg @@ -21,6 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_sched.o = -mno-spe -pg endif obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o @@ -39,6 +38,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o +obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o diff --git a/kernel/cpu.c b/kernel/cpu.c index b11f06dc149a..cfb1d43ab801 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -299,6 +299,7 @@ int __ref cpu_down(unsigned int cpu) cpu_maps_update_done(); return err; } +EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ /* Requires cpu_add_remove_lock to be held */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 459d601947a8..d2cc67dac8b1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -679,7 +679,9 @@ restart: if (apn == b->pn) { cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; - update_domain_attr(dattr, b); + if (dattr) + update_domain_attr(dattr + + nslot, b); } } nslot++; diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index a9e6bad9f706..c1ef192aa655 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -65,7 +65,7 @@ lookup_exec_domain(u_long personality) goto out; } -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES read_unlock(&exec_domains_lock); request_module("personality-%ld", pers); read_lock(&exec_domains_lock); diff --git a/kernel/exit.c b/kernel/exit.c index ceb258782835..93d2711b9381 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -71,7 +71,7 @@ static void __unhash_process(struct task_struct *p) __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); - remove_parent(p); + list_del_init(&p->sibling); } /* @@ -152,6 +152,18 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(container_of(rhp, struct task_struct, rcu)); } +/* + * Do final ptrace-related cleanup of a zombie being reaped. + * + * Called with write_lock(&tasklist_lock) held. + */ +static void ptrace_release_task(struct task_struct *p) +{ + BUG_ON(!list_empty(&p->ptraced)); + ptrace_unlink(p); + BUG_ON(!list_empty(&p->ptrace_entry)); +} + void release_task(struct task_struct * p) { struct task_struct *leader; @@ -160,8 +172,7 @@ repeat: atomic_dec(&p->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); - ptrace_unlink(p); - BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); + ptrace_release_task(p); __exit_signal(p); /* @@ -315,9 +326,8 @@ static void reparent_to_kthreadd(void) ptrace_unlink(current); /* Reparent to init */ - remove_parent(current); current->real_parent = current->parent = kthreadd_task; - add_parent(current); + list_move_tail(¤t->sibling, ¤t->real_parent->children); /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; @@ -692,37 +702,97 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -static void -reparent_thread(struct task_struct *p, struct task_struct *father, int traced) +/* + * Return nonzero if @parent's children should reap themselves. + * + * Called with write_lock_irq(&tasklist_lock) held. + */ +static int ignoring_children(struct task_struct *parent) { - if (p->pdeath_signal) - /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + int ret; + struct sighand_struct *psig = parent->sighand; + unsigned long flags; + spin_lock_irqsave(&psig->siglock, flags); + ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || + (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT)); + spin_unlock_irqrestore(&psig->siglock, flags); + return ret; +} - /* Move the child from its dying parent to the new one. */ - if (unlikely(traced)) { - /* Preserve ptrace links if someone else is tracing this child. */ - list_del_init(&p->ptrace_list); - if (ptrace_reparented(p)) - list_add(&p->ptrace_list, &p->real_parent->ptrace_children); - } else { - /* If this child is being traced, then we're the one tracing it - * anyway, so let go of it. +/* + * Detach all tasks we were using ptrace on. + * Any that need to be release_task'd are put on the @dead list. + * + * Called with write_lock(&tasklist_lock) held. + */ +static void ptrace_exit(struct task_struct *parent, struct list_head *dead) +{ + struct task_struct *p, *n; + int ign = -1; + + list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { + __ptrace_unlink(p); + + if (p->exit_state != EXIT_ZOMBIE) + continue; + + /* + * If it's a zombie, our attachedness prevented normal + * parent notification or self-reaping. Do notification + * now if it would have happened earlier. If it should + * reap itself, add it to the @dead list. We can't call + * release_task() here because we already hold tasklist_lock. + * + * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. */ - p->ptrace = 0; - remove_parent(p); - p->parent = p->real_parent; - add_parent(p); + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, parent)) + do_notify_parent(p, p->exit_signal); + else { + if (ign < 0) + ign = ignoring_children(parent); + if (ign) + p->exit_signal = -1; + } + } - if (task_is_traced(p)) { + if (task_detached(p)) { /* - * If it was at a trace stop, turn it into - * a normal stop since it's no longer being - * traced. + * Mark it as in the process of being reaped. */ - ptrace_untrace(p); + p->exit_state = EXIT_DEAD; + list_add(&p->ptrace_entry, dead); } } +} + +/* + * Finish up exit-time ptrace cleanup. + * + * Called without locks. + */ +static void ptrace_exit_finish(struct task_struct *parent, + struct list_head *dead) +{ + struct task_struct *p, *n; + + BUG_ON(!list_empty(&parent->ptraced)); + + list_for_each_entry_safe(p, n, dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } +} + +static void reparent_thread(struct task_struct *p, struct task_struct *father) +{ + if (p->pdeath_signal) + /* We already hold the tasklist_lock here. */ + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + + list_move_tail(&p->sibling, &p->real_parent->children); /* If this is a threaded reparent there is no need to * notify anyone anything has happened. @@ -737,7 +807,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) /* If we'd notified the old parent about this child's death, * also notify the new parent. */ - if (!traced && p->exit_state == EXIT_ZOMBIE && + if (!ptrace_reparented(p) && + p->exit_state == EXIT_ZOMBIE && !task_detached(p) && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); @@ -754,12 +825,15 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) static void forget_original_parent(struct task_struct *father) { struct task_struct *p, *n, *reaper = father; - struct list_head ptrace_dead; - - INIT_LIST_HEAD(&ptrace_dead); + LIST_HEAD(ptrace_dead); write_lock_irq(&tasklist_lock); + /* + * First clean up ptrace if we were using it. + */ + ptrace_exit(father, &ptrace_dead); + do { reaper = next_thread(reaper); if (reaper == father) { @@ -768,58 +842,19 @@ static void forget_original_parent(struct task_struct *father) } } while (reaper->flags & PF_EXITING); - /* - * There are only two places where our children can be: - * - * - in our child list - * - in our ptraced child list - * - * Search them and reparent children. - */ list_for_each_entry_safe(p, n, &father->children, sibling) { - int ptrace; - - ptrace = p->ptrace; - - /* if father isn't the real parent, then ptrace must be enabled */ - BUG_ON(father != p->real_parent && !ptrace); - - if (father == p->real_parent) { - /* reparent with a reaper, real father it's us */ - p->real_parent = reaper; - reparent_thread(p, father, 0); - } else { - /* reparent ptraced task to its real parent */ - __ptrace_unlink (p); - if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) && - thread_group_empty(p)) - do_notify_parent(p, p->exit_signal); - } - - /* - * if the ptraced child is a detached zombie we must collect - * it before we exit, or it will remain zombie forever since - * we prevented it from self-reap itself while it was being - * traced by us, to be able to see it in wait4. - */ - if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p))) - list_add(&p->ptrace_list, &ptrace_dead); - } - - list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) { p->real_parent = reaper; - reparent_thread(p, father, 1); + if (p->parent == father) { + BUG_ON(p->ptrace); + p->parent = p->real_parent; + } + reparent_thread(p, father); } write_unlock_irq(&tasklist_lock); BUG_ON(!list_empty(&father->children)); - BUG_ON(!list_empty(&father->ptrace_children)); - - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) { - list_del_init(&p->ptrace_list); - release_task(p); - } + ptrace_exit_finish(father, &ptrace_dead); } /* @@ -1180,13 +1215,6 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options, return 0; } - /* - * Do not consider detached threads that are - * not ptraced: - */ - if (task_detached(p) && !p->ptrace) - return 0; - /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1197,14 +1225,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options, return 0; err = security_task_wait(p); - if (likely(!err)) - return 1; + if (err) + return err; - if (type != PIDTYPE_PID) - return 0; - /* This child was explicitly requested, abort */ - read_unlock(&tasklist_lock); - return err; + return 1; } static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, @@ -1238,7 +1262,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_zombie(struct task_struct *p, int noreap, +static int wait_task_zombie(struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1246,7 +1270,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap, int retval, status, traced; pid_t pid = task_pid_vnr(p); - if (unlikely(noreap)) { + if (!likely(options & WEXITED)) + return 0; + + if (unlikely(options & WNOWAIT)) { uid_t uid = p->uid; int exit_code = p->exit_code; int why, status; @@ -1396,21 +1423,24 @@ static int wait_task_zombie(struct task_struct *p, int noreap, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_stopped(struct task_struct *p, - int noreap, struct siginfo __user *infop, +static int wait_task_stopped(int ptrace, struct task_struct *p, + int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { int retval, exit_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; + if (!(options & WUNTRACED)) + return 0; + exit_code = 0; spin_lock_irq(&p->sighand->siglock); if (unlikely(!task_is_stopped_or_traced(p))) goto unlock_sig; - if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0) + if (!ptrace && p->signal->group_stop_count > 0) /* * A group stop is in progress and this is the group leader. * We won't report until all threads have stopped. @@ -1421,7 +1451,7 @@ static int wait_task_stopped(struct task_struct *p, if (!exit_code) goto unlock_sig; - if (!noreap) + if (!unlikely(options & WNOWAIT)) p->exit_code = 0; uid = p->uid; @@ -1439,10 +1469,10 @@ unlock_sig: */ get_task_struct(p); pid = task_pid_vnr(p); - why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; + why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); - if (unlikely(noreap)) + if (unlikely(options & WNOWAIT)) return wait_noreap_copyout(p, pid, uid, why, exit_code, infop, ru); @@ -1476,7 +1506,7 @@ unlock_sig: * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_continued(struct task_struct *p, int noreap, +static int wait_task_continued(struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1484,6 +1514,9 @@ static int wait_task_continued(struct task_struct *p, int noreap, pid_t pid; uid_t uid; + if (!unlikely(options & WCONTINUED)) + return 0; + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; @@ -1493,7 +1526,7 @@ static int wait_task_continued(struct task_struct *p, int noreap, spin_unlock_irq(&p->sighand->siglock); return 0; } - if (!noreap) + if (!unlikely(options & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; spin_unlock_irq(&p->sighand->siglock); @@ -1519,89 +1552,161 @@ static int wait_task_continued(struct task_struct *p, int noreap, return retval; } +/* + * Consider @p for a wait by @parent. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; + * then *@notask_error is 0 if @p is an eligible child, + * or another error from security_task_wait(), or still -ECHILD. + */ +static int wait_consider_task(struct task_struct *parent, int ptrace, + struct task_struct *p, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int ret = eligible_child(type, pid, options, p); + if (!ret) + return ret; + + if (unlikely(ret < 0)) { + /* + * If we have not yet seen any eligible child, + * then let this error code replace -ECHILD. + * A permission error will give the user a clue + * to look for security policy problems, rather + * than for mysterious wait bugs. + */ + if (*notask_error) + *notask_error = ret; + } + + if (likely(!ptrace) && unlikely(p->ptrace)) { + /* + * This child is hidden by ptrace. + * We aren't allowed to see it now, but eventually we will. + */ + *notask_error = 0; + return 0; + } + + if (p->exit_state == EXIT_DEAD) + return 0; + + /* + * We don't reap group leaders with subthreads. + */ + if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) + return wait_task_zombie(p, options, infop, stat_addr, ru); + + /* + * It's stopped or running now, so it might + * later continue, exit, or stop again. + */ + *notask_error = 0; + + if (task_is_stopped_or_traced(p)) + return wait_task_stopped(ptrace, p, options, + infop, stat_addr, ru); + + return wait_task_continued(p, options, infop, stat_addr, ru); +} + +/* + * Do the work of do_wait() for one thread in the group, @tsk. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; then + * *@notask_error is 0 if there were any eligible children, + * or another error from security_task_wait(), or still -ECHILD. + */ +static int do_wait_thread(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + list_for_each_entry(p, &tsk->children, sibling) { + /* + * Do not consider detached threads. + */ + if (!task_detached(p)) { + int ret = wait_consider_task(tsk, 0, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + } + + return 0; +} + +static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + /* + * Traditionally we see ptrace'd stopped tasks regardless of options. + */ + options |= WUNTRACED; + + list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { + int ret = wait_consider_task(tsk, 1, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + + return 0; +} + static long do_wait(enum pid_type type, struct pid *pid, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; - int flag, retval; + int retval; add_wait_queue(¤t->signal->wait_chldexit,&wait); repeat: - /* If there is nothing that can match our critier just get out */ + /* + * If there is nothing that can match our critiera just get out. + * We will clear @retval to zero if we see any child that might later + * match our criteria, even if we are not able to reap it yet. + */ retval = -ECHILD; if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) goto end; - /* - * We will set this flag if we see any child that might later - * match our criteria, even if we are not able to reap it yet. - */ - flag = retval = 0; current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); tsk = current; do { - struct task_struct *p; - - list_for_each_entry(p, &tsk->children, sibling) { - int ret = eligible_child(type, pid, options, p); - if (!ret) - continue; - - if (unlikely(ret < 0)) { - retval = ret; - } else if (task_is_stopped_or_traced(p)) { - /* - * It's stopped now, so it might later - * continue, exit, or stop again. - */ - flag = 1; - if (!(p->ptrace & PT_PTRACED) && - !(options & WUNTRACED)) - continue; - - retval = wait_task_stopped(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } else if (p->exit_state == EXIT_ZOMBIE && - !delay_group_leader(p)) { - /* - * We don't reap group leaders with subthreads. - */ - if (!likely(options & WEXITED)) - continue; - retval = wait_task_zombie(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } else if (p->exit_state != EXIT_DEAD) { - /* - * It's running now, so it might later - * exit, stop, or stop and then continue. - */ - flag = 1; - if (!unlikely(options & WCONTINUED)) - continue; - retval = wait_task_continued(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } - if (retval != 0) /* tasklist_lock released */ - goto end; - } - if (!flag) { - list_for_each_entry(p, &tsk->ptrace_children, - ptrace_list) { - flag = eligible_child(type, pid, options, p); - if (!flag) - continue; - if (likely(flag > 0)) - break; - retval = flag; - goto end; - } + int tsk_result = do_wait_thread(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (!tsk_result) + tsk_result = ptrace_do_wait(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (tsk_result) { + /* + * tasklist_lock is unlocked and we have a final result. + */ + retval = tsk_result; + goto end; } + if (options & __WNOTHREAD) break; tsk = next_thread(tsk); @@ -1609,16 +1714,14 @@ repeat: } while (tsk != current); read_unlock(&tasklist_lock); - if (flag) { - if (options & WNOHANG) - goto end; + if (!retval && !(options & WNOHANG)) { retval = -ERESTARTSYS; - if (signal_pending(current)) - goto end; - schedule(); - goto repeat; + if (!signal_pending(current)) { + schedule(); + goto repeat; + } } - retval = -ECHILD; + end: current->state = TASK_RUNNING; remove_wait_queue(¤t->signal->wait_chldexit,&wait); diff --git a/kernel/fork.c b/kernel/fork.c index 4bd2f516401f..adefc1131f27 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1125,8 +1125,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_LIST_HEAD(&p->ptrace_children); - INIT_LIST_HEAD(&p->ptrace_list); + INIT_LIST_HEAD(&p->ptrace_entry); + INIT_LIST_HEAD(&p->ptraced); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible @@ -1198,7 +1198,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - add_parent(p); + list_add_tail(&p->sibling, &p->real_parent->children); if (unlikely(p->ptrace & PT_PTRACED)) __ptrace_link(p, current->parent); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 2913a8bff612..b8e4dce80a74 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -622,7 +622,7 @@ static void retrigger_next_event(void *arg) void clock_was_set(void) { /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 0, 1); + on_each_cpu(retrigger_next_event, NULL, 1); } /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 77a51be36010..3cfc0fefb5ee 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -217,6 +217,17 @@ void enable_irq(unsigned int irq) } EXPORT_SYMBOL(enable_irq); +int set_irq_wake_real(unsigned int irq, unsigned int on) +{ + struct irq_desc *desc = irq_desc + irq; + int ret = -ENXIO; + + if (desc->chip->set_wake) + ret = desc->chip->set_wake(irq, on); + + return ret; +} + /** * set_irq_wake - control irq power management wakeup * @irq: interrupt to control @@ -233,30 +244,34 @@ int set_irq_wake(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_desc + irq; unsigned long flags; - int ret = -ENXIO; - int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake; + int ret = 0; /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ spin_lock_irqsave(&desc->lock, flags); if (on) { - if (desc->wake_depth++ == 0) - desc->status |= IRQ_WAKEUP; - else - set_wake = NULL; + if (desc->wake_depth++ == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 0; + else + desc->status |= IRQ_WAKEUP; + } } else { if (desc->wake_depth == 0) { printk(KERN_WARNING "Unbalanced IRQ %d " "wake disable\n", irq); WARN_ON(1); - } else if (--desc->wake_depth == 0) - desc->status &= ~IRQ_WAKEUP; - else - set_wake = NULL; + } else if (--desc->wake_depth == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 1; + else + desc->status &= ~IRQ_WAKEUP; + } } - if (set_wake) - ret = desc->chip->set_wake(irq, on); + spin_unlock_irqrestore(&desc->lock, flags); return ret; } diff --git a/kernel/kmod.c b/kernel/kmod.c index 8df97d3dfda8..90d7af1c1655 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -42,7 +42,7 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES /* modprobe_path is set via /proc/sys. diff --git a/kernel/kthread.c b/kernel/kthread.c index 97747cdd37c9..ac3fb7326641 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -235,7 +235,7 @@ int kthreadd(void *unused) set_user_nice(tsk, KTHREAD_NICE_LEVEL); set_cpus_allowed(tsk, CPU_MASK_ALL); - current->flags |= PF_NOFREEZE; + current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; for (;;) { set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/module.c b/kernel/module.c index 5f80478b746d..d8b5605132a0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -70,6 +70,9 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); +/* Bounds of module allocation, for speeding __module_text_address */ +static unsigned long module_addr_min = -1UL, module_addr_max = 0; + int register_module_notifier(struct notifier_block * nb) { return blocking_notifier_chain_register(&module_notify_list, nb); @@ -134,17 +137,19 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; -extern const struct kernel_symbol __start___ksymtab_unused[]; -extern const struct kernel_symbol __stop___ksymtab_unused[]; -extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; extern const unsigned long __start___kcrctab[]; extern const unsigned long __start___kcrctab_gpl[]; extern const unsigned long __start___kcrctab_gpl_future[]; +#ifdef CONFIG_UNUSED_SYMBOLS +extern const struct kernel_symbol __start___ksymtab_unused[]; +extern const struct kernel_symbol __stop___ksymtab_unused[]; +extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; extern const unsigned long __start___kcrctab_unused[]; extern const unsigned long __start___kcrctab_unused_gpl[]; +#endif #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL @@ -152,156 +157,186 @@ extern const unsigned long __start___kcrctab_unused_gpl[]; #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) #endif -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - const struct kernel_symbol *ks = start; - for (; ks < stop; ks++) - if (strcmp(ks->name, name) == 0) - return ks; - return NULL; -} - -static bool always_ok(bool gplok, bool warn, const char *name) -{ - return true; -} - -static bool printk_unused_warning(bool gplok, bool warn, const char *name) -{ - if (warn) { - printk(KERN_WARNING "Symbol %s is marked as UNUSED, " - "however this module is using it.\n", name); - printk(KERN_WARNING - "This symbol will go away in the future.\n"); - printk(KERN_WARNING - "Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " - "inclusion.\n"); - } - return true; -} - -static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name) -{ - if (!gplok) - return false; - return printk_unused_warning(gplok, warn, name); -} - -static bool gpl_only(bool gplok, bool warn, const char *name) -{ - return gplok; -} - -static bool warn_if_not_gpl(bool gplok, bool warn, const char *name) -{ - if (!gplok && warn) { - printk(KERN_WARNING "Symbol %s is being used " - "by a non-GPL module, which will not " - "be allowed in the future\n", name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more details.\n"); - } - return true; -} - struct symsearch { const struct kernel_symbol *start, *stop; const unsigned long *crcs; - bool (*check)(bool gplok, bool warn, const char *name); + enum { + NOT_GPL_ONLY, + GPL_ONLY, + WILL_BE_GPL_ONLY, + } licence; + bool unused; }; -/* Look through this array of symbol tables for a symbol match which - * passes the check function. */ -static const struct kernel_symbol *search_symarrays(const struct symsearch *arr, - unsigned int num, - const char *name, - bool gplok, - bool warn, - const unsigned long **crc) +static bool each_symbol_in_section(const struct symsearch *arr, + unsigned int arrsize, + struct module *owner, + bool (*fn)(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data), + void *data) { - unsigned int i; - const struct kernel_symbol *ks; + unsigned int i, j; - for (i = 0; i < num; i++) { - ks = lookup_symbol(name, arr[i].start, arr[i].stop); - if (!ks || !arr[i].check(gplok, warn, name)) - continue; - - if (crc) - *crc = symversion(arr[i].crcs, ks - arr[i].start); - return ks; + for (j = 0; j < arrsize; j++) { + for (i = 0; i < arr[j].stop - arr[j].start; i++) + if (fn(&arr[j], owner, i, data)) + return true; } - return NULL; + + return false; } -/* Find a symbol, return value, (optional) crc and (optional) module - * which owns it */ -static unsigned long find_symbol(const char *name, - struct module **owner, - const unsigned long **crc, - bool gplok, - bool warn) +/* Returns true as soon as fn returns true, otherwise false. */ +static bool each_symbol(bool (*fn)(const struct symsearch *arr, + struct module *owner, + unsigned int symnum, void *data), + void *data) { struct module *mod; - const struct kernel_symbol *ks; const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, - always_ok }, + NOT_GPL_ONLY, false }, { __start___ksymtab_gpl, __stop___ksymtab_gpl, - __start___kcrctab_gpl, gpl_only }, + __start___kcrctab_gpl, + GPL_ONLY, false }, { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, - __start___kcrctab_gpl_future, warn_if_not_gpl }, + __start___kcrctab_gpl_future, + WILL_BE_GPL_ONLY, false }, +#ifdef CONFIG_UNUSED_SYMBOLS { __start___ksymtab_unused, __stop___ksymtab_unused, - __start___kcrctab_unused, printk_unused_warning }, + __start___kcrctab_unused, + NOT_GPL_ONLY, true }, { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, - __start___kcrctab_unused_gpl, gpl_only_unused_warning }, + __start___kcrctab_unused_gpl, + GPL_ONLY, true }, +#endif }; - /* Core kernel first. */ - ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc); - if (ks) { - if (owner) - *owner = NULL; - return ks->value; - } + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) + return true; - /* Now try modules. */ list_for_each_entry(mod, &modules, list) { struct symsearch arr[] = { { mod->syms, mod->syms + mod->num_syms, mod->crcs, - always_ok }, + NOT_GPL_ONLY, false }, { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, - mod->gpl_crcs, gpl_only }, + mod->gpl_crcs, + GPL_ONLY, false }, { mod->gpl_future_syms, mod->gpl_future_syms + mod->num_gpl_future_syms, - mod->gpl_future_crcs, warn_if_not_gpl }, + mod->gpl_future_crcs, + WILL_BE_GPL_ONLY, false }, +#ifdef CONFIG_UNUSED_SYMBOLS { mod->unused_syms, mod->unused_syms + mod->num_unused_syms, - mod->unused_crcs, printk_unused_warning }, + mod->unused_crcs, + NOT_GPL_ONLY, true }, { mod->unused_gpl_syms, mod->unused_gpl_syms + mod->num_unused_gpl_syms, - mod->unused_gpl_crcs, gpl_only_unused_warning }, + mod->unused_gpl_crcs, + GPL_ONLY, true }, +#endif }; - ks = search_symarrays(arr, ARRAY_SIZE(arr), - name, gplok, warn, crc); - if (ks) { - if (owner) - *owner = mod; - return ks->value; + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) + return true; + } + return false; +} + +struct find_symbol_arg { + /* Input */ + const char *name; + bool gplok; + bool warn; + + /* Output */ + struct module *owner; + const unsigned long *crc; + unsigned long value; +}; + +static bool find_symbol_in_section(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data) +{ + struct find_symbol_arg *fsa = data; + + if (strcmp(syms->start[symnum].name, fsa->name) != 0) + return false; + + if (!fsa->gplok) { + if (syms->licence == GPL_ONLY) + return false; + if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { + printk(KERN_WARNING "Symbol %s is being used " + "by a non-GPL module, which will not " + "be allowed in the future\n", fsa->name); + printk(KERN_WARNING "Please see the file " + "Documentation/feature-removal-schedule.txt " + "in the kernel source tree for more details.\n"); } } +#ifdef CONFIG_UNUSED_SYMBOLS + if (syms->unused && fsa->warn) { + printk(KERN_WARNING "Symbol %s is marked as UNUSED, " + "however this module is using it.\n", fsa->name); + printk(KERN_WARNING + "This symbol will go away in the future.\n"); + printk(KERN_WARNING + "Please evalute if this is the right api to use and if " + "it really is, submit a report the linux kernel " + "mailinglist together with submitting your code for " + "inclusion.\n"); + } +#endif + + fsa->owner = owner; + fsa->crc = symversion(syms->crcs, symnum); + fsa->value = syms->start[symnum].value; + return true; +} + +/* Find a symbol, return value, (optional) crc and (optional) module + * which owns it */ +static unsigned long find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn) +{ + struct find_symbol_arg fsa; + + fsa.name = name; + fsa.gplok = gplok; + fsa.warn = warn; + + if (each_symbol(find_symbol_in_section, &fsa)) { + if (owner) + *owner = fsa.owner; + if (crc) + *crc = fsa.crc; + return fsa.value; + } + DEBUGP("Failed to find symbol %s\n", name); return -ENOENT; } +/* lookup symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + const struct kernel_symbol *ks = start; + for (; ks < stop; ks++) + if (strcmp(ks->name, name) == 0) + return ks; + return NULL; +} + /* Search for module by name: must hold module_mutex. */ static struct module *find_module(const char *name) { @@ -639,8 +674,8 @@ static int __try_stop_module(void *_sref) { struct stopref *sref = _sref; - /* If it's not unused, quit unless we are told to block. */ - if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { + /* If it's not unused, quit unless we're forcing. */ + if (module_refcount(sref->mod) != 0) { if (!(*sref->forced = try_force_unload(sref->flags))) return -EWOULDBLOCK; } @@ -652,9 +687,16 @@ static int __try_stop_module(void *_sref) static int try_stop_module(struct module *mod, int flags, int *forced) { - struct stopref sref = { mod, flags, forced }; + if (flags & O_NONBLOCK) { + struct stopref sref = { mod, flags, forced }; - return stop_machine_run(__try_stop_module, &sref, NR_CPUS); + return stop_machine_run(__try_stop_module, &sref, NR_CPUS); + } else { + /* We don't need to stop the machine for this. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + return 0; + } } unsigned int module_refcount(struct module *mod) @@ -1445,8 +1487,10 @@ static int verify_export_symbols(struct module *mod) { mod->syms, mod->num_syms }, { mod->gpl_syms, mod->num_gpl_syms }, { mod->gpl_future_syms, mod->num_gpl_future_syms }, +#ifdef CONFIG_UNUSED_SYMBOLS { mod->unused_syms, mod->num_unused_syms }, { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, +#endif }; for (i = 0; i < ARRAY_SIZE(arr); i++) { @@ -1526,7 +1570,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, } /* Update size with this section: return offset. */ -static long get_offset(unsigned long *size, Elf_Shdr *sechdr) +static long get_offset(unsigned int *size, Elf_Shdr *sechdr) { long ret; @@ -1738,6 +1782,20 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ +static void *module_alloc_update_bounds(unsigned long size) +{ + void *ret = module_alloc(size); + + if (ret) { + /* Update module bounds. */ + if ((unsigned long)ret < module_addr_min) + module_addr_min = (unsigned long)ret; + if ((unsigned long)ret + size > module_addr_max) + module_addr_max = (unsigned long)ret + size; + } + return ret; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static struct module *load_module(void __user *umod, @@ -1764,10 +1822,12 @@ static struct module *load_module(void __user *umod, unsigned int gplfutureindex; unsigned int gplfuturecrcindex; unsigned int unwindex = 0; +#ifdef CONFIG_UNUSED_SYMBOLS unsigned int unusedindex; unsigned int unusedcrcindex; unsigned int unusedgplindex; unsigned int unusedgplcrcindex; +#endif unsigned int markersindex; unsigned int markersstringsindex; struct module *mod; @@ -1850,13 +1910,15 @@ static struct module *load_module(void __user *umod, exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); - unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); - unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); +#ifdef CONFIG_UNUSED_SYMBOLS + unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); + unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); +#endif setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); @@ -1935,7 +1997,7 @@ static struct module *load_module(void __user *umod, layout_sections(mod, hdr, sechdrs, secstrings); /* Do the allocs. */ - ptr = module_alloc(mod->core_size); + ptr = module_alloc_update_bounds(mod->core_size); if (!ptr) { err = -ENOMEM; goto free_percpu; @@ -1943,7 +2005,7 @@ static struct module *load_module(void __user *umod, memset(ptr, 0, mod->core_size); mod->module_core = ptr; - ptr = module_alloc(mod->init_size); + ptr = module_alloc_update_bounds(mod->init_size); if (!ptr && mod->init_size) { err = -ENOMEM; goto free_core; @@ -2018,14 +2080,15 @@ static struct module *load_module(void __user *umod, mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / sizeof(*mod->gpl_future_syms); - mod->num_unused_syms = sechdrs[unusedindex].sh_size / - sizeof(*mod->unused_syms); - mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / - sizeof(*mod->unused_gpl_syms); mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; if (gplfuturecrcindex) mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; +#ifdef CONFIG_UNUSED_SYMBOLS + mod->num_unused_syms = sechdrs[unusedindex].sh_size / + sizeof(*mod->unused_syms); + mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / + sizeof(*mod->unused_gpl_syms); mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; if (unusedcrcindex) mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; @@ -2033,13 +2096,17 @@ static struct module *load_module(void __user *umod, if (unusedgplcrcindex) mod->unused_gpl_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; +#endif #ifdef CONFIG_MODVERSIONS - if ((mod->num_syms && !crcindex) || - (mod->num_gpl_syms && !gplcrcindex) || - (mod->num_gpl_future_syms && !gplfuturecrcindex) || - (mod->num_unused_syms && !unusedcrcindex) || - (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { + if ((mod->num_syms && !crcindex) + || (mod->num_gpl_syms && !gplcrcindex) + || (mod->num_gpl_future_syms && !gplfuturecrcindex) +#ifdef CONFIG_UNUSED_SYMBOLS + || (mod->num_unused_syms && !unusedcrcindex) + || (mod->num_unused_gpl_syms && !unusedgplcrcindex) +#endif + ) { printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); err = try_to_force_load(mod, "nocrc"); if (err) @@ -2512,7 +2579,7 @@ static int m_show(struct seq_file *m, void *p) struct module *mod = list_entry(p, struct module, list); char buf[8]; - seq_printf(m, "%s %lu", + seq_printf(m, "%s %u", mod->name, mod->init_size + mod->core_size); print_unload_info(m, mod); @@ -2595,6 +2662,9 @@ struct module *__module_text_address(unsigned long addr) { struct module *mod; + if (addr < module_addr_min || addr > module_addr_max) + return NULL; + list_for_each_entry(mod, &modules, list) if (within(addr, mod->module_init, mod->init_text_size) || within(addr, mod->module_core, mod->core_text_size)) diff --git a/kernel/pid.c b/kernel/pid.c index 20d59fa2d493..30bd5d4b2ac7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -30,6 +30,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> +#include <linux/rculist.h> #include <linux/bootmem.h> #include <linux/hash.h> #include <linux/pid_namespace.h> diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index b45da40e8d25..59dfdf1e1d20 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -82,7 +82,7 @@ config PM_SLEEP_SMP config PM_SLEEP bool - depends on SUSPEND || HIBERNATION + depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y config SUSPEND diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 14a656cdc652..f011e0870b52 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -180,6 +180,17 @@ static void platform_restore_cleanup(int platform_mode) } /** + * platform_recover - recover the platform from a failure to suspend + * devices. + */ + +static void platform_recover(int platform_mode) +{ + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); +} + +/** * create_image - freeze devices that need to be frozen with interrupts * off, create the hibernation image and thaw those devices. Control * reappears in this routine after a restore. @@ -193,6 +204,7 @@ static int create_image(int platform_mode) if (error) return error; + device_pm_lock(); local_irq_disable(); /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. @@ -224,9 +236,11 @@ static int create_image(int platform_mode) /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ - device_power_up(); + device_power_up(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -255,10 +269,10 @@ int hibernation_snapshot(int platform_mode) suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) - goto Resume_console; + goto Recover_platform; if (hibernation_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; error = platform_pre_snapshot(platform_mode); if (error || hibernation_test(TEST_PLATFORM)) @@ -280,12 +294,16 @@ int hibernation_snapshot(int platform_mode) Finish: platform_finish(platform_mode); Resume_devices: - device_resume(); - Resume_console: + device_resume(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); resume_console(); Close: platform_end(platform_mode); return error; + + Recover_platform: + platform_recover(platform_mode); + goto Resume_devices; } /** @@ -300,8 +318,9 @@ static int resume_target_kernel(void) { int error; + device_pm_lock(); local_irq_disable(); - error = device_power_down(PMSG_PRETHAW); + error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -329,9 +348,10 @@ static int resume_target_kernel(void) swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); - device_power_up(); + device_power_up(PMSG_RECOVER); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -350,7 +370,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - error = device_suspend(PMSG_PRETHAW); + error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -362,7 +382,7 @@ int hibernation_restore(int platform_mode) enable_nonboot_cpus(); } platform_restore_cleanup(platform_mode); - device_resume(); + device_resume(PMSG_RECOVER); Finish: resume_console(); pm_restore_console(); @@ -392,8 +412,11 @@ int hibernation_platform_enter(void) suspend_console(); error = device_suspend(PMSG_HIBERNATE); - if (error) - goto Resume_console; + if (error) { + if (hibernation_ops->recover) + hibernation_ops->recover(); + goto Resume_devices; + } error = hibernation_ops->prepare(); if (error) @@ -403,6 +426,7 @@ int hibernation_platform_enter(void) if (error) goto Finish; + device_pm_lock(); local_irq_disable(); error = device_power_down(PMSG_HIBERNATE); if (!error) { @@ -411,6 +435,7 @@ int hibernation_platform_enter(void) while (1); } local_irq_enable(); + device_pm_unlock(); /* * We don't need to reenable the nonboot CPUs or resume consoles, since @@ -419,8 +444,7 @@ int hibernation_platform_enter(void) Finish: hibernation_ops->finish(); Resume_devices: - device_resume(); - Resume_console: + device_resume(PMSG_RESTORE); resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 6a6d5eb3524e..3398f4651aa1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -228,6 +228,7 @@ static int suspend_enter(suspend_state_t state) { int error = 0; + device_pm_lock(); arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -239,10 +240,11 @@ static int suspend_enter(suspend_state_t state) if (!suspend_test(TEST_CORE)) error = suspend_ops->enter(state); - device_power_up(); + device_power_up(PMSG_RESUME); Done: arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + device_pm_unlock(); return error; } @@ -267,11 +269,11 @@ int suspend_devices_and_enter(suspend_state_t state) error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Resume_console; + goto Recover_platform; } if (suspend_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; if (suspend_ops->prepare) { error = suspend_ops->prepare(); @@ -291,13 +293,17 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); Resume_devices: - device_resume(); - Resume_console: + device_resume(PMSG_RESUME); resume_console(); Close: if (suspend_ops->end) suspend_ops->end(); return error; + + Recover_platform: + if (suspend_ops->recover) + suspend_ops->recover(); + goto Resume_devices; } /** diff --git a/kernel/power/process.c b/kernel/power/process.c index f1d0b345c9ba..5fb87652f214 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -19,9 +19,6 @@ */ #define TIMEOUT (20 * HZ) -#define FREEZER_KERNEL_THREADS 0 -#define FREEZER_USER_SPACE 1 - static inline int freezeable(struct task_struct * p) { if ((p == current) || @@ -84,63 +81,53 @@ static void fake_signal_wake_up(struct task_struct *p) spin_unlock_irqrestore(&p->sighand->siglock, flags); } -static int has_mm(struct task_struct *p) +static inline bool should_send_signal(struct task_struct *p) { - return (p->mm && !(p->flags & PF_BORROWED_MM)); + return !(p->flags & PF_FREEZER_NOSIG); } /** * freeze_task - send a freeze request to given task * @p: task to send the request to - * @with_mm_only: if set, the request will only be sent if the task has its - * own mm - * Return value: 0, if @with_mm_only is set and the task has no mm of its - * own or the task is frozen, 1, otherwise + * @sig_only: if set, the request will only be sent if the task has the + * PF_FREEZER_NOSIG flag unset + * Return value: 'false', if @sig_only is set and the task has + * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise * - * The freeze request is sent by seting the tasks's TIF_FREEZE flag and + * The freeze request is sent by setting the tasks's TIF_FREEZE flag and * either sending a fake signal to it or waking it up, depending on whether - * or not it has its own mm (ie. it is a user land task). If @with_mm_only - * is set and the task has no mm of its own (ie. it is a kernel thread), - * its TIF_FREEZE flag should not be set. - * - * The task_lock() is necessary to prevent races with exit_mm() or - * use_mm()/unuse_mm() from occuring. + * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task + * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its + * TIF_FREEZE flag will not be set. */ -static int freeze_task(struct task_struct *p, int with_mm_only) +static bool freeze_task(struct task_struct *p, bool sig_only) { - int ret = 1; + /* + * We first check if the task is freezing and next if it has already + * been frozen to avoid the race with frozen_process() which first marks + * the task as frozen and next clears its TIF_FREEZE. + */ + if (!freezing(p)) { + rmb(); + if (frozen(p)) + return false; - task_lock(p); - if (freezing(p)) { - if (has_mm(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); - } else { - if (with_mm_only) - ret = 0; - else - wake_up_state(p, TASK_INTERRUPTIBLE); - } + if (!sig_only || should_send_signal(p)) + set_freeze_flag(p); + else + return false; + } + + if (should_send_signal(p)) { + if (!signal_pending(p)) + fake_signal_wake_up(p); + } else if (sig_only) { + return false; } else { - rmb(); - if (frozen(p)) { - ret = 0; - } else { - if (has_mm(p)) { - set_freeze_flag(p); - fake_signal_wake_up(p); - } else { - if (with_mm_only) { - ret = 0; - } else { - set_freeze_flag(p); - wake_up_state(p, TASK_INTERRUPTIBLE); - } - } - } + wake_up_state(p, TASK_INTERRUPTIBLE); } - task_unlock(p); - return ret; + + return true; } static void cancel_freezing(struct task_struct *p) @@ -156,7 +143,7 @@ static void cancel_freezing(struct task_struct *p) } } -static int try_to_freeze_tasks(int freeze_user_space) +static int try_to_freeze_tasks(bool sig_only) { struct task_struct *g, *p; unsigned long end_time; @@ -175,7 +162,7 @@ static int try_to_freeze_tasks(int freeze_user_space) if (frozen(p) || !freezeable(p)) continue; - if (!freeze_task(p, freeze_user_space)) + if (!freeze_task(p, sig_only)) continue; /* @@ -235,13 +222,13 @@ int freeze_processes(void) int error; printk("Freezing user space processes ... "); - error = try_to_freeze_tasks(FREEZER_USER_SPACE); + error = try_to_freeze_tasks(true); if (error) goto Exit; printk("done.\n"); printk("Freezing remaining freezable tasks ... "); - error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); + error = try_to_freeze_tasks(false); if (error) goto Exit; printk("done."); @@ -251,7 +238,7 @@ int freeze_processes(void) return error; } -static void thaw_tasks(int thaw_user_space) +static void thaw_tasks(bool nosig_only) { struct task_struct *g, *p; @@ -260,7 +247,7 @@ static void thaw_tasks(int thaw_user_space) if (!freezeable(p)) continue; - if (!p->mm == thaw_user_space) + if (nosig_only && should_send_signal(p)) continue; thaw_process(p); @@ -271,8 +258,8 @@ static void thaw_tasks(int thaw_user_space) void thaw_processes(void) { printk("Restarting tasks ... "); - thaw_tasks(FREEZER_KERNEL_THREADS); - thaw_tasks(FREEZER_USER_SPACE); + thaw_tasks(true); + thaw_tasks(false); schedule(); printk("done.\n"); } diff --git a/kernel/power/user.c b/kernel/power/user.c index f5512cb3aa86..a6332a313262 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -23,6 +23,7 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> +#include <linux/smp_lock.h> #include <asm/uaccess.h> @@ -69,16 +70,22 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - return -EBUSY; + mutex_lock(&pm_mutex); + + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } if ((filp->f_flags & O_ACCMODE) == O_RDWR) { atomic_inc(&snapshot_device_available); - return -ENOSYS; + error = -ENOSYS; + goto Unlock; } if(create_basic_memory_bitmaps()) { atomic_inc(&snapshot_device_available); - return -ENOMEM; + error = -ENOMEM; + goto Unlock; } nonseekable_open(inode, filp); data = &snapshot_state; @@ -98,33 +105,36 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) pm_notifier_call_chain(PM_POST_HIBERNATION); } - if (error) { + if (error) atomic_inc(&snapshot_device_available); - return error; - } data->frozen = 0; data->ready = 0; data->platform_support = 0; - return 0; + Unlock: + mutex_unlock(&pm_mutex); + + return error; } static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; + mutex_lock(&pm_mutex); + swsusp_free(); free_basic_memory_bitmaps(); data = filp->private_data; free_all_swap_pages(data->swap); - if (data->frozen) { - mutex_lock(&pm_mutex); + if (data->frozen) thaw_processes(); - mutex_unlock(&pm_mutex); - } pm_notifier_call_chain(data->mode == O_WRONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); + + mutex_unlock(&pm_mutex); + return 0; } @@ -134,9 +144,13 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, struct snapshot_data *data; ssize_t res; + mutex_lock(&pm_mutex); + data = filp->private_data; - if (!data->ready) - return -ENODATA; + if (!data->ready) { + res = -ENODATA; + goto Unlock; + } res = snapshot_read_next(&data->handle, count); if (res > 0) { if (copy_to_user(buf, data_of(data->handle), res)) @@ -144,6 +158,10 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, else *offp = data->handle.offset; } + + Unlock: + mutex_unlock(&pm_mutex); + return res; } @@ -153,6 +171,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, struct snapshot_data *data; ssize_t res; + mutex_lock(&pm_mutex); + data = filp->private_data; res = snapshot_write_next(&data->handle, count); if (res > 0) { @@ -161,11 +181,14 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, else *offp = data->handle.offset; } + + mutex_unlock(&pm_mutex); + return res; } -static int snapshot_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static long snapshot_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) { int error = 0; struct snapshot_data *data; @@ -179,6 +202,9 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!mutex_trylock(&pm_mutex)) + return -EBUSY; + data = filp->private_data; switch (cmd) { @@ -186,7 +212,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case SNAPSHOT_FREEZE: if (data->frozen) break; - mutex_lock(&pm_mutex); printk("Syncing filesystems ... "); sys_sync(); printk("done.\n"); @@ -194,7 +219,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = freeze_processes(); if (error) thaw_processes(); - mutex_unlock(&pm_mutex); if (!error) data->frozen = 1; break; @@ -202,9 +226,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; - mutex_lock(&pm_mutex); thaw_processes(); - mutex_unlock(&pm_mutex); data->frozen = 0; break; @@ -307,16 +329,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - if (!mutex_trylock(&pm_mutex)) { - error = -EBUSY; - break; - } /* * Tasks are frozen and the notifiers have been called with * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); - mutex_unlock(&pm_mutex); break; case SNAPSHOT_PLATFORM_SUPPORT: @@ -390,6 +407,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, } + mutex_unlock(&pm_mutex); + return error; } @@ -399,7 +418,7 @@ static const struct file_operations snapshot_fops = { .read = snapshot_read, .write = snapshot_write, .llseek = no_llseek, - .ioctl = snapshot_ioctl, + .unlocked_ioctl = snapshot_ioctl, }; static struct miscdevice snapshot_device = { diff --git a/kernel/profile.c b/kernel/profile.c index ae7ead82cbc9..58926411eb2a 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -252,7 +252,7 @@ static void profile_flip_buffers(void) mutex_lock(&profile_flip_mutex); j = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; for (i = 0; i < NR_PROFILE_HIT; ++i) { @@ -275,7 +275,7 @@ static void profile_discard_flip_buffers(void) mutex_lock(&profile_flip_mutex); i = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); @@ -558,7 +558,7 @@ static int __init create_hash_tables(void) out_cleanup: prof_on = 0; smp_mb(); - on_each_cpu(profile_nop, NULL, 0, 1); + on_each_cpu(profile_nop, NULL, 1); for_each_online_cpu(cpu) { struct page *page; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e337390fce01..8392a9da6450 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -33,13 +33,9 @@ */ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_list)); - if (child->parent == new_parent) - return; - list_add(&child->ptrace_list, &child->parent->ptrace_children); - remove_parent(child); + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); child->parent = new_parent; - add_parent(child); } /* @@ -73,12 +69,8 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); child->ptrace = 0; - if (ptrace_reparented(child)) { - list_del_init(&child->ptrace_list); - remove_parent(child); - child->parent = child->real_parent; - add_parent(child); - } + child->parent = child->real_parent; + list_del_init(&child->ptrace_entry); if (task_is_traced(child)) ptrace_untrace(child); @@ -492,15 +484,34 @@ int ptrace_traceme(void) /* * Are we already being traced? */ +repeat: task_lock(current); if (!(current->ptrace & PT_PTRACED)) { + /* + * See ptrace_attach() comments about the locking here. + */ + unsigned long flags; + if (!write_trylock_irqsave(&tasklist_lock, flags)) { + task_unlock(current); + do { + cpu_relax(); + } while (!write_can_lock(&tasklist_lock)); + goto repeat; + } + ret = security_ptrace(current->parent, current, PTRACE_MODE_ATTACH); + /* * Set the ptrace bit in the process ptrace flags. + * Then link us on our parent's ptraced list. */ - if (!ret) + if (!ret) { current->ptrace |= PT_PTRACED; + __ptrace_link(current, current->real_parent); + } + + write_unlock_irqrestore(&tasklist_lock, flags); } task_unlock(current); return ret; diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 65c0906080ef..16eeeaa9d618 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -387,6 +387,10 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp, rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + + local_irq_disable(); + this_rdp->qlen += rdp->qlen; + local_irq_enable(); } static void rcu_offline_cpu(int cpu) @@ -516,10 +520,38 @@ void rcu_check_callbacks(int cpu, int user) if (user || (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + + /* + * Get here if this CPU took its interrupt from user + * mode or from the idle loop, and if this is not a + * nested interrupt. In this case, the CPU is in + * a quiescent state, so count it. + * + * Also do a memory barrier. This is needed to handle + * the case where writes from a preempt-disable section + * of code get reordered into schedule() by this CPU's + * write buffer. The memory barrier makes sure that + * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see + * by other CPUs to happen after any such write. + */ + + smp_mb(); /* See above block comment. */ rcu_qsctr_inc(cpu); rcu_bh_qsctr_inc(cpu); - } else if (!in_softirq()) + + } else if (!in_softirq()) { + + /* + * Get here if this CPU did not take its interrupt from + * softirq, in other words, if it is not interrupting + * a rcu_bh read-side critical section. This is an _bh + * critical section, so count it. The memory barrier + * is needed for the same reason as is the above one. + */ + + smp_mb(); /* See above block comment. */ rcu_bh_qsctr_inc(cpu); + } raise_rcu_softirq(); } diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c09605f8d16c..f14f372cf6f5 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -39,16 +39,16 @@ #include <linux/sched.h> #include <asm/atomic.h> #include <linux/bitops.h> -#include <linux/completion.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/module.h> -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; +enum rcu_barrier { + RCU_BARRIER_STD, + RCU_BARRIER_BH, + RCU_BARRIER_SCHED, }; static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; @@ -60,7 +60,7 @@ static struct completion rcu_barrier_completion; * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. */ -static void wakeme_after_rcu(struct rcu_head *head) +void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; @@ -77,17 +77,7 @@ static void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ -void synchronize_rcu(void) -{ - struct rcu_synchronize rcu; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished */ - call_rcu(&rcu.head, wakeme_after_rcu); - - /* Wait for it */ - wait_for_completion(&rcu.completion); -} +synchronize_rcu_xxx(synchronize_rcu, call_rcu) EXPORT_SYMBOL_GPL(synchronize_rcu); static void rcu_barrier_callback(struct rcu_head *notused) @@ -99,19 +89,30 @@ static void rcu_barrier_callback(struct rcu_head *notused) /* * Called with preemption disabled, and from cross-cpu IRQ context. */ -static void rcu_barrier_func(void *notused) +static void rcu_barrier_func(void *type) { int cpu = smp_processor_id(); struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); atomic_inc(&rcu_barrier_cpu_count); - call_rcu(head, rcu_barrier_callback); + switch ((enum rcu_barrier)type) { + case RCU_BARRIER_STD: + call_rcu(head, rcu_barrier_callback); + break; + case RCU_BARRIER_BH: + call_rcu_bh(head, rcu_barrier_callback); + break; + case RCU_BARRIER_SCHED: + call_rcu_sched(head, rcu_barrier_callback); + break; + } } -/** - * rcu_barrier - Wait until all the in-flight RCUs are complete. +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. */ -void rcu_barrier(void) +static void _rcu_barrier(enum rcu_barrier type) { BUG_ON(in_interrupt()); /* Take cpucontrol mutex to protect against CPU hotplug */ @@ -127,13 +128,39 @@ void rcu_barrier(void) * until all the callbacks are queued. */ rcu_read_lock(); - on_each_cpu(rcu_barrier_func, NULL, 0, 1); + on_each_cpu(rcu_barrier_func, (void *)type, 1); rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } + +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(RCU_BARRIER_STD); +} EXPORT_SYMBOL_GPL(rcu_barrier); +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(RCU_BARRIER_BH); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(RCU_BARRIER_SCHED); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + void __init rcu_init(void) { __rcu_init(); diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 9bf445664457..6f62b77d93c4 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -46,11 +46,11 @@ #include <asm/atomic.h> #include <linux/bitops.h> #include <linux/module.h> +#include <linux/kthread.h> #include <linux/completion.h> #include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/notifier.h> -#include <linux/rcupdate.h> #include <linux/cpu.h> #include <linux/random.h> #include <linux/delay.h> @@ -82,14 +82,18 @@ struct rcu_data { spinlock_t lock; /* Protect rcu_data fields. */ long completed; /* Number of last completed batch. */ int waitlistcount; - struct tasklet_struct rcu_tasklet; struct rcu_head *nextlist; struct rcu_head **nexttail; struct rcu_head *waitlist[GP_STAGES]; struct rcu_head **waittail[GP_STAGES]; - struct rcu_head *donelist; + struct rcu_head *donelist; /* from waitlist & waitschedlist */ struct rcu_head **donetail; long rcu_flipctr[2]; + struct rcu_head *nextschedlist; + struct rcu_head **nextschedtail; + struct rcu_head *waitschedlist; + struct rcu_head **waitschedtail; + int rcu_sched_sleeping; #ifdef CONFIG_RCU_TRACE struct rcupreempt_trace trace; #endif /* #ifdef CONFIG_RCU_TRACE */ @@ -131,11 +135,24 @@ enum rcu_try_flip_states { rcu_try_flip_waitmb_state, }; +/* + * States for rcu_ctrlblk.rcu_sched_sleep. + */ + +enum rcu_sched_sleep_states { + rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */ + rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */ + rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */ +}; + struct rcu_ctrlblk { spinlock_t fliplock; /* Protect state-machine transitions. */ long completed; /* Number of last completed batch. */ enum rcu_try_flip_states rcu_try_flip_state; /* The current state of the rcu state machine */ + spinlock_t schedlock; /* Protect rcu_sched sleep state. */ + enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */ + wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ }; static DEFINE_PER_CPU(struct rcu_data, rcu_data); @@ -143,8 +160,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), .completed = 0, .rcu_try_flip_state = rcu_try_flip_idle_state, + .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock), + .sched_sleep = rcu_sched_not_sleeping, + .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq), }; +static struct task_struct *rcu_sched_grace_period_task; #ifdef CONFIG_RCU_TRACE static char *rcu_try_flip_state_names[] = @@ -207,6 +228,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) */ #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); +#define RCU_SCHED_BATCH_TIME (HZ / 50) + /* * Return the number of RCU batches processed thus far. Useful * for debug and statistics. @@ -411,32 +434,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } -#ifdef CONFIG_NO_HZ +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { + .dynticks = 1, +}; -DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; -static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); +#ifdef CONFIG_NO_HZ static DEFINE_PER_CPU(int, rcu_update_flag); /** * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. * * If the CPU was idle with dynamic ticks active, this updates the - * dynticks_progress_counter to let the RCU handling know that the + * rcu_dyntick_sched.dynticks to let the RCU handling know that the * CPU is active. */ void rcu_irq_enter(void) { int cpu = smp_processor_id(); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); if (per_cpu(rcu_update_flag, cpu)) per_cpu(rcu_update_flag, cpu)++; /* * Only update if we are coming from a stopped ticks mode - * (dynticks_progress_counter is even). + * (rcu_dyntick_sched.dynticks is even). */ if (!in_interrupt() && - (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { + (rdssp->dynticks & 0x1) == 0) { /* * The following might seem like we could have a race * with NMI/SMIs. But this really isn't a problem. @@ -459,12 +484,12 @@ void rcu_irq_enter(void) * RCU read-side critical sections on this CPU would * have already completed. */ - per_cpu(dynticks_progress_counter, cpu)++; + rdssp->dynticks++; /* * The following memory barrier ensures that any * rcu_read_lock() primitives in the irq handler * are seen by other CPUs to follow the above - * increment to dynticks_progress_counter. This is + * increment to rcu_dyntick_sched.dynticks. This is * required in order for other CPUs to correctly * determine when it is safe to advance the RCU * grace-period state machine. @@ -472,7 +497,7 @@ void rcu_irq_enter(void) smp_mb(); /* see above block comment. */ /* * Since we can't determine the dynamic tick mode from - * the dynticks_progress_counter after this routine, + * the rcu_dyntick_sched.dynticks after this routine, * we use a second flag to acknowledge that we came * from an idle state with ticks stopped. */ @@ -480,7 +505,7 @@ void rcu_irq_enter(void) /* * If we take an NMI/SMI now, they will also increment * the rcu_update_flag, and will not update the - * dynticks_progress_counter on exit. That is for + * rcu_dyntick_sched.dynticks on exit. That is for * this IRQ to do. */ } @@ -490,12 +515,13 @@ void rcu_irq_enter(void) * rcu_irq_exit - Called from exiting Hard irq context. * * If the CPU was idle with dynamic ticks active, update the - * dynticks_progress_counter to put let the RCU handling be + * rcu_dyntick_sched.dynticks to put let the RCU handling be * aware that the CPU is going back to idle with no ticks. */ void rcu_irq_exit(void) { int cpu = smp_processor_id(); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); /* * rcu_update_flag is set if we interrupted the CPU @@ -503,7 +529,7 @@ void rcu_irq_exit(void) * Once this occurs, we keep track of interrupt nesting * because a NMI/SMI could also come in, and we still * only want the IRQ that started the increment of the - * dynticks_progress_counter to be the one that modifies + * rcu_dyntick_sched.dynticks to be the one that modifies * it on exit. */ if (per_cpu(rcu_update_flag, cpu)) { @@ -515,28 +541,29 @@ void rcu_irq_exit(void) /* * If an NMI/SMI happens now we are still - * protected by the dynticks_progress_counter being odd. + * protected by the rcu_dyntick_sched.dynticks being odd. */ /* * The following memory barrier ensures that any * rcu_read_unlock() primitives in the irq handler * are seen by other CPUs to preceed the following - * increment to dynticks_progress_counter. This + * increment to rcu_dyntick_sched.dynticks. This * is required in order for other CPUs to determine * when it is safe to advance the RCU grace-period * state machine. */ smp_mb(); /* see above block comment. */ - per_cpu(dynticks_progress_counter, cpu)++; - WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); + rdssp->dynticks++; + WARN_ON(rdssp->dynticks & 0x1); } } static void dyntick_save_progress_counter(int cpu) { - per_cpu(rcu_dyntick_snapshot, cpu) = - per_cpu(dynticks_progress_counter, cpu); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->dynticks_snap = rdssp->dynticks; } static inline int @@ -544,9 +571,10 @@ rcu_try_flip_waitack_needed(int cpu) { long curr; long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - curr = per_cpu(dynticks_progress_counter, cpu); - snap = per_cpu(rcu_dyntick_snapshot, cpu); + curr = rdssp->dynticks; + snap = rdssp->dynticks_snap; smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ /* @@ -567,7 +595,7 @@ rcu_try_flip_waitack_needed(int cpu) * that this CPU already acknowledged the counter. */ - if ((curr - snap) > 2 || (snap & 0x1) == 0) + if ((curr - snap) > 2 || (curr & 0x1) == 0) return 0; /* We need this CPU to explicitly acknowledge the counter flip. */ @@ -580,9 +608,10 @@ rcu_try_flip_waitmb_needed(int cpu) { long curr; long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - curr = per_cpu(dynticks_progress_counter, cpu); - snap = per_cpu(rcu_dyntick_snapshot, cpu); + curr = rdssp->dynticks; + snap = rdssp->dynticks_snap; smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ /* @@ -609,14 +638,86 @@ rcu_try_flip_waitmb_needed(int cpu) return 1; } +static void dyntick_save_progress_counter_sched(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_dynticks_snap = rdssp->dynticks; +} + +static int rcu_qsctr_inc_needed_dyntick(int cpu) +{ + long curr; + long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + curr = rdssp->dynticks; + snap = rdssp->sched_dynticks_snap; + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot be in the middle of an rcu_read_lock(), so + * the next rcu_read_lock() it executes must use the new value + * of the counter. Therefore, this CPU has been in a quiescent + * state the entire time, and we don't need to wait for it. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq handlers, then, as above, this CPU has already + * passed through a quiescent state. + */ + + if ((curr - snap) > 2 || (snap & 0x1) == 0) + return 0; + + /* We need this CPU to go through a quiescent state. */ + + return 1; +} + #else /* !CONFIG_NO_HZ */ -# define dyntick_save_progress_counter(cpu) do { } while (0) -# define rcu_try_flip_waitack_needed(cpu) (1) -# define rcu_try_flip_waitmb_needed(cpu) (1) +# define dyntick_save_progress_counter(cpu) do { } while (0) +# define rcu_try_flip_waitack_needed(cpu) (1) +# define rcu_try_flip_waitmb_needed(cpu) (1) + +# define dyntick_save_progress_counter_sched(cpu) do { } while (0) +# define rcu_qsctr_inc_needed_dyntick(cpu) (1) #endif /* CONFIG_NO_HZ */ +static void save_qsctr_sched(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_qs_snap = rdssp->sched_qs; +} + +static inline int rcu_qsctr_inc_needed(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + /* + * If there has been a quiescent state, no more need to wait + * on this CPU. + */ + + if (rdssp->sched_qs != rdssp->sched_qs_snap) { + smp_mb(); /* force ordering with cpu entering schedule(). */ + return 0; + } + + /* We need this CPU to go through a quiescent state. */ + + return 1; +} + /* * Get here when RCU is idle. Decide whether we need to * move out of idle state, and return non-zero if so. @@ -819,6 +920,26 @@ void rcu_check_callbacks(int cpu, int user) unsigned long flags; struct rcu_data *rdp = RCU_DATA_CPU(cpu); + /* + * If this CPU took its interrupt from user mode or from the + * idle loop, and this is not a nested interrupt, then + * this CPU has to have exited all prior preept-disable + * sections of code. So increment the counter to note this. + * + * The memory barrier is needed to handle the case where + * writes from a preempt-disable section of code get reordered + * into schedule() by this CPU's write buffer. So the memory + * barrier makes sure that the rcu_qsctr_inc() is seen by other + * CPUs to happen after any such write. + */ + + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + smp_mb(); /* Guard against aggressive schedule(). */ + rcu_qsctr_inc(cpu); + } + rcu_check_mb(cpu); if (rcu_ctrlblk.completed == rdp->completed) rcu_try_flip(); @@ -869,6 +990,8 @@ void rcu_offline_cpu(int cpu) struct rcu_head *list = NULL; unsigned long flags; struct rcu_data *rdp = RCU_DATA_CPU(cpu); + struct rcu_head *schedlist = NULL; + struct rcu_head **schedtail = &schedlist; struct rcu_head **tail = &list; /* @@ -882,6 +1005,11 @@ void rcu_offline_cpu(int cpu) rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], list, tail); rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); + rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail, + schedlist, schedtail); + rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail, + schedlist, schedtail); + rdp->rcu_sched_sleeping = 0; spin_unlock_irqrestore(&rdp->lock, flags); rdp->waitlistcount = 0; @@ -916,12 +1044,15 @@ void rcu_offline_cpu(int cpu) * fix. */ - local_irq_save(flags); + local_irq_save(flags); /* disable preempt till we know what lock. */ rdp = RCU_DATA_ME(); spin_lock(&rdp->lock); *rdp->nexttail = list; if (list) rdp->nexttail = tail; + *rdp->nextschedtail = schedlist; + if (schedlist) + rdp->nextschedtail = schedtail; spin_unlock_irqrestore(&rdp->lock, flags); } @@ -936,10 +1067,25 @@ void rcu_offline_cpu(int cpu) void __cpuinit rcu_online_cpu(int cpu) { unsigned long flags; + struct rcu_data *rdp; spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); cpu_set(cpu, rcu_cpu_online_map); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); + + /* + * The rcu_sched grace-period processing might have bypassed + * this CPU, given that it was not in the rcu_cpu_online_map + * when the grace-period scan started. This means that the + * grace-period task might sleep. So make sure that if this + * should happen, the first callback posted to this CPU will + * wake up the grace-period task if need be. + */ + + rdp = RCU_DATA_CPU(cpu); + spin_lock_irqsave(&rdp->lock, flags); + rdp->rcu_sched_sleeping = 1; + spin_unlock_irqrestore(&rdp->lock, flags); } static void rcu_process_callbacks(struct softirq_action *unused) @@ -982,31 +1128,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) *rdp->nexttail = head; rdp->nexttail = &head->next; RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); - spin_unlock(&rdp->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&rdp->lock, flags); } EXPORT_SYMBOL_GPL(call_rcu); +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + int wake_gp = 0; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = RCU_DATA_ME(); + spin_lock(&rdp->lock); + *rdp->nextschedtail = head; + rdp->nextschedtail = &head->next; + if (rdp->rcu_sched_sleeping) { + + /* Grace-period processing might be sleeping... */ + + rdp->rcu_sched_sleeping = 0; + wake_gp = 1; + } + spin_unlock_irqrestore(&rdp->lock, flags); + if (wake_gp) { + + /* Wake up grace-period processing, unless someone beat us. */ + + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping) + wake_gp = 0; + rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + if (wake_gp) + wake_up_interruptible(&rcu_ctrlblk.sched_wq); + } +} +EXPORT_SYMBOL_GPL(call_rcu_sched); + /* * Wait until all currently running preempt_disable() code segments * (including hardware-irq-disable segments) complete. Note that * in -rt this does -not- necessarily result in all currently executing * interrupt -handlers- having completed. */ -void __synchronize_sched(void) +synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) +EXPORT_SYMBOL_GPL(__synchronize_sched); + +/* + * kthread function that manages call_rcu_sched grace periods. + */ +static int rcu_sched_grace_period(void *arg) { - cpumask_t oldmask; + int couldsleep; /* might sleep after current pass. */ + int couldsleepnext = 0; /* might sleep after next pass. */ int cpu; + unsigned long flags; + struct rcu_data *rdp; + int ret; - if (sched_getaffinity(0, &oldmask) < 0) - oldmask = cpu_possible_map; - for_each_online_cpu(cpu) { - sched_setaffinity(0, &cpumask_of_cpu(cpu)); - schedule(); - } - sched_setaffinity(0, &oldmask); + /* + * Each pass through the following loop handles one + * rcu_sched grace period cycle. + */ + do { + /* Save each CPU's current state. */ + + for_each_online_cpu(cpu) { + dyntick_save_progress_counter_sched(cpu); + save_qsctr_sched(cpu); + } + + /* + * Sleep for about an RCU grace-period's worth to + * allow better batching and to consume less CPU. + */ + schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME); + + /* + * If there was nothing to do last time, prepare to + * sleep at the end of the current grace period cycle. + */ + couldsleep = couldsleepnext; + couldsleepnext = 1; + if (couldsleep) { + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + } + + /* + * Wait on each CPU in turn to have either visited + * a quiescent state or been in dynticks-idle mode. + */ + for_each_online_cpu(cpu) { + while (rcu_qsctr_inc_needed(cpu) && + rcu_qsctr_inc_needed_dyntick(cpu)) { + /* resched_cpu(cpu); @@@ */ + schedule_timeout_interruptible(1); + } + } + + /* Advance callbacks for each CPU. */ + + for_each_online_cpu(cpu) { + + rdp = RCU_DATA_CPU(cpu); + spin_lock_irqsave(&rdp->lock, flags); + + /* + * We are running on this CPU irq-disabled, so no + * CPU can go offline until we re-enable irqs. + * The current CPU might have already gone + * offline (between the for_each_offline_cpu and + * the spin_lock_irqsave), but in that case all its + * callback lists will be empty, so no harm done. + * + * Advance the callbacks! We share normal RCU's + * donelist, since callbacks are invoked the + * same way in either case. + */ + if (rdp->waitschedlist != NULL) { + *rdp->donetail = rdp->waitschedlist; + rdp->donetail = rdp->waitschedtail; + + /* + * Next rcu_check_callbacks() will + * do the required raise_softirq(). + */ + } + if (rdp->nextschedlist != NULL) { + rdp->waitschedlist = rdp->nextschedlist; + rdp->waitschedtail = rdp->nextschedtail; + couldsleep = 0; + couldsleepnext = 0; + } else { + rdp->waitschedlist = NULL; + rdp->waitschedtail = &rdp->waitschedlist; + } + rdp->nextschedlist = NULL; + rdp->nextschedtail = &rdp->nextschedlist; + + /* Mark sleep intention. */ + + rdp->rcu_sched_sleeping = couldsleep; + + spin_unlock_irqrestore(&rdp->lock, flags); + } + + /* If we saw callbacks on the last scan, go deal with them. */ + + if (!couldsleep) + continue; + + /* Attempt to block... */ + + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) { + + /* + * Someone posted a callback after we scanned. + * Go take care of it. + */ + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + couldsleepnext = 0; + continue; + } + + /* Block until the next person posts a callback. */ + + rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + ret = 0; + __wait_event_interruptible(rcu_ctrlblk.sched_wq, + rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, + ret); + + /* + * Signals would prevent us from sleeping, and we cannot + * do much with them in any case. So flush them. + */ + if (ret) + flush_signals(current); + couldsleepnext = 0; + + } while (!kthread_should_stop()); + + return (0); } -EXPORT_SYMBOL_GPL(__synchronize_sched); /* * Check to see if any future RCU-related work will need to be done @@ -1023,7 +1334,9 @@ int rcu_needs_cpu(int cpu) return (rdp->donelist != NULL || !!rdp->waitlistcount || - rdp->nextlist != NULL); + rdp->nextlist != NULL || + rdp->nextschedlist != NULL || + rdp->waitschedlist != NULL); } int rcu_pending(int cpu) @@ -1034,7 +1347,9 @@ int rcu_pending(int cpu) if (rdp->donelist != NULL || !!rdp->waitlistcount || - rdp->nextlist != NULL) + rdp->nextlist != NULL || + rdp->nextschedlist != NULL || + rdp->waitschedlist != NULL) return 1; /* The RCU core needs an acknowledgement from this CPU. */ @@ -1101,6 +1416,11 @@ void __init __rcu_init(void) rdp->donetail = &rdp->donelist; rdp->rcu_flipctr[0] = 0; rdp->rcu_flipctr[1] = 0; + rdp->nextschedlist = NULL; + rdp->nextschedtail = &rdp->nextschedlist; + rdp->waitschedlist = NULL; + rdp->waitschedtail = &rdp->waitschedlist; + rdp->rcu_sched_sleeping = 0; } register_cpu_notifier(&rcu_nb); @@ -1123,11 +1443,15 @@ void __init __rcu_init(void) } /* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. + * Late-boot-time RCU initialization that must wait until after scheduler + * has been initialized. */ -void synchronize_kernel(void) +void __init rcu_init_sched(void) { - synchronize_rcu(); + rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period, + NULL, + "rcu_sched_grace_period"); + WARN_ON(IS_ERR(rcu_sched_grace_period_task)); } #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 49ac4947af24..5edf82c34bbc 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -38,7 +38,6 @@ #include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/notifier.h> -#include <linux/rcupdate.h> #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/rcupreempt_trace.h> diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 33acc424667e..90b5b123f7a1 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -57,7 +57,9 @@ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ -static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ +static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ +static int stutter = 5; /* Start/stop testing interval (in sec) */ +static int irqreader = 1; /* RCU readers from irq (timers). */ static char *torture_type = "rcu"; /* What RCU implementation to torture. */ module_param(nreaders, int, 0444); @@ -72,6 +74,10 @@ module_param(test_no_idle_hz, bool, 0444); MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); module_param(shuffle_interval, int, 0444); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); +module_param(stutter, int, 0444); +MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); +module_param(irqreader, int, 0444); +MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -91,6 +97,7 @@ static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *shuffler_task; +static struct task_struct *stutter_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -117,8 +124,18 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; +static long n_rcu_torture_timers = 0; static struct list_head rcu_torture_removed; +static int stutter_pause_test = 0; + +#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) +#define RCUTORTURE_RUNNABLE_INIT 1 +#else +#define RCUTORTURE_RUNNABLE_INIT 0 +#endif +int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; + /* * Allocate an element from the rcu_tortures pool. */ @@ -179,6 +196,16 @@ rcu_random(struct rcu_random_state *rrsp) return swahw32(rrsp->rrs_state); } +static void +rcu_stutter_wait(void) +{ + while (stutter_pause_test || !rcutorture_runnable) + if (rcutorture_runnable) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(round_jiffies_relative(HZ)); +} + /* * Operations vector for selecting different types of tests. */ @@ -192,7 +219,9 @@ struct rcu_torture_ops { int (*completed)(void); void (*deferredfree)(struct rcu_torture *p); void (*sync)(void); + void (*cb_barrier)(void); int (*stats)(char *page); + int irqcapable; char *name; }; static struct rcu_torture_ops *cur_ops = NULL; @@ -265,7 +294,9 @@ static struct rcu_torture_ops rcu_ops = { .completed = rcu_torture_completed, .deferredfree = rcu_torture_deferred_free, .sync = synchronize_rcu, + .cb_barrier = rcu_barrier, .stats = NULL, + .irqcapable = 1, .name = "rcu" }; @@ -304,7 +335,9 @@ static struct rcu_torture_ops rcu_sync_ops = { .completed = rcu_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = synchronize_rcu, + .cb_barrier = NULL, .stats = NULL, + .irqcapable = 1, .name = "rcu_sync" }; @@ -364,7 +397,9 @@ static struct rcu_torture_ops rcu_bh_ops = { .completed = rcu_bh_torture_completed, .deferredfree = rcu_bh_torture_deferred_free, .sync = rcu_bh_torture_synchronize, + .cb_barrier = rcu_barrier_bh, .stats = NULL, + .irqcapable = 1, .name = "rcu_bh" }; @@ -377,7 +412,9 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .completed = rcu_bh_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = rcu_bh_torture_synchronize, + .cb_barrier = NULL, .stats = NULL, + .irqcapable = 1, .name = "rcu_bh_sync" }; @@ -458,6 +495,7 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = srcu_torture_synchronize, + .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu" }; @@ -482,6 +520,11 @@ static int sched_torture_completed(void) return 0; } +static void rcu_sched_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); +} + static void sched_torture_synchronize(void) { synchronize_sched(); @@ -494,12 +537,28 @@ static struct rcu_torture_ops sched_ops = { .readdelay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, .completed = sched_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, + .deferredfree = rcu_sched_torture_deferred_free, .sync = sched_torture_synchronize, + .cb_barrier = rcu_barrier_sched, .stats = NULL, + .irqcapable = 1, .name = "sched" }; +static struct rcu_torture_ops sched_ops_sync = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = sched_torture_synchronize, + .cb_barrier = NULL, + .stats = NULL, + .name = "sched_sync" +}; + /* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure @@ -537,6 +596,7 @@ rcu_torture_writer(void *arg) } rcu_torture_current_version++; oldbatch = cur_ops->completed(); + rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); while (!kthread_should_stop()) @@ -560,6 +620,7 @@ rcu_torture_fakewriter(void *arg) schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); cur_ops->sync(); + rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); @@ -569,6 +630,52 @@ rcu_torture_fakewriter(void *arg) } /* + * RCU torture reader from timer handler. Dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static void rcu_torture_timer(unsigned long unused) +{ + int idx; + int completed; + static DEFINE_RCU_RANDOM(rand); + static DEFINE_SPINLOCK(rand_lock); + struct rcu_torture *p; + int pipe_count; + + idx = cur_ops->readlock(); + completed = cur_ops->completed(); + p = rcu_dereference(rcu_torture_current); + if (p == NULL) { + /* Leave because rcu_torture_writer is not yet underway */ + cur_ops->readunlock(idx); + return; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + spin_lock(&rand_lock); + cur_ops->readdelay(&rand); + n_rcu_torture_timers++; + spin_unlock(&rand_lock); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_count)[pipe_count]; + completed = cur_ops->completed() - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_batch)[completed]; + preempt_enable(); + cur_ops->readunlock(idx); +} + +/* * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The * counter in the element should never be greater than 1, otherwise, the @@ -582,11 +689,18 @@ rcu_torture_reader(void *arg) DEFINE_RCU_RANDOM(rand); struct rcu_torture *p; int pipe_count; + struct timer_list t; VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); + if (irqreader && cur_ops->irqcapable) + setup_timer_on_stack(&t, rcu_torture_timer, 0); do { + if (irqreader && cur_ops->irqcapable) { + if (!timer_pending(&t)) + mod_timer(&t, 1); + } idx = cur_ops->readlock(); completed = cur_ops->completed(); p = rcu_dereference(rcu_torture_current); @@ -615,8 +729,11 @@ rcu_torture_reader(void *arg) preempt_enable(); cur_ops->readunlock(idx); schedule(); + rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + if (irqreader && cur_ops->irqcapable) + del_timer_sync(&t); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -647,20 +764,22 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d", + "rtmbe: %d nt: %ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free), - atomic_read(&n_rcu_torture_mberror)); + atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_timers); if (atomic_read(&n_rcu_torture_mberror) != 0) cnt += sprintf(&page[cnt], " !!!"); cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); if (i > 1) { cnt += sprintf(&page[cnt], "!!! "); atomic_inc(&n_rcu_torture_error); + WARN_ON_ONCE(1); } cnt += sprintf(&page[cnt], "Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) @@ -785,15 +904,34 @@ rcu_torture_shuffle(void *arg) return 0; } +/* Cause the rcutorture test to "stutter", starting and stopping all + * threads periodically. + */ +static int +rcu_torture_stutter(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); + do { + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 1; + if (!kthread_should_stop()) + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 0; + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); + return 0; +} + static inline void rcu_torture_print_module_parms(char *tag) { printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval = %d\n", + "shuffle_interval=%d stutter=%d irqreader=%d\n", torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval); + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + stutter, irqreader); } static void @@ -802,6 +940,11 @@ rcu_torture_cleanup(void) int i; fullstop = 1; + if (stutter_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); + kthread_stop(stutter_task); + } + stutter_task = NULL; if (shuffler_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); kthread_stop(shuffler_task); @@ -848,7 +991,9 @@ rcu_torture_cleanup(void) stats_task = NULL; /* Wait for all RCU callbacks to fire. */ - rcu_barrier(); + + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ @@ -868,7 +1013,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, - &srcu_ops, &sched_ops, }; + &srcu_ops, &sched_ops, &sched_ops_sync, }; /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { @@ -988,6 +1133,19 @@ rcu_torture_init(void) goto unwind; } } + if (stutter < 0) + stutter = 0; + if (stutter) { + /* Create the stutter thread */ + stutter_task = kthread_run(rcu_torture_stutter, NULL, + "rcu_torture_stutter"); + if (IS_ERR(stutter_task)) { + firsterr = PTR_ERR(stutter_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); + stutter_task = NULL; + goto unwind; + } + } return 0; unwind: diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 092e4c620af9..a56f629b057a 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -297,8 +297,8 @@ static int test_func(void *data) * * opcode:data */ -static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, - size_t count) +static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, + const char *buf, size_t count) { struct sched_param schedpar; struct test_thread_data *td; @@ -360,7 +360,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, * @dev: thread to query * @buf: char buffer to be filled with thread status info */ -static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) +static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) { struct test_thread_data *td; struct task_struct *tsk; diff --git a/kernel/sched.c b/kernel/sched.c index 99e6d850ecab..b1104ea5d255 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7737,11 +7737,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) +static ssize_t sched_mc_power_savings_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } static ssize_t sched_mc_power_savings_store(struct sys_device *dev, + struct sysdev_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); @@ -7751,11 +7753,13 @@ static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) +static ssize_t sched_smt_power_savings_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } static ssize_t sched_smt_power_savings_store(struct sys_device *dev, + struct sysdev_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); diff --git a/kernel/smp.c b/kernel/smp.c new file mode 100644 index 000000000000..462c785ca1ee --- /dev/null +++ b/kernel/smp.c @@ -0,0 +1,383 @@ +/* + * Generic helpers for smp ipi calls + * + * (C) Jens Axboe <jens.axboe@oracle.com> 2008 + * + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/rculist.h> +#include <linux/smp.h> + +static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); +static LIST_HEAD(call_function_queue); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); + +enum { + CSD_FLAG_WAIT = 0x01, + CSD_FLAG_ALLOC = 0x02, +}; + +struct call_function_data { + struct call_single_data csd; + spinlock_t lock; + unsigned int refs; + cpumask_t cpumask; + struct rcu_head rcu_head; +}; + +struct call_single_queue { + struct list_head list; + spinlock_t lock; +}; + +void __cpuinit init_call_single_data(void) +{ + int i; + + for_each_possible_cpu(i) { + struct call_single_queue *q = &per_cpu(call_single_queue, i); + + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->list); + } +} + +static void csd_flag_wait(struct call_single_data *data) +{ + /* Wait for response */ + do { + /* + * We need to see the flags store in the IPI handler + */ + smp_mb(); + if (!(data->flags & CSD_FLAG_WAIT)) + break; + cpu_relax(); + } while (1); +} + +/* + * Insert a previously allocated call_single_data element for execution + * on the given CPU. data must already have ->func, ->info, and ->flags set. + */ +static void generic_exec_single(int cpu, struct call_single_data *data) +{ + struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); + int wait = data->flags & CSD_FLAG_WAIT, ipi; + unsigned long flags; + + spin_lock_irqsave(&dst->lock, flags); + ipi = list_empty(&dst->list); + list_add_tail(&data->list, &dst->list); + spin_unlock_irqrestore(&dst->lock, flags); + + if (ipi) + arch_send_call_function_single_ipi(cpu); + + if (wait) + csd_flag_wait(data); +} + +static void rcu_free_call_data(struct rcu_head *head) +{ + struct call_function_data *data; + + data = container_of(head, struct call_function_data, rcu_head); + + kfree(data); +} + +/* + * Invoked by arch to handle an IPI for call function. Must be called with + * interrupts disabled. + */ +void generic_smp_call_function_interrupt(void) +{ + struct call_function_data *data; + int cpu = get_cpu(); + + /* + * It's ok to use list_for_each_rcu() here even though we may delete + * 'pos', since list_del_rcu() doesn't clear ->next + */ + rcu_read_lock(); + list_for_each_entry_rcu(data, &call_function_queue, csd.list) { + int refs; + + if (!cpu_isset(cpu, data->cpumask)) + continue; + + data->csd.func(data->csd.info); + + spin_lock(&data->lock); + cpu_clear(cpu, data->cpumask); + WARN_ON(data->refs == 0); + data->refs--; + refs = data->refs; + spin_unlock(&data->lock); + + if (refs) + continue; + + spin_lock(&call_function_lock); + list_del_rcu(&data->csd.list); + spin_unlock(&call_function_lock); + + if (data->csd.flags & CSD_FLAG_WAIT) { + /* + * serialize stores to data with the flag clear + * and wakeup + */ + smp_wmb(); + data->csd.flags &= ~CSD_FLAG_WAIT; + } else + call_rcu(&data->rcu_head, rcu_free_call_data); + } + rcu_read_unlock(); + + put_cpu(); +} + +/* + * Invoked by arch to handle an IPI for call function single. Must be called + * from the arch with interrupts disabled. + */ +void generic_smp_call_function_single_interrupt(void) +{ + struct call_single_queue *q = &__get_cpu_var(call_single_queue); + LIST_HEAD(list); + + /* + * Need to see other stores to list head for checking whether + * list is empty without holding q->lock + */ + smp_mb(); + while (!list_empty(&q->list)) { + unsigned int data_flags; + + spin_lock(&q->lock); + list_replace_init(&q->list, &list); + spin_unlock(&q->lock); + + while (!list_empty(&list)) { + struct call_single_data *data; + + data = list_entry(list.next, struct call_single_data, + list); + list_del(&data->list); + + /* + * 'data' can be invalid after this call if + * flags == 0 (when called through + * generic_exec_single(), so save them away before + * making the call. + */ + data_flags = data->flags; + + data->func(data->info); + + if (data_flags & CSD_FLAG_WAIT) { + smp_wmb(); + data->flags &= ~CSD_FLAG_WAIT; + } else if (data_flags & CSD_FLAG_ALLOC) + kfree(data); + } + /* + * See comment on outer loop + */ + smp_mb(); + } +} + +/* + * smp_call_function_single - Run a function on a specific CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Note that @wait + * will be implicitly turned on in case of allocation failures, since + * we fall back to on-stack allocation. + */ +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int wait) +{ + struct call_single_data d; + unsigned long flags; + /* prevent preemption and reschedule on another processor */ + int me = get_cpu(); + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + if (cpu == me) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } else { + struct call_single_data *data = NULL; + + if (!wait) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (data) + data->flags = CSD_FLAG_ALLOC; + } + if (!data) { + data = &d; + data->flags = CSD_FLAG_WAIT; + } + + data->func = func; + data->info = info; + generic_exec_single(cpu, data); + } + + put_cpu(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); + +/** + * __smp_call_function_single(): Run a function on another CPU + * @cpu: The CPU to run on. + * @data: Pre-allocated and setup data structure + * + * Like smp_call_function_single(), but allow caller to pass in a pre-allocated + * data structure. Useful for embedding @data inside other structures, for + * instance. + * + */ +void __smp_call_function_single(int cpu, struct call_single_data *data) +{ + /* Can deadlock when called with interrupts disabled */ + WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); + + generic_exec_single(cpu, data); +} + +/** + * smp_call_function_mask(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned. Note that @wait + * will be implicitly turned on in case of allocation failures, since + * we fall back to on-stack allocation. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. Preemption + * must be disabled when calling this function. + */ +int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, + int wait) +{ + struct call_function_data d; + struct call_function_data *data = NULL; + cpumask_t allbutself; + unsigned long flags; + int cpu, num_cpus; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + cpu = smp_processor_id(); + allbutself = cpu_online_map; + cpu_clear(cpu, allbutself); + cpus_and(mask, mask, allbutself); + num_cpus = cpus_weight(mask); + + /* + * If zero CPUs, return. If just a single CPU, turn this request + * into a targetted single call instead since it's faster. + */ + if (!num_cpus) + return 0; + else if (num_cpus == 1) { + cpu = first_cpu(mask); + return smp_call_function_single(cpu, func, info, wait); + } + + if (!wait) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (data) + data->csd.flags = CSD_FLAG_ALLOC; + } + if (!data) { + data = &d; + data->csd.flags = CSD_FLAG_WAIT; + wait = 1; + } + + spin_lock_init(&data->lock); + data->csd.func = func; + data->csd.info = info; + data->refs = num_cpus; + data->cpumask = mask; + + spin_lock_irqsave(&call_function_lock, flags); + list_add_tail_rcu(&data->csd.list, &call_function_queue); + spin_unlock_irqrestore(&call_function_lock, flags); + + /* Send a message to all CPUs in the map */ + arch_send_call_function_ipi(mask); + + /* optionally wait for the CPUs to complete */ + if (wait) + csd_flag_wait(&data->csd); + + return 0; +} +EXPORT_SYMBOL(smp_call_function_mask); + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. In case of allocation + * failure, @wait will be implicitly turned on. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function(void (*func)(void *), void *info, int wait) +{ + int ret; + + preempt_disable(); + ret = smp_call_function_mask(cpu_online_map, func, info, wait); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(smp_call_function); + +void ipi_call_lock(void) +{ + spin_lock(&call_function_lock); +} + +void ipi_call_unlock(void) +{ + spin_unlock(&call_function_lock); +} + +void ipi_call_lock_irq(void) +{ + spin_lock_irq(&call_function_lock); +} + +void ipi_call_unlock_irq(void) +{ + spin_unlock_irq(&call_function_lock); +} diff --git a/kernel/softirq.c b/kernel/softirq.c index 3e9e896fdc5b..81e2fe0f983a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -645,12 +645,12 @@ __init int spawn_ksoftirqd(void) /* * Call a function on all processors */ -int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) +int on_each_cpu(void (*func) (void *info), void *info, int wait) { int ret = 0; preempt_disable(); - ret = smp_call_function(func, info, retry, wait); + ret = smp_call_function(func, info, wait); local_irq_disable(); func(info); local_irq_enable(); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5b9b467de070..0fea0ee12da9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -59,6 +59,7 @@ cond_syscall(sys_epoll_create); cond_syscall(sys_epoll_ctl); cond_syscall(sys_epoll_wait); cond_syscall(sys_epoll_pwait); +cond_syscall(compat_sys_epoll_pwait); cond_syscall(sys_semget); cond_syscall(sys_semop); cond_syscall(sys_semtimedop); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ab59ac008caf..2a7b9d88706b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -83,6 +83,9 @@ extern int maps_protect; extern int sysctl_stat_interval; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; +#ifdef CONFIG_RCU_TORTURE_TEST +extern int rcutorture_runnable; +#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ #if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP) @@ -108,7 +111,7 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES extern char modprobe_path[]; #endif #ifdef CONFIG_CHR_DEV_SG @@ -473,7 +476,7 @@ static struct ctl_table kern_table[] = { .proc_handler = &ftrace_enable_sysctl, }, #endif -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES { .ctl_name = KERN_MODPROBE, .procname = "modprobe", @@ -832,6 +835,16 @@ static struct ctl_table kern_table[] = { .child = key_sysctls, }, #endif +#ifdef CONFIG_RCU_TORTURE_TEST + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rcutorture_runnable", + .data = &rcutorture_runnable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index dadde5361f32..b1c2da81b050 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -376,7 +376,8 @@ void clocksource_unregister(struct clocksource *cs) * Provides sysfs interface for listing current clocksource. */ static ssize_t -sysfs_show_current_clocksources(struct sys_device *dev, char *buf) +sysfs_show_current_clocksources(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) { ssize_t count = 0; @@ -397,6 +398,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf) * clocksource selction. */ static ssize_t sysfs_override_clocksource(struct sys_device *dev, + struct sysdev_attribute *attr, const char *buf, size_t count) { struct clocksource *ovr = NULL; @@ -449,7 +451,9 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, * Provides sysfs interface for listing registered clocksources */ static ssize_t -sysfs_show_available_clocksources(struct sys_device *dev, char *buf) +sysfs_show_available_clocksources(struct sys_device *dev, + struct sysdev_attribute *attr, + char *buf) { struct clocksource *src; ssize_t count = 0; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 67f80c261709..f48d0f09d32f 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -268,7 +268,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu) "offline CPU #%d\n", *oncpu); else smp_call_function_single(*oncpu, tick_do_broadcast_on_off, - &reason, 1, 1); + &reason, 1); } /* |