From af446b702c58b700cc5fa99f6edc78b99e55b995 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 10 Sep 2011 21:54:08 -0700 Subject: rcu: ->signaled better named ->fqs_state The ->signaled field was named before complications in the form of dyntick-idle mode and offlined CPUs. These complications have required that force_quiescent_state() be implemented as a state machine, instead of simply unconditionally sending reschedule IPIs. Therefore, this commit renames ->signaled to ->fqs_state to catch up with the new force_quiescent_state() reality. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6b76d812740c..5d0b55a3a8c0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; NUM_RCU_LVL_3, \ NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ }, \ - .signaled = RCU_GP_IDLE, \ + .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ @@ -866,8 +866,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Advance to a new grace period and initialize state. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); - rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ + WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); + rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); @@ -877,7 +877,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; - rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. 
*/ + rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); trace_rcu_grace_period_init(rsp->name, rnp->gpnum, @@ -927,7 +927,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp = rcu_get_root(rsp); raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } @@ -991,7 +991,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); - rsp->signaled = RCU_GP_IDLE; + rsp->fqs_state = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } @@ -1457,7 +1457,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) goto unlock_fqs_ret; /* no GP in progress, time updated. */ } rsp->fqs_active = 1; - switch (rsp->signaled) { + switch (rsp->fqs_state) { case RCU_GP_IDLE: case RCU_GP_INIT: @@ -1473,7 +1473,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) force_qs_rnp(rsp, dyntick_save_progress_counter); raw_spin_lock(&rnp->lock); /* irqs already disabled */ if (rcu_gp_in_progress(rsp)) - rsp->signaled = RCU_FORCE_QS; + rsp->fqs_state = RCU_FORCE_QS; break; case RCU_FORCE_QS: -- cgit v1.2.3-70-g09d2 From 9b2e4f1880b789be1f24f9684f7a54b90310b5c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Sep 2011 12:10:22 -0700 Subject: rcu: Track idleness independent of idle tasks Earlier versions of RCU used the scheduling-clock tick to detect idleness by checking for the idle task, but handled idleness differently for CONFIG_NO_HZ=y. 
But there are now a number of uses of RCU read-side critical sections in the idle task, for example, for tracing. A more fine-grained detection of idleness is therefore required. This commit presses the old dyntick-idle code into full-time service, so that rcu_idle_enter(), previously known as rcu_enter_nohz(), is always invoked at the beginning of an idle loop iteration. Similarly, rcu_idle_exit(), previously known as rcu_exit_nohz(), is always invoked at the end of an idle-loop iteration. This allows the idle task to use RCU everywhere except between consecutive rcu_idle_enter() and rcu_idle_exit() calls, in turn allowing architecture maintainers to specify exactly where in the idle loop RCU may be used. Because some of the userspace upcall uses can result in what looks to RCU like half of an interrupt, it is not possible to expect that the irq_enter() and irq_exit() hooks will give exact counts. This patch therefore expands the ->dynticks_nesting counter to 64 bits and uses two separate bitfields to count process/idle transitions and interrupt entry/exit transitions. It is presumed that userspace upcalls do not happen in the idle loop or from usermode execution (though usermode might do a system call that results in an upcall). The counter is hard-reset on each process/idle transition, which prevents the interrupt entry/exit error from accumulating. Overflow is avoided by the 64-bitness of the ->dynticks_nesting counter. This commit also adds warnings if a non-idle task asks RCU to enter idle state (and these checks will need some adjustment before applying Frederic's OS-jitter patches (http://lkml.org/lkml/2011/10/7/246)). In addition, validation of ->dynticks and ->dynticks_nesting is added. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/trace.txt | 4 - include/linux/hardirq.h | 21 ---- include/linux/rcupdate.h | 21 +--- include/linux/tick.h | 11 ++- include/trace/events/rcu.h | 10 +- kernel/rcutiny.c | 124 ++++++++++++++++++++---- kernel/rcutree.c | 229 +++++++++++++++++++++++++++++++------------- kernel/rcutree.h | 15 +-- kernel/rcutree_trace.c | 10 +- kernel/time/tick-sched.c | 6 +- 10 files changed, 297 insertions(+), 154 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index aaf65f6c6cd7..49587abfc2f7 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -105,14 +105,10 @@ o "dt" is the current value of the dyntick counter that is incremented or one greater than the interrupt-nesting depth otherwise. The number after the second "/" is the NMI nesting depth. - This field is displayed only for CONFIG_NO_HZ kernels. - o "df" is the number of times that some other CPU has forced a quiescent state on behalf of this CPU due to this CPU being in dynticks-idle state. - This field is displayed only for CONFIG_NO_HZ kernels. - o "of" is the number of times that some other CPU has forced a quiescent state on behalf of this CPU due to this CPU being offline. 
In a perfect world, this might never happen, but it diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f743883f769e..bb7f30971858 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -139,20 +139,7 @@ static inline void account_system_vtime(struct task_struct *tsk) extern void account_system_vtime(struct task_struct *tsk); #endif -#if defined(CONFIG_NO_HZ) #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); - -static inline void rcu_irq_enter(void) -{ - rcu_exit_nohz(); -} - -static inline void rcu_irq_exit(void) -{ - rcu_enter_nohz(); -} static inline void rcu_nmi_enter(void) { @@ -163,17 +150,9 @@ static inline void rcu_nmi_exit(void) } #else -extern void rcu_irq_enter(void); -extern void rcu_irq_exit(void); extern void rcu_nmi_enter(void); extern void rcu_nmi_exit(void); #endif -#else -# define rcu_irq_enter() do { } while (0) -# define rcu_irq_exit() do { } while (0) -# define rcu_nmi_enter() do { } while (0) -# define rcu_nmi_exit() do { } while (0) -#endif /* #if defined(CONFIG_NO_HZ) */ /* * It is safe to do non-atomic ops on ->hardirq_context, diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2cf4226ade7e..cd1ad4b04c6d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -177,23 +177,10 @@ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; - -#ifdef CONFIG_NO_HZ - -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); - -#else /* #ifdef CONFIG_NO_HZ */ - -static inline void rcu_enter_nohz(void) -{ -} - -static inline void rcu_exit_nohz(void) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ */ +extern void rcu_idle_enter(void); +extern void rcu_idle_exit(void); +extern void rcu_irq_enter(void); +extern void rcu_irq_exit(void); /* * Infrastructure to implement the synchronize_() primitives in diff --git 
a/include/linux/tick.h b/include/linux/tick.h index b232ccc0ee29..ca40838fdfb7 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -127,8 +127,15 @@ extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_stop_sched_tick(int inidle) { } -static inline void tick_nohz_restart_sched_tick(void) { } +static inline void tick_nohz_stop_sched_tick(int inidle) +{ + if (inidle) + rcu_idle_enter(); +} +static inline void tick_nohz_restart_sched_tick(void) +{ + rcu_idle_exit(); +} static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 669fbd62ec25..e5771804c507 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -246,19 +246,21 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(char *polarity), + TP_PROTO(char *polarity, int nesting), - TP_ARGS(polarity), + TP_ARGS(polarity, nesting), TP_STRUCT__entry( __field(char *, polarity) + __field(int, nesting) ), TP_fast_assign( __entry->polarity = polarity; + __entry->nesting = nesting; ), - TP_printk("%s", __entry->polarity) + TP_printk("%s %d", __entry->polarity, __entry->nesting) ); /* @@ -443,7 +445,7 @@ TRACE_EVENT(rcu_batch_end, #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) -#define trace_rcu_dyntick(polarity) do { } while (0) +#define trace_rcu_dyntick(polarity, nesting) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } 
while (0) diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 636af6d9c6e5..3ab77bdc90c4 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -53,31 +53,122 @@ static void __call_rcu(struct rcu_head *head, #include "rcutiny_plugin.h" -#ifdef CONFIG_NO_HZ +static long long rcu_dynticks_nesting = LLONG_MAX / 2; -static long rcu_dynticks_nesting = 1; +/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ +static void rcu_idle_enter_common(void) +{ + if (rcu_dynticks_nesting) { + RCU_TRACE(trace_rcu_dyntick("--=", rcu_dynticks_nesting)); + return; + } + RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting)); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", + rcu_dynticks_nesting)); + ftrace_dump(DUMP_ALL); + } + rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ +} /* - * Enter dynticks-idle mode, which is an extended quiescent state - * if we have fully entered that mode (i.e., if the new value of - * dynticks_nesting is zero). + * Enter idle, which is an extended quiescent state if we have fully + * entered that mode (i.e., if the new value of dynticks_nesting is zero). */ -void rcu_enter_nohz(void) +void rcu_idle_enter(void) { - if (--rcu_dynticks_nesting == 0) - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ + unsigned long flags; + + local_irq_save(flags); + rcu_dynticks_nesting = 0; + rcu_idle_enter_common(); + local_irq_restore(flags); } /* - * Exit dynticks-idle mode, so that we are no longer in an extended - * quiescent state. + * Exit an interrupt handler towards idle. + */ +void rcu_irq_exit(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_dynticks_nesting--; + WARN_ON_ONCE(rcu_dynticks_nesting < 0); + rcu_idle_enter_common(); + local_irq_restore(flags); +} + +/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. 
*/ +static void rcu_idle_exit_common(long long oldval) +{ + if (oldval) { + RCU_TRACE(trace_rcu_dyntick("++=", rcu_dynticks_nesting)); + return; + } + RCU_TRACE(trace_rcu_dyntick("End", oldval)); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", + oldval)); + ftrace_dump(DUMP_ALL); + } +} + +/* + * Exit idle, so that we are no longer in an extended quiescent state. */ -void rcu_exit_nohz(void) +void rcu_idle_exit(void) { + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; + WARN_ON_ONCE(oldval != 0); + rcu_dynticks_nesting = LLONG_MAX / 2; + rcu_idle_exit_common(oldval); + local_irq_restore(flags); +} + +/* + * Enter an interrupt handler, moving away from idle. + */ +void rcu_irq_enter(void) +{ + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; rcu_dynticks_nesting++; + WARN_ON_ONCE(rcu_dynticks_nesting == 0); + rcu_idle_exit_common(oldval); + local_irq_restore(flags); +} + +#ifdef CONFIG_PROVE_RCU + +/* + * Test whether RCU thinks that the current CPU is idle. + */ +int rcu_is_cpu_idle(void) +{ + return !rcu_dynticks_nesting; } -#endif /* #ifdef CONFIG_NO_HZ */ +#endif /* #ifdef CONFIG_PROVE_RCU */ + +/* + * Test whether the current CPU was interrupted from idle. Nested + * interrupts don't count, we must be running at the first interrupt + * level. + */ +int rcu_is_cpu_rrupt_from_idle(void) +{ + return rcu_dynticks_nesting <= 0; +} /* * Helper function for rcu_sched_qs() and rcu_bh_qs(). @@ -126,14 +217,13 @@ void rcu_bh_qs(int cpu) /* * Check to see if the scheduling-clock interrupt came from an extended - * quiescent state, and, if so, tell RCU about it. + * quiescent state, and, if so, tell RCU about it. This function must + * be called from hardirq context. It is normally called from the + * scheduling-clock interrupt. 
*/ void rcu_check_callbacks(int cpu, int user) { - if (user || - (idle_cpu(cpu) && - !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) + if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5d0b55a3a8c0..1c40326724f6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -#ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = 1, + .dynticks_nesting = LLONG_MAX / 2, .dynticks = ATOMIC_INIT(1), }; -#endif /* #ifdef CONFIG_NO_HZ */ static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ @@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 1; } - /* If preemptible RCU, no point in sending reschedule IPI. */ - if (rdp->preemptible) - return 0; - - /* The CPU is online, so send it a reschedule IPI. */ + /* + * The CPU is online, so send it a reschedule IPI. This forces + * it through the scheduler, and (inefficiently) also handles cases + * where idle loops fail to inform RCU about the CPU being idle. + */ if (rdp->cpu != smp_processor_id()) smp_send_reschedule(rdp->cpu); else @@ -343,51 +341,97 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ -#ifdef CONFIG_NO_HZ +/* + * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle + * + * If the new value of the ->dynticks_nesting counter now is zero, + * we really have entered idle, and must do the appropriate accounting. + * The caller must have disabled interrupts. 
+ */ +static void rcu_idle_enter_common(struct rcu_dynticks *rdtp) +{ + if (rdtp->dynticks_nesting) { + trace_rcu_dyntick("--=", rdtp->dynticks_nesting); + return; + } + trace_rcu_dyntick("Start", rdtp->dynticks_nesting); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + trace_rcu_dyntick("Error on entry: not idle task", + rdtp->dynticks_nesting); + ftrace_dump(DUMP_ALL); + } + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); +} /** - * rcu_enter_nohz - inform RCU that current CPU is entering nohz + * rcu_idle_enter - inform RCU that current CPU is entering idle * - * Enter nohz mode, in other words, -leave- the mode in which RCU + * Enter idle mode, in other words, -leave- the mode in which RCU * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in nohz mode, a possibility - * handled by rcu_irq_enter() and rcu_irq_exit()). + * critical sections can occur in irq handlers in idle, a possibility + * handled by irq_enter() and irq_exit().) + * + * We crowbar the ->dynticks_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. */ -void rcu_enter_nohz(void) +void rcu_idle_enter(void) { unsigned long flags; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (--rdtp->dynticks_nesting) { - local_irq_restore(flags); - return; - } - trace_rcu_dyntick("Start"); - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. 
*/ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rdtp->dynticks_nesting = 0; + rcu_idle_enter_common(rdtp); local_irq_restore(flags); } -/* - * rcu_exit_nohz - inform RCU that current CPU is leaving nohz +/** + * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle + * + * Exit from an interrupt handler, which might possibly result in entering + * idle mode, in other words, leaving the mode in which read-side critical + * sections can occur. * - * Exit nohz mode, in other words, -enter- the mode in which RCU - * read-side critical sections normally occur. + * This code assumes that the idle loop never does anything that might + * result in unbalanced calls to irq_enter() and irq_exit(). If your + * architecture violates this assumption, RCU will give you what you + * deserve, good and hard. But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. */ -void rcu_exit_nohz(void) +void rcu_irq_exit(void) { unsigned long flags; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks_nesting++) { - local_irq_restore(flags); + rdtp->dynticks_nesting--; + WARN_ON_ONCE(rdtp->dynticks_nesting < 0); + rcu_idle_enter_common(rdtp); + local_irq_restore(flags); +} + +/* + * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle + * + * If the new value of the ->dynticks_nesting counter was previously zero, + * we really have exited idle, and must do the appropriate accounting. + * The caller must have disabled interrupts. + */ +static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) +{ + if (oldval) { + trace_rcu_dyntick("++=", rdtp->dynticks_nesting); return; } smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. 
*/ @@ -395,7 +439,71 @@ void rcu_exit_nohz(void) /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - trace_rcu_dyntick("End"); + trace_rcu_dyntick("End", oldval); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + trace_rcu_dyntick("Error on exit: not idle task", oldval); + ftrace_dump(DUMP_ALL); + } +} + +/** + * rcu_idle_exit - inform RCU that current CPU is leaving idle + * + * Exit idle mode, in other words, -enter- the mode in which RCU + * read-side critical sections can occur. + * + * We crowbar the ->dynticks_nesting field to LLONG_MAX/2 to allow for + * the possibility of usermode upcalls messing up our count + * of interrupt nesting level during the busy period that is just + * now starting. + */ +void rcu_idle_exit(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + long long oldval; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + WARN_ON_ONCE(oldval != 0); + rdtp->dynticks_nesting = LLONG_MAX / 2; + rcu_idle_exit_common(rdtp, oldval); + local_irq_restore(flags); +} + +/** + * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle + * + * Enter an interrupt handler, which might possibly result in exiting + * idle mode, in other words, entering the mode in which read-side critical + * sections can occur. + * + * Note that the Linux kernel is fully capable of entering an interrupt + * handler that it never exits, for example when doing upcalls to + * user mode! This code assumes that the idle loop never does upcalls to + * user mode. If your architecture does do upcalls from the idle loop (or + * does anything else that results in unbalanced calls to the irq_enter() + * and irq_exit() functions), RCU will give you what you deserve, good + * and hard. But very infrequently and irreproducibly. 
+ * + * Use things like work queues to work around this limitation. + * + * You have been warned. + */ +void rcu_irq_enter(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + long long oldval; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + rdtp->dynticks_nesting++; + WARN_ON_ONCE(rdtp->dynticks_nesting == 0); + rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } @@ -442,27 +550,32 @@ void rcu_nmi_exit(void) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } +#ifdef CONFIG_PROVE_RCU + /** - * rcu_irq_enter - inform RCU of entry to hard irq context + * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * - * If the CPU was idle with dynamic ticks active, this updates the - * rdtp->dynticks to let the RCU handling know that the CPU is active. + * If the current CPU is in its idle loop and is neither in an interrupt + * or NMI handler, return true. The caller must have at least disabled + * preemption. */ -void rcu_irq_enter(void) +int rcu_is_cpu_idle(void) { - rcu_exit_nohz(); + return (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; } +#endif /* #ifdef CONFIG_PROVE_RCU */ + /** - * rcu_irq_exit - inform RCU of exit from hard irq context + * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle * - * If the CPU was idle with dynamic ticks active, update the rdp->dynticks - * to put let the RCU handling be aware that the CPU is going back to idle - * with no ticks. + * If the current CPU is idle or running at a first-level (not nested) + * interrupt from idle, return true. The caller must have at least + * disabled preemption. 
*/ -void rcu_irq_exit(void) +int rcu_is_cpu_rrupt_from_idle(void) { - rcu_enter_nohz(); + return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } #ifdef CONFIG_SMP @@ -512,24 +625,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ -#else /* #ifdef CONFIG_NO_HZ */ - -#ifdef CONFIG_SMP - -static int dyntick_save_progress_counter(struct rcu_data *rdp) -{ - return 0; -} - -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) -{ - return rcu_implicit_offline_qs(rdp); -} - -#endif /* #ifdef CONFIG_SMP */ - -#endif /* #else #ifdef CONFIG_NO_HZ */ - int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) @@ -1334,16 +1429,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). * Also schedule RCU core processing. * - * This function must be called with hardirqs disabled. It is normally + * This function must be called from hardirq context. It is normally * invoked from the scheduling-clock interrupt. If rcu_pending returns * false, there is no point in invoking rcu_check_callbacks(). 
*/ void rcu_check_callbacks(int cpu, int user) { trace_rcu_utilization("Start scheduler-tick"); - if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + if (user || rcu_is_cpu_rrupt_from_idle()) { /* * Get here if this CPU took its interrupt from user @@ -1913,9 +2006,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; rdp->qlen = 0; -#ifdef CONFIG_NO_HZ rdp->dynticks = &per_cpu(rcu_dynticks, cpu); -#endif /* #ifdef CONFIG_NO_HZ */ + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != LLONG_MAX / 2); + WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1942,6 +2035,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != LLONG_MAX / 2); + WARN_ON_ONCE((atomic_read(&rdp->dynticks->dynticks) & 0x1) != 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 517f2f89a293..0963fa1541ac 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,9 +84,10 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - int dynticks_nesting; /* Track irq/process nesting level. */ - int dynticks_nmi_nesting; /* Track NMI nesting level. */ - atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ + long long dynticks_nesting; /* Track irq/process nesting level. */ + /* Process level is worth LLONG_MAX/2. */ + int dynticks_nmi_nesting; /* Track NMI nesting level. */ + atomic_t dynticks; /* Even value for idle, else odd. */ }; /* RCU's kthread states for tracing. */ @@ -274,16 +275,12 @@ struct rcu_data { /* did other CPU force QS recently? 
*/ long blimit; /* Upper limit on a processed batch */ -#ifdef CONFIG_NO_HZ /* 3) dynticks interface. */ struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ -#endif /* #ifdef CONFIG_NO_HZ */ /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ -#ifdef CONFIG_NO_HZ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ -#endif /* #ifdef CONFIG_NO_HZ */ unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long resched_ipi; /* Sent a resched IPI. */ @@ -307,11 +304,7 @@ struct rcu_data { #define RCU_GP_INIT 1 /* Grace period being initialized. */ #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ -#ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK -#else /* #ifdef CONFIG_NO_HZ */ -#define RCU_SIGNAL_INIT RCU_FORCE_QS -#endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 59c7bee4ce0f..654cfe67f0d1 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->completed, rdp->gpnum, rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); -#ifdef CONFIG_NO_HZ - seq_printf(m, " dt=%d/%d/%d df=%lu", + seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); -#endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); seq_printf(m, " ql=%ld qs=%c%c%c%c", rdp->qlen, @@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->completed, rdp->gpnum, rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); -#ifdef CONFIG_NO_HZ - seq_printf(m, 
",%d,%d,%d,%lu", + seq_printf(m, ",%d,%llx,%d,%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); -#endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != @@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); -#ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); -#endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST seq_puts(m, "\"kt\",\"ktl\""); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 40420644d0ba..5d9d23665f12 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -434,7 +434,6 @@ void tick_nohz_stop_sched_tick(int inidle) ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; - rcu_enter_nohz(); } ts->idle_sleeps++; @@ -473,6 +472,8 @@ out: ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); end: + if (inidle) + rcu_idle_enter(); local_irq_restore(flags); } @@ -529,6 +530,7 @@ void tick_nohz_restart_sched_tick(void) ktime_t now; local_irq_disable(); + rcu_idle_exit(); if (ts->idle_active || (ts->inidle && ts->tick_stopped)) now = ktime_get(); @@ -543,8 +545,6 @@ void tick_nohz_restart_sched_tick(void) ts->inidle = 0; - rcu_exit_nohz(); - /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); -- cgit v1.2.3-70-g09d2 From 34240697d619c439c55f21989680024dcb604aab Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 3 Oct 2011 11:38:52 -0700 Subject: rcu: Disable preemption in rcu_is_cpu_idle() Because rcu_is_cpu_idle() is to be used to check for extended quiescent states in RCU-preempt read-side critical sections, it cannot assume that preemption is disabled. And preemption must be disabled when accessing the dyntick-idle state, because otherwise the following sequence of events could occur: 1. Task A on CPU 1 enters rcu_is_cpu_idle() and picks up the pointer to CPU 1's per-CPU variables. 2. Task B preempts Task A and starts running on CPU 1. 3. Task A migrates to CPU 2. 4. Task B blocks, leaving CPU 1 idle. 5. Task A continues execution on CPU 2, accessing CPU 1's dyntick-idle information using the pointer fetched in step 1 above, and finds that CPU 1 is idle. 6. Task A therefore incorrectly concludes that it is executing in an extended quiescent state, possibly issuing a spurious splat. Therefore, this commit disables preemption within the rcu_is_cpu_idle() function. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1c40326724f6..69b6cdd4f944 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -556,12 +556,16 @@ void rcu_nmi_exit(void) * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * * If the current CPU is in its idle loop and is neither in an interrupt - * or NMI handler, return true. The caller must have at least disabled - * preemption. + * or NMI handler, return true. 
*/ int rcu_is_cpu_idle(void) { - return (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; + int ret; + + preempt_disable(); + ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; + preempt_enable(); + return ret; } #endif /* #ifdef CONFIG_PROVE_RCU */ -- cgit v1.2.3-70-g09d2 From b40d293eb36ba40cd428b6d178db911174689702 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Oct 2011 07:12:34 -0700 Subject: rcu: Omit self-awaken when setting up expedited grace period When setting up an expedited grace period, if there were no readers, the task will awaken itself. This commit removes this useless self-awakening. Signed-off-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- kernel/rcutree.h | 3 ++- kernel/rcutree_plugin.h | 16 +++++++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 69b6cdd4f944..8afb2e89745b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1320,7 +1320,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) else raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp); + rcu_report_exp_rnp(rsp, rnp, true); rcu_node_kthread_setaffinity(rnp, -1); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 0963fa1541ac..fd2f87db2ab1 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -444,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu); static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ static int 
rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 708dc579634d..0f095d1cc16d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -410,7 +410,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) * then we need to report up the rcu_node hierarchy. */ if (!empty_exp && empty_exp_now) - rcu_report_exp_rnp(&rcu_preempt_state, rnp); + rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); } else { local_irq_restore(flags); } @@ -732,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * + * Most callers will set the "wake" flag, but the task initiating the + * expedited grace period need not wake itself. + * * Caller must hold sync_rcu_preempt_exp_mutex. */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) { unsigned long flags; unsigned long mask; @@ -747,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - wake_up(&sync_rcu_preempt_exp_wq); + if (wake) + wake_up(&sync_rcu_preempt_exp_wq); break; } mask = rnp->grpmask; @@ -780,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) must_wait = 1; } if (!must_wait) - rcu_report_exp_rnp(rsp, rnp); + rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ } /* @@ -1072,7 +1077,8 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); * report on tasks preempted in RCU read-side critical sections during * expedited RCU grace periods. 
*/ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) { return; } -- cgit v1.2.3-70-g09d2 From e6b80a3b0994ea6c3d876d72464f2debbfcfeb05 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 16:25:18 -0700 Subject: rcu: Detect illegal rcu dereference in extended quiescent state Report that none of the rcu read lock maps are held while in an RCU extended quiescent state (the section between rcu_idle_enter() and rcu_idle_exit()). This helps detect any use of rcu_dereference() and friends from within the section in idle where RCU is not allowed. This way we can guarantee an extended quiescent window where the CPU can be put in dyntick idle mode or can simply avoid being part of any global grace period completion while in the idle loop. Uses of RCU from such mode are totally ignored by RCU, hence the importance of these checks. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Lai Jiangshan Signed-off-by: Paul E.
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 26 ++++++++++++++++++++++++++ kernel/rcupdate.c | 2 ++ kernel/rcutiny.c | 1 + kernel/rcutree.c | 1 + 4 files changed, 30 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8d315b013e37..bf91fcfe181c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -228,6 +228,15 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head) #ifdef CONFIG_DEBUG_LOCK_ALLOC +#ifdef CONFIG_PROVE_RCU +extern int rcu_is_cpu_idle(void); +#else /* !CONFIG_PROVE_RCU */ +static inline int rcu_is_cpu_idle(void) +{ + return 0; +} +#endif /* else !CONFIG_PROVE_RCU */ + extern struct lockdep_map rcu_lock_map; # define rcu_read_acquire() \ lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) @@ -262,6 +271,8 @@ static inline int rcu_read_lock_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; return lock_is_held(&rcu_lock_map); } @@ -285,6 +296,19 @@ extern int rcu_read_lock_bh_held(void); * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. + * + * Note that if the CPU is in the idle loop from an RCU point of + * view (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU + * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs + * that are in such a section, considering these as in extended quiescent + * state, so such a CPU is effectively never in an RCU read-side critical + * section regardless of what RCU primitives it invokes. This state of + * affairs is required --- we need to keep an RCU-free window in idle + * where the CPU may possibly enter into low power mode. This way we can + * notice an extended quiescent state to other CPUs that started a grace + * period. 
Otherwise we would delay any grace period as long as we run in + * the idle task. */ #ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) @@ -293,6 +317,8 @@ static inline int rcu_read_lock_sched_held(void) if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 92e771d7b44b..2bc4e135ff23 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 3ab77bdc90c4..b4e0b4981768 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -157,6 +157,7 @@ int rcu_is_cpu_idle(void) { return !rcu_dynticks_nesting; } +EXPORT_SYMBOL(rcu_is_cpu_idle); #endif /* #ifdef CONFIG_PROVE_RCU */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8afb2e89745b..489b62a67d35 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -567,6 +567,7 @@ int rcu_is_cpu_idle(void) preempt_enable(); return ret; } +EXPORT_SYMBOL(rcu_is_cpu_idle); #endif /* #ifdef CONFIG_PROVE_RCU */ -- cgit v1.2.3-70-g09d2 From 4145fa7fbee3ec1e61c52825b146192885d9759f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Oct 2011 15:01:54 -0700 Subject: rcu: Deconfuse dynticks entry-exit tracing The trace_rcu_dyntick() trace event did not print both the old and the new value of the nesting level, and furthermore printed only the low-order 32 bits of it. This could result in some confusion when interpreting trace-event dumps, so this commit prints both the old and the new value, prints the full 64 bits, and also selects the process-entry/exit increment to print nicely in hexadecimal. 
Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/trace/events/rcu.h | 15 +++++++++------ kernel/rcu.h | 7 +++++++ kernel/rcutiny.c | 28 +++++++++++++++++----------- kernel/rcutree.c | 35 ++++++++++++++++++++--------------- 4 files changed, 53 insertions(+), 32 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 172620a92b1a..c29fb2f55909 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -246,21 +246,24 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(char *polarity, int nesting), + TP_PROTO(char *polarity, long long oldnesting, long long newnesting), - TP_ARGS(polarity, nesting), + TP_ARGS(polarity, oldnesting, newnesting), TP_STRUCT__entry( __field(char *, polarity) - __field(int, nesting) + __field(long long, oldnesting) + __field(long long, newnesting) ), TP_fast_assign( __entry->polarity = polarity; - __entry->nesting = nesting; + __entry->oldnesting = oldnesting; + __entry->newnesting = newnesting; ), - TP_printk("%s %d", __entry->polarity, __entry->nesting) + TP_printk("%s %llx %llx", __entry->polarity, + __entry->oldnesting, __entry->newnesting) ); /* @@ -470,7 +473,7 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) -#define trace_rcu_dyntick(polarity, nesting) do { } while (0) +#define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) diff --git a/kernel/rcu.h b/kernel/rcu.h index f600868d550d..aa88baab5f78 100644 --- 
a/kernel/rcu.h +++ b/kernel/rcu.h @@ -29,6 +29,13 @@ #define RCU_TRACE(stmt) #endif /* #else #ifdef CONFIG_RCU_TRACE */ +/* + * Process-level increment to ->dynticks_nesting field. This allows for + * architectures that use half-interrupts and half-exceptions from + * process context. + */ +#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index b4e0b4981768..9b9bdf666fb5 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -53,20 +53,21 @@ static void __call_rcu(struct rcu_head *head, #include "rcutiny_plugin.h" -static long long rcu_dynticks_nesting = LLONG_MAX / 2; +static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ -static void rcu_idle_enter_common(void) +static void rcu_idle_enter_common(long long oldval) { if (rcu_dynticks_nesting) { - RCU_TRACE(trace_rcu_dyntick("--=", rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick("--=", + oldval, rcu_dynticks_nesting)); return; } - RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); if (!idle_cpu(smp_processor_id())) { WARN_ON_ONCE(1); /* must be idle task! 
*/ RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", - rcu_dynticks_nesting)); + oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); } rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ @@ -79,10 +80,12 @@ static void rcu_idle_enter_common(void) void rcu_idle_enter(void) { unsigned long flags; + long long oldval; local_irq_save(flags); + oldval = rcu_dynticks_nesting; rcu_dynticks_nesting = 0; - rcu_idle_enter_common(); + rcu_idle_enter_common(oldval); local_irq_restore(flags); } @@ -92,11 +95,13 @@ void rcu_idle_enter(void) void rcu_irq_exit(void) { unsigned long flags; + long long oldval; local_irq_save(flags); + oldval = rcu_dynticks_nesting; rcu_dynticks_nesting--; WARN_ON_ONCE(rcu_dynticks_nesting < 0); - rcu_idle_enter_common(); + rcu_idle_enter_common(oldval); local_irq_restore(flags); } @@ -104,14 +109,15 @@ void rcu_irq_exit(void) static void rcu_idle_exit_common(long long oldval) { if (oldval) { - RCU_TRACE(trace_rcu_dyntick("++=", rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick("++=", + oldval, rcu_dynticks_nesting)); return; } - RCU_TRACE(trace_rcu_dyntick("End", oldval)); + RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); if (!idle_cpu(smp_processor_id())) { WARN_ON_ONCE(1); /* must be idle task! 
*/ RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", - oldval)); + oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); } } @@ -127,7 +133,7 @@ void rcu_idle_exit(void) local_irq_save(flags); oldval = rcu_dynticks_nesting; WARN_ON_ONCE(oldval != 0); - rcu_dynticks_nesting = LLONG_MAX / 2; + rcu_dynticks_nesting = DYNTICK_TASK_NESTING; rcu_idle_exit_common(oldval); local_irq_restore(flags); } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 489b62a67d35..06e40dd53b23 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -196,7 +196,7 @@ void rcu_note_context_switch(int cpu) EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = LLONG_MAX / 2, + .dynticks_nesting = DYNTICK_TASK_NESTING, .dynticks = ATOMIC_INIT(1), }; @@ -348,17 +348,17 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) * we really have entered idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_idle_enter_common(struct rcu_dynticks *rdtp) +static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) { if (rdtp->dynticks_nesting) { - trace_rcu_dyntick("--=", rdtp->dynticks_nesting); + trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); return; } - trace_rcu_dyntick("Start", rdtp->dynticks_nesting); + trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); if (!idle_cpu(smp_processor_id())) { WARN_ON_ONCE(1); /* must be idle task! 
*/ trace_rcu_dyntick("Error on entry: not idle task", - rdtp->dynticks_nesting); + oldval, rdtp->dynticks_nesting); ftrace_dump(DUMP_ALL); } /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ @@ -383,12 +383,14 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp) void rcu_idle_enter(void) { unsigned long flags; + long long oldval; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting = 0; - rcu_idle_enter_common(rdtp); + rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); } @@ -411,13 +413,15 @@ void rcu_idle_enter(void) void rcu_irq_exit(void) { unsigned long flags; + long long oldval; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; WARN_ON_ONCE(rdtp->dynticks_nesting < 0); - rcu_idle_enter_common(rdtp); + rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); } @@ -431,7 +435,7 @@ void rcu_irq_exit(void) static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) { if (oldval) { - trace_rcu_dyntick("++=", rdtp->dynticks_nesting); + trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); return; } smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ @@ -439,10 +443,11 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - trace_rcu_dyntick("End", oldval); + trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); if (!idle_cpu(smp_processor_id())) { WARN_ON_ONCE(1); /* must be idle task! 
*/ - trace_rcu_dyntick("Error on exit: not idle task", oldval); + trace_rcu_dyntick("Error on exit: not idle task", + oldval, rdtp->dynticks_nesting); ftrace_dump(DUMP_ALL); } } @@ -453,8 +458,8 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) * Exit idle mode, in other words, -enter- the mode in which RCU * read-side critical sections can occur. * - * We crowbar the ->dynticks_nesting field to LLONG_MAX/2 to allow for - * the possibility of usermode upcalls messing up our count + * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to + * allow for the possibility of usermode upcalls messing up our count * of interrupt nesting level during the busy period that is just * now starting. */ @@ -468,7 +473,7 @@ void rcu_idle_exit(void) rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval != 0); - rdtp->dynticks_nesting = LLONG_MAX / 2; + rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } @@ -2012,7 +2017,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->nxttail[i] = &rdp->nxtlist; rdp->qlen = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != LLONG_MAX / 2); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; @@ -2040,7 +2045,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != LLONG_MAX / 2); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); WARN_ON_ONCE((atomic_read(&rdp->dynticks->dynticks) & 0x1) != 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ -- cgit v1.2.3-70-g09d2 From 0989cb46783188ea7346ba6490be0046b9b7a725 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 1 Nov 2011 08:57:21 -0700 Subject: rcu: Add more information to the wrong-idle-task complaint The current code just complains if the current task is not the idle task. This commit therefore adds printing of the identity of the idle task. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 12 ++++++++++-- kernel/rcutree.c | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 9b9bdf666fb5..6d70ff71a875 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -65,10 +65,14 @@ static void rcu_idle_enter_common(long long oldval) } RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); if (!idle_cpu(smp_processor_id())) { - WARN_ON_ONCE(1); /* must be idle task! */ + struct task_struct *idle = idle_task(smp_processor_id()); + RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ } rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ } @@ -115,10 +119,14 @@ static void rcu_idle_exit_common(long long oldval) } RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); if (!idle_cpu(smp_processor_id())) { - WARN_ON_ONCE(1); /* must be idle task! */ + struct task_struct *idle = idle_task(smp_processor_id()); + RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! 
*/ } } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 06e40dd53b23..9888a0ad2d4e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -356,10 +356,14 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) } trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); if (!idle_cpu(smp_processor_id())) { - WARN_ON_ONCE(1); /* must be idle task! */ + struct task_struct *idle = idle_task(smp_processor_id()); + trace_rcu_dyntick("Error on entry: not idle task", oldval, rdtp->dynticks_nesting); ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ } /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic_inc(); /* See above. */ @@ -445,10 +449,14 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); if (!idle_cpu(smp_processor_id())) { - WARN_ON_ONCE(1); /* must be idle task! */ + struct task_struct *idle = idle_task(smp_processor_id()); + trace_rcu_dyntick("Error on exit: not idle task", oldval, rdtp->dynticks_nesting); ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ } } -- cgit v1.2.3-70-g09d2 From aea1b35e29e658d42d7ba2237f3aa7f93e18509d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2011 06:54:54 -0700 Subject: rcu: Allow dyntick-idle mode for CPUs with callbacks Currently, RCU does not permit a CPU to enter dyntick-idle mode if that CPU has any RCU callbacks queued. This means that workloads for which each CPU wakes up and does some RCU updates every few ticks will never enter dyntick-idle mode. 
This can result in significant unnecessary power consumption, so this patch permits a given CPU to enter dyntick-idle mode if it has callbacks, but only if that same CPU has completed all current work for the RCU core. We use rcu_pending() to determine whether a given CPU has completed all current work for the RCU core. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 5 +- kernel/rcutree.h | 4 ++ kernel/rcutree_plugin.h | 156 ++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 132 insertions(+), 33 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 9888a0ad2d4e..b1711c48a7ec 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -365,6 +365,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } + rcu_prepare_for_idle(smp_processor_id()); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic_inc(); /* See above. */ atomic_inc(&rdtp->dynticks); @@ -1085,6 +1086,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * callbacks are waiting on the grace period that just now * completed. */ + rcu_schedule_wake_gp_end(); if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -1670,6 +1672,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_process_callbacks(); + rcu_wake_cpus_for_gp_end(); trace_rcu_utilization("End RCU core"); } @@ -1923,7 +1926,7 @@ static int rcu_pending(int cpu) * by the current CPU, even if none need be done immediately, returning * 1 if so. */ -static int rcu_needs_cpu_quick_check(int cpu) +static int rcu_cpu_has_callbacks(int cpu) { /* RCU callbacks either ready or pending?
*/ return per_cpu(rcu_sched_data, cpu).nxtlist || diff --git a/kernel/rcutree.h b/kernel/rcutree.h index fd2f87db2ab1..ea32405177c9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,6 +88,7 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ + int wake_gp_end; /* A GP ended, need to wake up CPUs. */ }; /* RCU's kthread states for tracing. */ @@ -467,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg); #endif /* #ifdef CONFIG_RCU_BOOST */ static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); +static void rcu_prepare_for_idle(int cpu); +static void rcu_wake_cpus_for_gp_end(void); +static void rcu_schedule_wake_gp_end(void); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7a7961feeecf..b70ca8cc52e1 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1953,7 +1953,31 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); */ int rcu_needs_cpu(int cpu) { - return rcu_needs_cpu_quick_check(cpu); + return rcu_cpu_has_callbacks(cpu); +} + +/* + * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, + * is nothing. + */ +static void rcu_prepare_for_idle(int cpu) +{ +} + +/* + * CPUs are never putting themselves to sleep with callbacks pending, + * so there is no need to awaken them. + */ +static void rcu_wake_cpus_for_gp_end(void) +{ +} + +/* + * CPUs are never putting themselves to sleep with callbacks pending, + * so there is no need to schedule the act of awakening them.
+ */ +static void rcu_schedule_wake_gp_end(void) +{ } #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ @@ -1961,47 +1985,56 @@ int rcu_needs_cpu(int cpu) #define RCU_NEEDS_CPU_FLUSHES 5 static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); +static DEFINE_PER_CPU(bool, rcu_awake_at_gp_end); /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + */ +int rcu_needs_cpu(int cpu) +{ + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) + return 0; + /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ + return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; +} + +/* + * Check to see if any RCU-related work can be done by the current CPU, + * and if so, schedule a softirq to get it done. This function is part + * of the RCU implementation; it is -not- an exported member of the RCU API. * - * Because we are not supporting preemptible RCU, attempt to accelerate - * any current grace periods so that RCU no longer needs this CPU, but - * only if all other CPUs are already in dynticks-idle mode. This will - * allow the CPU cores to be powered down immediately, as opposed to after - * waiting many milliseconds for grace periods to elapse. 
+ * The idea is for the current CPU to clear out all work required by the + * RCU core for the current grace period, so that this CPU can be permitted + * to enter dyntick-idle mode. In some cases, it will need to be awakened + * at the end of the grace period by whatever CPU ends the grace period. + * This allows CPUs to go dyntick-idle more quickly, and to reduce the + * number of wakeups by a modest integer factor. * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. + * + * The caller must have disabled interrupts. */ -int rcu_needs_cpu(int cpu) +static void rcu_prepare_for_idle(int cpu) { int c = 0; - int snap; - int thatcpu; - /* Check for being in the holdoff period. */ - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) - return rcu_needs_cpu_quick_check(cpu); - - /* Don't bother unless we are the last non-dyntick-idle CPU. */ - for_each_online_cpu(thatcpu) { - if (thatcpu == cpu) - continue; - snap = atomic_add_return(0, &per_cpu(rcu_dynticks, - thatcpu).dynticks); - smp_mb(); /* Order sampling of snap with end of grace period. */ - if ((snap & 0x1) != 0) { - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - return rcu_needs_cpu_quick_check(cpu); - } + /* If no callbacks or in the holdoff period, enter dyntick-idle. */ + if (!rcu_cpu_has_callbacks(cpu)) { + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + return; } + if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) + return; /* Check and update the rcu_dyntick_drain sequencing. */ if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { @@ -2010,10 +2043,25 @@ int rcu_needs_cpu(int cpu) } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. 
*/ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - return rcu_needs_cpu_quick_check(cpu); + if (!rcu_pending(cpu)) { + per_cpu(rcu_awake_at_gp_end, cpu) = 1; + return; /* Nothing to do immediately. */ + } + invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ + return; } - /* Do one step pushing remaining RCU callbacks through. */ + /* + * Do one step of pushing the remaining RCU callbacks through + * the RCU core state machine. + */ +#ifdef CONFIG_TREE_PREEMPT_RCU + if (per_cpu(rcu_preempt_data, cpu).nxtlist) { + rcu_preempt_qs(cpu); + force_quiescent_state(&rcu_preempt_state, 0); + c = c || per_cpu(rcu_preempt_data, cpu).nxtlist; + } +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { rcu_sched_qs(cpu); force_quiescent_state(&rcu_sched_state, 0); @@ -2028,7 +2076,51 @@ int rcu_needs_cpu(int cpu) /* If RCU callbacks are still pending, RCU still needs this CPU. */ if (c) invoke_rcu_core(); - return c; } +/* + * Wake up a CPU by invoking the RCU core. Intended for use by + * rcu_wake_cpus_for_gp_end(), which passes this function to + * smp_call_function_single(). + */ +static void rcu_wake_cpu(void *unused) +{ + invoke_rcu_core(); +} + +/* + * If an RCU grace period ended recently, scan the rcu_awake_at_gp_end + * per-CPU variables, and wake up any CPUs that requested a wakeup. + */ +static void rcu_wake_cpus_for_gp_end(void) +{ + int cpu; + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (!rdtp->wake_gp_end) + return; + rdtp->wake_gp_end = 0; + for_each_online_cpu(cpu) { + if (per_cpu(rcu_awake_at_gp_end, cpu)) { + per_cpu(rcu_awake_at_gp_end, cpu) = 0; + smp_call_function_single(cpu, rcu_wake_cpu, NULL, 0); + } + } +} + +/* + * A grace period has just ended, and so we will need to awaken CPUs + * that now have work to do. But we cannot send IPIs with interrupts + * disabled, so just set a flag so that this will happen upon exit + * from RCU core processing. 
+ */ +static void rcu_schedule_wake_gp_end(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + rdtp->wake_gp_end = 1; +} + +/* @@@ need tracing as well. */ + #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -- cgit v1.2.3-70-g09d2 From 11dbaa8cb79a6e4a234a134898436f717a663f01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2011 07:38:25 -0700 Subject: rcu: Fix idle-task checks RCU has traditionally relied on idle_cpu() to determine whether a given CPU is running in the context of an idle task, but commit 908a3283 (Fix idle_cpu()) has invalidated this approach. After commit 908a3283, idle_cpu() will return true if the current CPU is currently running the idle task, and will be doing so for the foreseeable future. RCU instead needs to know whether or not the current CPU is currently running the idle task, regardless of what the near future might bring. This commit therefore switches from idle_cpu() to "current->pid != 0". Reported-by: Wu Fengguang Suggested-by: Carsten Emde Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt Tested-by: Wu Fengguang Signed-off-by: Paul E. 
McKenney --- kernel/rcutiny.c | 4 ++-- kernel/rcutree.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 6d70ff71a875..4e16ce36fa03 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -64,7 +64,7 @@ static void rcu_idle_enter_common(long long oldval) return; } RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); - if (!idle_cpu(smp_processor_id())) { + if (current->pid != 0) { struct task_struct *idle = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", @@ -118,7 +118,7 @@ static void rcu_idle_exit_common(long long oldval) return; } RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); - if (!idle_cpu(smp_processor_id())) { + if (current->pid != 0) { struct task_struct *idle = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b1711c48a7ec..49e0783fb200 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -355,7 +355,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) return; } trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); - if (!idle_cpu(smp_processor_id())) { + if (current->pid != 0) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", @@ -449,7 +449,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (!idle_cpu(smp_processor_id())) { + if (current->pid != 0) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on exit: not idle task", -- cgit v1.2.3-70-g09d2 From 99745b6a83414006f5c1e83efaebb423b41b67ef Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 10 Nov 2011 15:48:45 -0800 Subject: rcu: Make RCU use the new is_idle_task() API Change from direct comparison of ->pid with zero to is_idle_task(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 4 ++-- kernel/rcutree.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 4e16ce36fa03..e5bd94954fa3 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -64,7 +64,7 @@ static void rcu_idle_enter_common(long long oldval) return; } RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); - if (current->pid != 0) { + if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", @@ -118,7 +118,7 @@ static void rcu_idle_exit_common(long long oldval) return; } RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); - if (current->pid != 0) { + if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 49e0783fb200..7fb8b0e60811 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -355,7 +355,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) return; } trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); - if (current->pid != 0) { + if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", @@ -449,7 +449,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) smp_mb__after_atomic_inc(); /* See above. 
*/ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (current->pid != 0) { + if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on exit: not idle task", -- cgit v1.2.3-70-g09d2 From f535a607c13c7b674e0788ca5765779aa74a01c3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 20:43:02 -0800 Subject: rcu: Eliminate RCU_FAST_NO_HZ grace-period hang With the new implementation of RCU_FAST_NO_HZ, it was possible to hang RCU grace periods as follows: o CPU 0 attempts to go idle, cycles several times through the rcu_prepare_for_idle() loop, then goes dyntick-idle when RCU needs nothing more from it, while still having at least on RCU callback pending. o CPU 1 goes idle with no callbacks. Both CPUs can then stay in dyntick-idle mode indefinitely, preventing the RCU grace period from ever completing, possibly hanging the system. This commit therefore prevents CPUs that have RCU callbacks from entering dyntick-idle mode. This approach also eliminates the need for the end-of-grace-period IPIs used previously. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 1 - kernel/rcutree.c | 2 -- kernel/rcutree.h | 3 -- kernel/rcutree_plugin.h | 78 ++-------------------------------------------- 4 files changed, 2 insertions(+), 82 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index debe453c9623..8dd6fcb94946 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -287,7 +287,6 @@ TRACE_EVENT(rcu_dyntick, * * "No callbacks": Nothing to do, no callbacks on this CPU. * "In holdoff": Nothing to do, holding off after unsuccessful attempt. - * "Dyntick with callbacks": Callbacks remain, but RCU doesn't need CPU. * "Begin holdoff": Attempt failed, don't retry until next jiffy. 
* "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7fb8b0e60811..13fab4a9f9fb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1086,7 +1086,6 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * callbacks are waiting on the grace period that just now * completed. */ - rcu_schedule_wake_gp_end(); if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -1672,7 +1671,6 @@ static void rcu_process_callbacks(struct softirq_action *unused) &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_process_callbacks(); - rcu_wake_cpus_for_gp_end(); trace_rcu_utilization("End RCU core"); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index ea32405177c9..70d8a557090f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,7 +88,6 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ - int wake_gp_end; /* A GP ended, need to wake up CPUs. */ }; /* RCU's kthread states for tracing. 
*/ @@ -469,7 +468,5 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg); static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle(int cpu); -static void rcu_wake_cpus_for_gp_end(void); -static void rcu_schedule_wake_gp_end(void); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c4daf1e19e01..3d84dbc113d6 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1964,28 +1964,11 @@ static void rcu_prepare_for_idle(int cpu) { } -/* - * CPUs are never putting themselves to sleep with callbacks pending, - * so there is no need to awaken them. - */ -static void rcu_wake_cpus_for_gp_end(void) -{ -} - -/* - * CPUs are never putting themselves to sleep with callbacks pending, - * so there is no need to schedule the act of awakening them. - */ -static void rcu_schedule_wake_gp_end(void) -{ -} - #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #define RCU_NEEDS_CPU_FLUSHES 5 static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -static DEFINE_PER_CPU(bool, rcu_awake_at_gp_end); /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -2032,26 +2015,16 @@ static void rcu_prepare_for_idle(int cpu) local_irq_save(flags); /* - * If there are no callbacks on this CPU or if RCU has no further - * need for this CPU at the moment, enter dyntick-idle mode. - * Also reset state so as to not prejudice later attempts. + * If there are no callbacks on this CPU, enter dyntick-idle mode. + * Also reset state to avoid prejudicing later attempts. 
*/ if (!rcu_cpu_has_callbacks(cpu)) { per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_awake_at_gp_end, cpu) = 0; local_irq_restore(flags); trace_rcu_prep_idle("No callbacks"); return; } - if (!rcu_pending(cpu)) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_awake_at_gp_end, cpu) = 1; - local_irq_restore(flags); - trace_rcu_prep_idle("Dyntick with callbacks"); - return; /* Nothing to do immediately. */ - } /* * If in holdoff mode, just return. We will presumably have @@ -2067,7 +2040,6 @@ static void rcu_prepare_for_idle(int cpu) if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* First time through, initialize the counter. */ per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; - per_cpu(rcu_awake_at_gp_end, cpu) = 0; } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; @@ -2113,50 +2085,4 @@ static void rcu_prepare_for_idle(int cpu) } } -/* - * Wake up a CPU by invoking the RCU core. Intended for use by - * rcu_wake_cpus_for_gp_end(), which passes this function to - * smp_call_function_single(). - */ -static void rcu_wake_cpu(void *unused) -{ - trace_rcu_prep_idle("CPU awakened at GP end"); - invoke_rcu_core(); -} - -/* - * If an RCU grace period ended recently, scan the rcu_awake_at_gp_end - * per-CPU variables, and wake up any CPUs that requested a wakeup. - */ -static void rcu_wake_cpus_for_gp_end(void) -{ - int cpu; - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - if (!rdtp->wake_gp_end) - return; - rdtp->wake_gp_end = 0; - for_each_online_cpu(cpu) { - if (per_cpu(rcu_awake_at_gp_end, cpu)) { - per_cpu(rcu_awake_at_gp_end, cpu) = 0; - smp_call_function_single(cpu, rcu_wake_cpu, NULL, 0); - } - } -} - -/* - * A grace period has just ended, and so we will need to awaken CPUs - * that now have work to do. 
But we cannot send IPIs with interrupts - * disabled, so just set a flag so that this will happen upon exit - * from RCU core processing. - */ -static void rcu_schedule_wake_gp_end(void) -{ - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - rdtp->wake_gp_end = 1; -} - -/* @@@ need tracing as well. */ - #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -- cgit v1.2.3-70-g09d2 From c92b131bdcf89bf79870f1631d07547241a98f6c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Nov 2011 13:38:58 -0800 Subject: rcu: Remove dynticks false positives and RCU failures Assertions in rcu_init_percpu_data() unknowingly relied on outgoing CPUs being turned off before reaching the idle loop. Unfortunately, when running under kvm/qemu on x86, CPUs really can get to idle before begin shut off. These CPUs are then born in dyntick-idle mode from an RCU perspective, which results in splats in rcu_init_percpu_data() and in RCU wrongly ignoring those CPUs despite them being active. This in turn can cause RCU to end grace periods prematurely, potentially freeing up memory that the newly onlined CPUs were still using. This is most decidedly not what we need to see in an RCU implementation. This commit therefore replaces the assertions in rcu_init_percpu_data() with code that forces RCU's dyntick-idle view of newly onlined CPUs to match reality. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 13fab4a9f9fb..aab9ed504b17 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2054,8 +2054,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); - WARN_ON_ONCE((atomic_read(&rdp->dynticks->dynticks) & 0x1) != 1); + rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; + atomic_set(&rdp->dynticks->dynticks, + (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* -- cgit v1.2.3-70-g09d2 From f0e7c19db8798b4b991a2c71911e71f5dfdb348f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Nov 2011 15:02:05 -0800 Subject: rcu: Identify dyntick-idle CPUs on first force_quiescent_state() pass Fixes and workarounds for a number of issues (for example, that in df4012edc) make it safe to once again detect dyntick-idle CPUs on the first pass of force_quiescent_state(), so this commit makes that change. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index aab9ed504b17..69bb37287cc8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -607,7 +607,7 @@ int rcu_is_cpu_rrupt_from_idle(void) static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); - return 0; + return (rdp->dynticks_snap & 0x1) == 0; } /* -- cgit v1.2.3-70-g09d2 From 7cb92499000e3c86dae653077b1465458a039ef6 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 28 Nov 2011 12:28:34 -0800 Subject: rcu: Permit dyntick-idle with callbacks pending The current implementation of RCU_FAST_NO_HZ prevents CPUs from entering dyntick-idle state if they have RCU callbacks pending. Unfortunately, this has the side-effect of often preventing them from entering this state, especially if at least one other CPU is not in dyntick-idle state. However, the resulting per-tick wakeup is wasteful in many cases: if the CPU has already fully responded to the current RCU grace period, there will be nothing for it to do until this grace period ends, which will frequently take several jiffies. This commit therefore permits a CPU that has done everything that the current grace period has asked of it (rcu_pending() == 0) even if it still as RCU callbacks pending. However, such a CPU posts a timer to wake it up several jiffies later (6 jiffies, based on experience with grace-period lengths). This wakeup is required to handle situations that can result in all CPUs being in dyntick-idle mode, thus failing to ever complete the current grace period. If a CPU wakes up before the timer goes off, then it cancels that timer, thus avoiding spurious wakeups. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 3 +- kernel/rcutree.c | 3 ++ kernel/rcutree.h | 2 ++ kernel/rcutree_plugin.h | 75 +++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 78 insertions(+), 5 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 8dd6fcb94946..c75418c3ccb8 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -288,9 +288,10 @@ TRACE_EVENT(rcu_dyntick, * "No callbacks": Nothing to do, no callbacks on this CPU. * "In holdoff": Nothing to do, holding off after unsuccessful attempt. * "Begin holdoff": Attempt failed, don't retry until next jiffy. + * "Dyntick with callbacks": Entering dyntick-idle despite callbacks. 
* "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! - * "CPU awakened at GP end": + * "Timer": Timer fired to cause CPU to continue processing callbacks. */ TRACE_EVENT(rcu_prep_idle, diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 69bb37287cc8..bf085d7f6a3f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -448,6 +448,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); @@ -2057,6 +2058,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); + rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ /* @@ -2138,6 +2140,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_send_cbs_to_online(&rcu_bh_state); rcu_send_cbs_to_online(&rcu_sched_state); rcu_preempt_send_cbs_to_online(); + rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 70d8a557090f..9bcfbc9d16c6 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -467,6 +467,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg); #endif /* #ifdef CONFIG_RCU_BOOST */ static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); +static void rcu_prepare_for_idle_init(int cpu); +static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 42ca5a400ae3..dbcea6b93aea 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1947,15 +1947,29 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); * 1 if so. This function is part of the RCU implementation; it is -not- * an exported member of the RCU API. * - * Because we have preemptible RCU, just check whether this CPU needs - * any flavor of RCU. Do not chew up lots of CPU cycles with preemption - * disabled in a most-likely vain attempt to cause RCU not to need this CPU. + * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs + * any flavor of RCU. */ int rcu_needs_cpu(int cpu) { return rcu_cpu_has_callbacks(cpu); } +/* + * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. + */ +static void rcu_prepare_for_idle_init(int cpu) +{ +} + +/* + * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up + * after it. + */ +static void rcu_cleanup_after_idle(int cpu) +{ +} + /* * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, * is nothing. 
@@ -1966,9 +1980,12 @@ static void rcu_prepare_for_idle(int cpu) #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#define RCU_NEEDS_CPU_FLUSHES 5 +#define RCU_NEEDS_CPU_FLUSHES 5 /* Allow for callback self-repost. */ +#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); +static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); +static ktime_t rcu_idle_gp_wait; /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -1988,6 +2005,47 @@ int rcu_needs_cpu(int cpu) return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; } +/* + * Timer handler used to force CPU to start pushing its remaining RCU + * callbacks in the case where it entered dyntick-idle mode with callbacks + * pending. The hander doesn't really need to do anything because the + * real work is done upon re-entry to idle, or by the next scheduling-clock + * interrupt should idle not be re-entered. + */ +static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) +{ + trace_rcu_prep_idle("Timer"); + return HRTIMER_NORESTART; +} + +/* + * Initialize the timer used to pull CPUs out of dyntick-idle mode. + */ +static void rcu_prepare_for_idle_init(int cpu) +{ + static int firsttime = 1; + struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + + hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtp->function = rcu_idle_gp_timer_func; + if (firsttime) { + unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); + + rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); + firsttime = 0; + } +} + +/* + * Clean up for exit from idle. Because we are exiting from idle, there + * is no longer any point to rcu_idle_gp_timer, so cancel it. This will + * do nothing if this timer is not active, so just cancel it unconditionally. 
+ */ +static void rcu_cleanup_after_idle(int cpu) +{ + hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); +} + /* * Check to see if any RCU-related work can be done by the current CPU, * and if so, schedule a softirq to get it done. This function is part @@ -2040,6 +2098,15 @@ static void rcu_prepare_for_idle(int cpu) /* First time through, initialize the counter. */ per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* Can we go dyntick-idle despite still having callbacks? */ + if (!rcu_pending(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_wait, HRTIMER_MODE_REL); + return; /* Nothing more to do immediately. */ + } + /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; local_irq_restore(flags); -- cgit v1.2.3-70-g09d2 From b6fc6020140db437069d5bec447858fcfd64d31c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 28 Nov 2011 16:18:56 -0800 Subject: rcu: Don't check irq nesting from rcu idle entry/exit Because tasks do not nest, rcu_idle_enter() and rcu_idle_exit() do not need to check for nesting. This commit therefore moves nesting checks from rcu_idle_enter_common() to rcu_irq_exit() and from rcu_idle_exit_common() to rcu_irq_enter(). Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index bf085d7f6a3f..860c02c7c959 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -350,10 +350,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) */ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) { - if (rdtp->dynticks_nesting) { - trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); - return; - } trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); @@ -426,7 +422,10 @@ void rcu_irq_exit(void) oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; WARN_ON_ONCE(rdtp->dynticks_nesting < 0); - rcu_idle_enter_common(rdtp, oldval); + if (rdtp->dynticks_nesting) + trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); + else + rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); } @@ -439,10 +438,6 @@ void rcu_irq_exit(void) */ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) { - if (oldval) { - trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); - return; - } smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. 
*/ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ @@ -518,7 +513,10 @@ void rcu_irq_enter(void) oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(rdtp->dynticks_nesting == 0); - rcu_idle_exit_common(rdtp, oldval); + if (oldval) + trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); + else + rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } -- cgit v1.2.3-70-g09d2 From facc4e159672b4ed10aa18147bfa187b013c9505 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 28 Nov 2011 16:26:56 -0800 Subject: rcu: Irq nesting is always 0 on rcu_enter_idle_common Because tasks don't nest, the ->dyntick_nesting must always be zero upon entry to rcu_idle_enter_common(). Therefore, pass "0" rather than the counter itself. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 860c02c7c959..c0ed3765ec39 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -350,12 +350,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) */ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) { - trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick("Start", oldval, 0); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - trace_rcu_dyntick("Error on entry: not idle task", - oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, -- cgit v1.2.3-70-g09d2 From dff1672d9199fffddb58fa7970ccf59005fc35f3 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 29 Nov 2011 15:57:13 -0800 Subject: rcu: Keep invoking callbacks if CPU otherwise idle The rcu_do_batch() function that invokes callbacks for TREE_RCU and TREE_PREEMPT_RCU normally throttles callback invocation to avoid degrading scheduling latency. However, as long as the CPU would otherwise be idle, there is no downside to continuing to invoke any callbacks that have passed through their grace periods. In fact, processing such callbacks in a timely manner has the benefit of increasing the probability that the CPU can enter the power-saving dyntick-idle mode. Therefore, this commit allows callback invocation to continue beyond the preset limit as long as the scheduler does not have some other task to run and as long as context is that of the idle task or the relevant RCU kthread. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 5 ++++- kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 14 ++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c0ed3765ec39..4ec4b14cfba6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1403,7 +1403,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) debug_rcu_head_unqueue(list); __rcu_reclaim(rsp->name, list); list = next; - if (++count >= bl) + /* Stop only if limit reached and CPU has something to do. 
*/ + if (++count >= bl && + (need_resched() || + (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 9bcfbc9d16c6..fddff92d6676 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -455,6 +455,7 @@ static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); +static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index dbcea6b93aea..adb6e666c6f4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1336,6 +1336,15 @@ static void invoke_rcu_callbacks_kthread(void) local_irq_restore(flags); } +/* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. + */ +static bool rcu_is_callbacks_kthread(void) +{ + return __get_cpu_var(rcu_cpu_kthread_task) == current; +} + /* * Set the affinity of the boost kthread. The CPU-hotplug locks are * held, so no one should be messing with the existence of the boost @@ -1780,6 +1789,11 @@ static void invoke_rcu_callbacks_kthread(void) WARN_ON_ONCE(1); } +static bool rcu_is_callbacks_kthread(void) +{ + return false; +} + static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } -- cgit v1.2.3-70-g09d2 From 2d1dc9a600edf33321bcdc1c808b7957d8a3f3e1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Nov 2011 17:29:18 -0800 Subject: rcu: Remove redundant rcu_cpu_stall_suppress declaration No point in having two identical rcu_cpu_stall_suppress declarations, so remove the more obscure of the two. Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4ec4b14cfba6..2b2e1a996a65 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -642,8 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ -int rcu_cpu_stall_suppress __read_mostly; - static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; -- cgit v1.2.3-70-g09d2 From 4968c300e1fa5389fdf1f1ebd8b8e4aec9aa4a9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Dec 2011 16:32:40 -0800 Subject: rcu: Augment rcu_batch_end tracing for idle and callback state The current rcu_batch_end event trace records only the name of the RCU flavor and the total number of callbacks that remain queued on the current CPU. This is insufficient for testing and tuning the new dyntick-idle RCU_FAST_NO_HZ code, so this commit adds idle state along with whether or not any of the callbacks that were ready to invoke at the beginning of rcu_do_batch() are still queued. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 38 +++++++++++++++++++++++++++++--------- kernel/rcutiny.c | 10 ++++++++-- kernel/rcutiny_plugin.h | 25 +++++++++++++++++++++++++ kernel/rcutree.c | 8 ++++++-- 4 files changed, 68 insertions(+), 13 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index c75418c3ccb8..d2d88bed891b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -461,27 +461,46 @@ TRACE_EVENT(rcu_invoke_kfree_callback, /* * Tracepoint for exiting rcu_do_batch after RCU callbacks have been - * invoked. The first argument is the name of the RCU flavor and - * the second argument is number of callbacks actually invoked. + * invoked. 
The first argument is the name of the RCU flavor, + * the second argument is number of callbacks actually invoked, + * the third argument (cb) is whether or not any of the callbacks that + * were ready to invoke at the beginning of this batch are still + * queued, the fourth argument (nr) is the return value of need_resched(), + * the fifth argument (iit) is 1 if the current task is the idle task, + * and the sixth argument (risk) is the return value from + * rcu_is_callbacks_kthread(). */ TRACE_EVENT(rcu_batch_end, - TP_PROTO(char *rcuname, int callbacks_invoked), + TP_PROTO(char *rcuname, int callbacks_invoked, + bool cb, bool nr, bool iit, bool risk), - TP_ARGS(rcuname, callbacks_invoked), + TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk), TP_STRUCT__entry( __field(char *, rcuname) __field(int, callbacks_invoked) + __field(bool, cb) + __field(bool, nr) + __field(bool, iit) + __field(bool, risk) ), TP_fast_assign( __entry->rcuname = rcuname; __entry->callbacks_invoked = callbacks_invoked; - ), - - TP_printk("%s CBs-invoked=%d", - __entry->rcuname, __entry->callbacks_invoked) + __entry->cb = cb; + __entry->nr = nr; + __entry->iit = iit; + __entry->risk = risk; + ), + + TP_printk("%s CBs-invoked=%d idle=%c%c%c%c", + __entry->rcuname, __entry->callbacks_invoked, + __entry->cb ? 'C' : '.', + __entry->nr ? 'S' : '.', + __entry->iit ? 'I' : '.', + __entry->risk ? 
'R' : '.') ); /* @@ -524,7 +543,8 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) -#define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0) +#define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \ + do { } while (0) #define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) #endif /* #else #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e5bd94954fa3..977296dca0a4 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -259,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) /* If no RCU callbacks ready to invoke, just return. */ if (&rcp->rcucblist == rcp->donetail) { RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, + ACCESS_ONCE(rcp->rcucblist), + need_resched(), + is_idle_task(current), + rcu_is_callbacks_kthread())); return; } @@ -288,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(cb_count++); } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), + is_idle_task(current), + rcu_is_callbacks_kthread())); } static void rcu_process_callbacks(struct softirq_action *unused) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 2b0484a5dc28..dfa97cbb3910 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void) wake_up(&rcu_kthread_wq); } +#ifdef CONFIG_RCU_TRACE + +/* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. 
+ */ +static bool rcu_is_callbacks_kthread(void) +{ + return rcu_kthread_task == current; +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + /* * This kthread invokes RCU callbacks whose grace periods have * elapsed. It is awakened as needed, and takes the place of the @@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void) raise_softirq(RCU_SOFTIRQ); } +#ifdef CONFIG_RCU_TRACE + +/* + * There is no callback kthread, so this thread is never it. + */ +static bool rcu_is_callbacks_kthread(void) +{ + return false; +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + void rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2b2e1a996a65..6c4a6722abfd 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1373,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { trace_rcu_batch_start(rsp->name, 0, 0); - trace_rcu_batch_end(rsp->name, 0); + trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), + need_resched(), is_idle_task(current), + rcu_is_callbacks_kthread()); return; } @@ -1409,7 +1411,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } local_irq_save(flags); - trace_rcu_batch_end(rsp->name, count); + trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), + is_idle_task(current), + rcu_is_callbacks_kthread()); /* Update count, and requeue any remaining callbacks. */ rdp->qlen -= count; -- cgit v1.2.3-70-g09d2