From e2c73a6860bdf54f2c6bf8cddc34ddc91a1343e1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Mon, 27 Sep 2021 14:18:51 -0700
Subject: rcu: Remove the RCU_FAST_NO_HZ Kconfig option

All of the uses of CONFIG_RCU_FAST_NO_HZ=y that I have seen involve
systems with RCU callbacks offloaded.  In this situation, all that this
Kconfig option does is slow down idle entry/exit with an additional
always-taken early exit.  If this is the only use case, then this
Kconfig option is nothing but an attractive nuisance that needs to go
away.

This commit therefore removes the RCU_FAST_NO_HZ Kconfig option.

Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree.h | 7 -------
 1 file changed, 7 deletions(-)
(limited to 'kernel/rcu/tree.h')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 305cf6aeb408..7ca1aa46083c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -189,11 +189,6 @@ struct rcu_data {
 	bool rcu_urgent_qs;		/* GP old need light quiescent state. */
 	bool rcu_forced_tick;		/* Forced tick to provide QS. */
 	bool rcu_forced_tick_exp;	/* ... provide QS to expedited GP. */
-#ifdef CONFIG_RCU_FAST_NO_HZ
-	unsigned long last_accelerate;	/* Last jiffy CBs were accelerated. */
-	unsigned long last_advance_all;	/* Last jiffy CBs were all advanced. */
-	int tick_nohz_enabled_snap;	/* Previously seen value from sysfs. */
-#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 
 	/* 4) rcu_barrier(), OOM callbacks, and expediting. */
 	struct rcu_head barrier_head;
@@ -419,8 +414,6 @@ static bool rcu_is_callbacks_kthread(void);
 static void rcu_cpu_kthread_setup(unsigned int cpu);
 static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
 static void __init rcu_spawn_boost_kthreads(void);
-static void rcu_cleanup_after_idle(void);
-static void rcu_prepare_for_idle(void);
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
 static void rcu_preempt_deferred_qs(struct task_struct *t);
--
cgit v1.2.3-70-g09d2

From 6120b72e25e195b6fa15b0a674479a38166c392a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 16 Sep 2021 14:10:48 +0200
Subject: rcu: Remove rcu_data.exp_deferred_qs and convert to rcu_data.cpu_no_qs.b.exp

Having two fields for the same purpose with subtle differences on
different RCU flavours is confusing, especially when both fields always
exist on both RCU flavours.

Fortunately, it is now safe for preemptible RCU to rely on the rcu_data
structure's ->cpu_no_qs.b.exp field, just like non-preemptible RCU.
This commit therefore removes the ad-hoc ->exp_deferred_qs field.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree.h        |  1 -
 kernel/rcu/tree_exp.h    |  5 ++---
 kernel/rcu/tree_plugin.h | 12 ++++++------
 3 files changed, 8 insertions(+), 10 deletions(-)
(limited to 'kernel/rcu/tree.h')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 305cf6aeb408..ea46ed40f6bc 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -157,7 +157,6 @@ struct rcu_data {
 	bool core_needs_qs;		/* Core waits for quiescent state. */
 	bool beenonline;		/* CPU online at least once. */
 	bool gpwrap;			/* Possible ->gp_seq wrap. */
-	bool exp_deferred_qs;		/* This CPU awaiting a deferred QS? */
 	bool cpu_started;		/* RCU watching this onlining CPU. */
 	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
 	unsigned long grpmask;		/* Mask to apply to leaf qsmask. */
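For context, the field that takes over here belongs to the rcu_data
structure's ->cpu_no_qs member, a union that packs the normal and
expedited quiescent-state needs into a single 16-bit quantity.  A
paraphrased sketch of that union as it appears in kernel/rcu/tree.h
(comments condensed; recalled rather than quoted from this series):

	union rcu_noqs {
		struct {
			u8 norm;	/* Normal GP needs a QS from this CPU. */
			u8 exp;		/* Expedited GP needs a QS, too. */
		} b;			/* Individual bits. */
		u16 s;			/* Both bits at once, for aggregate OR. */
	};

This is why the single ->cpu_no_qs.b.exp flag can absorb the ad-hoc
->exp_deferred_qs field in the hunks that follow.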
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6c6eb3220385..fc2ee326a6f7 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -255,7 +255,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
  */
 static void rcu_report_exp_rdp(struct rcu_data *rdp)
 {
-	WRITE_ONCE(rdp->exp_deferred_qs, false);
 	WRITE_ONCE(rdp->cpu_no_qs.b.exp, false);
 	rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
 }
@@ -656,7 +655,7 @@ static void rcu_exp_handler(void *unused)
 	    rcu_dynticks_curr_cpu_in_eqs()) {
 		rcu_report_exp_rdp(rdp);
 	} else {
-		rdp->exp_deferred_qs = true;
+		WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
 		set_tsk_need_resched(t);
 		set_preempt_need_resched();
 	}
@@ -678,7 +677,7 @@ static void rcu_exp_handler(void *unused)
 	if (depth > 0) {
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		if (rnp->expmask & rdp->grpmask) {
-			rdp->exp_deferred_qs = true;
+			WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
 			t->rcu_read_unlock_special.b.exp_hint = true;
 		}
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6d58b75d2782..e1a9fb96e0b9 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -260,10 +260,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
 	 * no need to check for a subsequent expedited GP.  (Though we are
 	 * still in a quiescent state in any case.)
 	 */
-	if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
+	if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
 		rcu_report_exp_rdp(rdp);
 	else
-		WARN_ON_ONCE(rdp->exp_deferred_qs);
+		WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
 }
 
 /*
@@ -354,7 +354,7 @@ void rcu_note_context_switch(bool preempt)
 	 * means that we continue to block the current grace period.
 	 */
 	rcu_qs();
-	if (rdp->exp_deferred_qs)
+	if (rdp->cpu_no_qs.b.exp)
 		rcu_report_exp_rdp(rdp);
 	rcu_tasks_qs(current, preempt);
 	trace_rcu_utilization(TPS("End context switch"));
@@ -481,7 +481,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 	 */
 	special = t->rcu_read_unlock_special;
 	rdp = this_cpu_ptr(&rcu_data);
-	if (!special.s && !rdp->exp_deferred_qs) {
+	if (!special.s && !rdp->cpu_no_qs.b.exp) {
 		local_irq_restore(flags);
 		return;
 	}
@@ -501,7 +501,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 	 * tasks are handled when removing the task from the
 	 * blocked-tasks list below.
 	 */
-	if (rdp->exp_deferred_qs)
+	if (rdp->cpu_no_qs.b.exp)
 		rcu_report_exp_rdp(rdp);
 
 	/* Clean up if blocked during RCU read-side critical section. */
@@ -584,7 +584,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
  */
 static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 {
-	return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
+	return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
 		READ_ONCE(t->rcu_read_unlock_special.s)) &&
 	       rcu_preempt_depth() == 0;
 }
--
cgit v1.2.3-70-g09d2

From 118e0d4a1bc85d4ecea0427e440a72d21ffbfa6a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 11 Oct 2021 16:51:30 +0200
Subject: rcu/nocb: Make local rcu_nocb_lock_irqsave() safe against concurrent deoffloading

rcu_nocb_lock_irqsave() can be preempted between the call to
rcu_segcblist_is_offloaded() and the actual locking.  This matters now
that rcu_core() is preemptible on PREEMPT_RT and the (de-)offloading
process can interrupt the softirq or the rcuc kthread.

As a result we may locklessly call into code that requires nocb
locking.  In practice this is a problem while we accelerate callbacks
on rcu_core().
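To make the race window concrete, here is the pre-fix ordering in
condensed form (taken from the old macro in the diff below, with the
hazard marked by added comments):

	/* Old rcu_nocb_lock_irqsave(), condensed: */
	if (!rcu_segcblist_is_offloaded(&(rdp)->cblist))
		/* Preemption here lets the offload state change, so this
		 * context may go on to run code that expects ->nocb_lock
		 * to be held without actually holding it. */
		local_irq_save(flags);
	else
		raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags));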
Simply disabling interrupts before (instead of after) checking the NOCB
offload state fixes the issue.

Reported-and-tested-by: Valentin Schneider
Tested-by: Sebastian Andrzej Siewior
Signed-off-by: Frederic Weisbecker
Cc: Valentin Schneider
Cc: Peter Zijlstra
Cc: Sebastian Andrzej Siewior
Cc: Josh Triplett
Cc: Joel Fernandes
Cc: Boqun Feng
Cc: Neeraj Upadhyay
Cc: Uladzislau Rezki
Cc: Thomas Gleixner
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)
(limited to 'kernel/rcu/tree.h')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 305cf6aeb408..4f6c67b3ccd5 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -447,12 +447,16 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
 static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
 #ifdef CONFIG_RCU_NOCB_CPU
 static void __init rcu_organize_nocb_kthreads(void);
-#define rcu_nocb_lock_irqsave(rdp, flags)				\
-do {									\
-	if (!rcu_segcblist_is_offloaded(&(rdp)->cblist))		\
-		local_irq_save(flags);					\
-	else								\
-		raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags));	\
+
+/*
+ * Disable IRQs before checking offloaded state so that local
+ * locking is safe against concurrent de-offloading.
+ */
+#define rcu_nocb_lock_irqsave(rdp, flags)			\
+do {								\
+	local_irq_save(flags);					\
+	if (rcu_segcblist_is_offloaded(&(rdp)->cblist))		\
+		raw_spin_lock(&(rdp)->nocb_lock);		\
 } while (0)
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
 #define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
--
cgit v1.2.3-70-g09d2

From 2ebc45c44c4f3cc4c757430b2409ece4f976892e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 23 Nov 2021 01:37:03 +0100
Subject: rcu/nocb: Remove rcu_node structure from nocb list when de-offloaded

The nocb_gp_wait() function iterates over all CPUs in its group,
including even those CPUs that have been de-offloaded.  This is of
course suboptimal, especially if none of the CPUs within the group are
currently offloaded.  This will become even more of a problem once a
nocb kthread is created for all possible CPUs.

Therefore use a standard doubly linked list to link all the offloaded
rcu_data structures and safely add or delete these structures as we
offload or de-offload them, respectively.

Reviewed-by: Neeraj Upadhyay
Signed-off-by: Frederic Weisbecker
Cc: Boqun Feng
Cc: Uladzislau Rezki
Cc: Josh Triplett
Cc: Joel Fernandes
Tested-by: Juri Lelli
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree.h      |  7 +++++--
 kernel/rcu/tree_nocb.h | 45 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 43 insertions(+), 9 deletions(-)
(limited to 'kernel/rcu/tree.h')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4f6c67b3ccd5..5884380f4039 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -227,8 +227,11 @@ struct rcu_data {
 	struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
 	bool nocb_cb_sleep;		/* Is the nocb CB thread asleep? */
 	struct task_struct *nocb_cb_kthread;
-	struct rcu_data *nocb_next_cb_rdp;
-					/* Next rcu_data in wakeup chain. */
+	struct list_head nocb_head_rdp; /*
+					 * Head of rcu_data list in wakeup chain,
+					 * if rdp_gp.
+					 */
+	struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
 
 	/* The following fields are used by CB kthread, hence new cacheline. */
 	struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 2461fe8d0c23..8e94a5344afe 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -625,7 +625,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 	 * and the global grace-period kthread are awakened if needed.
 	 */
 	WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
-	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+	/*
+	 * An rcu_data structure is removed from the list after its
+	 * CPU is de-offloaded and added to the list before that CPU is
+	 * (re-)offloaded.  If the following loop happens to be referencing
+	 * that rcu_data structure during the time that the corresponding
+	 * CPU is de-offloaded and then immediately re-offloaded, this
+	 * loop's rdp pointer will be carried to the end of the list by
+	 * the resulting pair of list operations.  This can cause the loop
+	 * to skip over some of the rcu_data structures that were supposed
+	 * to have been scanned.  Fortunately a new iteration through the
+	 * entire loop is forced after a given CPU's rcu_data structure
+	 * is added to the list, so the skipped-over rcu_data structures
+	 * won't be ignored for long.
+	 */
+	list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
 		bool needwake_state = false;
 
 		if (!nocb_gp_enabled_cb(rdp))
@@ -1003,6 +1017,8 @@ static long rcu_nocb_rdp_deoffload(void *arg)
 	swait_event_exclusive(rdp->nocb_state_wq,
 			      !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
 							SEGCBLIST_KTHREAD_GP));
+	/* Stop nocb_gp_wait() from iterating over this structure. */
+	list_del_rcu(&rdp->nocb_entry_rdp);
 	/*
 	 * Lock one last time to acquire latest callback updates from kthreads
 	 * so we can later handle callbacks locally without locking.
@@ -1066,6 +1082,17 @@ static long rcu_nocb_rdp_offload(void *arg)
 		return -EINVAL;
 
 	pr_info("Offloading %d\n", rdp->cpu);
+
+	/*
+	 * Cause future nocb_gp_wait() invocations to iterate over this
+	 * structure, resetting ->nocb_gp_sleep and waking up the related
+	 * "rcuog".  Since nocb_gp_wait() in turn locks ->nocb_gp_lock
+	 * before setting ->nocb_gp_sleep again, we are guaranteed to
+	 * iterate this newly added structure before "rcuog" goes to
+	 * sleep again.
+	 */
+	list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp);
+
 	/*
 	 * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
 	 * is set.
@@ -1268,7 +1295,6 @@ static void __init rcu_organize_nocb_kthreads(void)
 	int nl = 0;  /* Next GP kthread. */
 	struct rcu_data *rdp;
 	struct rcu_data *rdp_gp = NULL;  /* Suppress misguided gcc warn. */
-	struct rcu_data *rdp_prev = NULL;
 
 	if (!cpumask_available(rcu_nocb_mask))
 		return;
@@ -1288,8 +1314,8 @@ static void __init rcu_organize_nocb_kthreads(void)
 			/* New GP kthread, set up for CBs & next GP. */
 			gotnocbs = true;
 			nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
-			rdp->nocb_gp_rdp = rdp;
 			rdp_gp = rdp;
+			INIT_LIST_HEAD(&rdp->nocb_head_rdp);
 			if (dump_tree) {
 				if (!firsttime)
 					pr_cont("%s\n", gotnocbscbs
@@ -1302,12 +1328,11 @@ static void __init rcu_organize_nocb_kthreads(void)
 		} else {
 			/* Another CB kthread, link to previous GP kthread. */
 			gotnocbscbs = true;
-			rdp->nocb_gp_rdp = rdp_gp;
-			rdp_prev->nocb_next_cb_rdp = rdp;
 			if (dump_tree)
 				pr_cont(" %d", cpu);
 		}
-		rdp_prev = rdp;
+		rdp->nocb_gp_rdp = rdp_gp;
+		list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp);
 	}
 	if (gotnocbs && dump_tree)
 		pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
"" : " (self only)"); @@ -1369,6 +1394,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) { char bufw[20]; char bufr[20]; + struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; bool wassleep; @@ -1376,11 +1402,16 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); + nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp, + &rdp->nocb_entry_rdp, + typeof(*rdp), + nocb_entry_rdp); + sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, - rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1, + nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], "cC"[!!atomic_read(&rdp->nocb_lock_contended)], -- cgit v1.2.3-70-g09d2