diff options
author | Frederic Weisbecker <frederic@kernel.org> | 2024-01-12 16:46:21 +0100 |
---|---|---|
committer | Boqun Feng <boqun.feng@gmail.com> | 2024-02-14 07:51:36 -0800 |
commit | 23da2ad64dbe9f3fab10af90484fe41e144337b1 (patch) | |
tree | dbdfc8b44dbb2d048b0d35b005e3c5e500f136ff /kernel/rcu/tree.c | |
parent | b67cffcbbf9dc759d95d330a5af5d1480af2b1f1 (diff) |
rcu/exp: Remove rcu_par_gp_wq
TREE04 running on short iterations can produce writer stalls of the
following kind:
??? Writer stall state RTWS_EXP_SYNC(4) g3968 f0x0 ->state 0x2 cpu 0
task:rcu_torture_wri state:D stack:14568 pid:83 ppid:2 flags:0x00004000
Call Trace:
<TASK>
__schedule+0x2de/0x850
? trace_event_raw_event_rcu_exp_funnel_lock+0x6d/0xb0
schedule+0x4f/0x90
synchronize_rcu_expedited+0x430/0x670
? __pfx_autoremove_wake_function+0x10/0x10
? __pfx_synchronize_rcu_expedited+0x10/0x10
do_rtws_sync.constprop.0+0xde/0x230
rcu_torture_writer+0x4b4/0xcd0
? __pfx_rcu_torture_writer+0x10/0x10
kthread+0xc7/0xf0
? __pfx_kthread+0x10/0x10
ret_from_fork+0x2f/0x50
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1b/0x30
</TASK>
Waiting for an expedited grace period and polling for an expedited
grace period both are operations that internally rely on the same
workqueue performing necessary asynchronous work.
However, a dependency chain is involved between those two operations,
as depicted below:
====== CPU 0 ======= ====== CPU 1 =======
synchronize_rcu_expedited()
exp_funnel_lock()
mutex_lock(&rcu_state.exp_mutex);
start_poll_synchronize_rcu_expedited
queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
synchronize_rcu_expedited_queue_work()
queue_work(rcu_gp_wq, &rew->rew_work);
wait_event() // A, wait for &rew->rew_work completion
mutex_unlock() // B
//======> switch to kworker
sync_rcu_do_polled_gp() {
synchronize_rcu_expedited()
exp_funnel_lock()
mutex_lock(&rcu_state.exp_mutex); // C, wait B
....
} // D
Since workqueues are usually implemented on top of several kworkers
handling the queue concurrently, the above situation wouldn't deadlock
most of the time because A then doesn't depend on D. But in case of
memory stress, a single kworker may end up handling alone all the works
in a serialized way. In that case the above layout becomes a problem
because A then waits for D, closing a circular dependency:
A -> D -> C -> B -> A
This however only happens when CONFIG_RCU_EXP_KTHREAD=n. Indeed
synchronize_rcu_expedited() is otherwise implemented on top of a kthread
worker while polling still relies on rcu_gp_wq workqueue, breaking the
above circular dependency chain.
Fix this with making expedited grace period to always rely on kthread
worker. The workqueue based implementation is essentially a duplicate
anyway now that the per-node initialization is performed by per-node
kthread workers.
Meanwhile the CONFIG_RCU_EXP_KTHREAD switch is still kept around to
manage the scheduler policy of these kthread workers.
Reported-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Joel Fernandes <joel@joelfernandes.org>
Suggested-by: Paul E. McKenney <paulmck@kernel.org>
Suggested-by: Neeraj upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r-- | kernel/rcu/tree.c | 40 |
1 files changed, 6 insertions, 34 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 312c4c5d4509..9591c22408a1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4394,7 +4394,6 @@ rcu_boot_init_percpu_data(int cpu) rcu_boot_init_nocb_percpu_data(rdp); } -#ifdef CONFIG_RCU_EXP_KTHREAD struct kthread_worker *rcu_exp_gp_kworker; static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) @@ -4414,7 +4413,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) return; } WRITE_ONCE(rnp->exp_kworker, kworker); - sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); + + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); } static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) @@ -4438,39 +4439,14 @@ static void __init rcu_start_exp_gp_kworker(void) rcu_exp_gp_kworker = NULL; return; } - sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); -} - -static inline void rcu_alloc_par_gp_wq(void) -{ -} -#else /* !CONFIG_RCU_EXP_KTHREAD */ -struct workqueue_struct *rcu_par_gp_wq; - -static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) -{ -} - -static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) -{ - return NULL; -} - -static void __init rcu_start_exp_gp_kworker(void) -{ -} -static inline void rcu_alloc_par_gp_wq(void) -{ - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); } -#endif /* CONFIG_RCU_EXP_KTHREAD */ static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp) { - if ((IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) || - IS_ENABLED(CONFIG_RCU_BOOST)) && rcu_scheduler_fully_active) { + if (rcu_scheduler_fully_active) { mutex_lock(&rnp->kthread_mutex); rcu_spawn_one_boost_kthread(rnp); rcu_spawn_exp_par_gp_kworker(rnp); @@ -4554,9 +4530,6 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) struct rcu_node *rnp; struct task_struct *task_boost, *task_exp; - if (!IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) && !IS_ENABLED(CONFIG_RCU_BOOST)) - return; - rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; @@ -5245,7 +5218,6 @@ void __init rcu_init(void) /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); - rcu_alloc_par_gp_wq(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ |