summaryrefslogtreecommitdiff
path: root/kernel/rcu/tree.c
diff options
context:
space:
mode:
authorFrederic Weisbecker <frederic@kernel.org>2024-01-12 16:46:21 +0100
committerBoqun Feng <boqun.feng@gmail.com>2024-02-14 07:51:36 -0800
commit23da2ad64dbe9f3fab10af90484fe41e144337b1 (patch)
treedbdfc8b44dbb2d048b0d35b005e3c5e500f136ff /kernel/rcu/tree.c
parentb67cffcbbf9dc759d95d330a5af5d1480af2b1f1 (diff)
rcu/exp: Remove rcu_par_gp_wq
TREE04 running on short iterations can produce writer stalls of the following kind: ??? Writer stall state RTWS_EXP_SYNC(4) g3968 f0x0 ->state 0x2 cpu 0 task:rcu_torture_wri state:D stack:14568 pid:83 ppid:2 flags:0x00004000 Call Trace: <TASK> __schedule+0x2de/0x850 ? trace_event_raw_event_rcu_exp_funnel_lock+0x6d/0xb0 schedule+0x4f/0x90 synchronize_rcu_expedited+0x430/0x670 ? __pfx_autoremove_wake_function+0x10/0x10 ? __pfx_synchronize_rcu_expedited+0x10/0x10 do_rtws_sync.constprop.0+0xde/0x230 rcu_torture_writer+0x4b4/0xcd0 ? __pfx_rcu_torture_writer+0x10/0x10 kthread+0xc7/0xf0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 </TASK> Waiting for an expedited grace period and polling for an expedited grace period both are operations that internally rely on the same workqueue performing necessary asynchronous work. However, a dependency chain is involved between those two operations, as depicted below: ====== CPU 0 ======= ====== CPU 1 ======= synchronize_rcu_expedited() exp_funnel_lock() mutex_lock(&rcu_state.exp_mutex); start_poll_synchronize_rcu_expedited queue_work(rcu_gp_wq, &rnp->exp_poll_wq); synchronize_rcu_expedited_queue_work() queue_work(rcu_gp_wq, &rew->rew_work); wait_event() // A, wait for &rew->rew_work completion mutex_unlock() // B //======> switch to kworker sync_rcu_do_polled_gp() { synchronize_rcu_expedited() exp_funnel_lock() mutex_lock(&rcu_state.exp_mutex); // C, wait B .... } // D Since workqueues are usually implemented on top of several kworkers handling the queue concurrently, the above situation wouldn't deadlock most of the time because A then doesn't depend on D. But in case of memory stress, a single kworker may end up handling alone all the works in a serialized way. In that case the above layout becomes a problem because A then waits for D, closing a circular dependency: A -> D -> C -> B -> A This however only happens when CONFIG_RCU_EXP_KTHREAD=n. Indeed synchronize_rcu_expedited() is otherwise implemented on top of a kthread worker while polling still relies on rcu_gp_wq workqueue, breaking the above circular dependency chain. Fix this with making expedited grace period to always rely on kthread worker. The workqueue based implementation is essentially a duplicate anyway now that the per-node initialization is performed by per-node kthread workers. Meanwhile the CONFIG_RCU_EXP_KTHREAD switch is still kept around to manage the scheduler policy of these kthread workers. Reported-by: Anna-Maria Behnsen <anna-maria@linutronix.de> Reported-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Joel Fernandes <joel@joelfernandes.org> Suggested-by: Paul E. McKenney <paulmck@kernel.org> Suggested-by: Neeraj upadhyay <Neeraj.Upadhyay@amd.com> Signed-off-by: Frederic Weisbecker <frederic@kernel.org> Reviewed-by: Paul E. McKenney <paulmck@kernel.org> Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--kernel/rcu/tree.c40
1 files changed, 6 insertions, 34 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 312c4c5d4509..9591c22408a1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4394,7 +4394,6 @@ rcu_boot_init_percpu_data(int cpu)
rcu_boot_init_nocb_percpu_data(rdp);
}
-#ifdef CONFIG_RCU_EXP_KTHREAD
struct kthread_worker *rcu_exp_gp_kworker;
static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
@@ -4414,7 +4413,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
return;
}
WRITE_ONCE(rnp->exp_kworker, kworker);
- sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
+
+ if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+ sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
}
static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
@@ -4438,39 +4439,14 @@ static void __init rcu_start_exp_gp_kworker(void)
rcu_exp_gp_kworker = NULL;
return;
}
- sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
-}
-
-static inline void rcu_alloc_par_gp_wq(void)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-struct workqueue_struct *rcu_par_gp_wq;
-
-static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
-{
-}
-
-static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
-{
- return NULL;
-}
-
-static void __init rcu_start_exp_gp_kworker(void)
-{
-}
-static inline void rcu_alloc_par_gp_wq(void)
-{
- rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
- WARN_ON(!rcu_par_gp_wq);
+ if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
}
-#endif /* CONFIG_RCU_EXP_KTHREAD */
static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp)
{
- if ((IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) ||
- IS_ENABLED(CONFIG_RCU_BOOST)) && rcu_scheduler_fully_active) {
+ if (rcu_scheduler_fully_active) {
mutex_lock(&rnp->kthread_mutex);
rcu_spawn_one_boost_kthread(rnp);
rcu_spawn_exp_par_gp_kworker(rnp);
@@ -4554,9 +4530,6 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
struct rcu_node *rnp;
struct task_struct *task_boost, *task_exp;
- if (!IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) && !IS_ENABLED(CONFIG_RCU_BOOST))
- return;
-
rdp = per_cpu_ptr(&rcu_data, cpu);
rnp = rdp->mynode;
@@ -5245,7 +5218,6 @@ void __init rcu_init(void)
/* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
- rcu_alloc_par_gp_wq();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */