From e2c73a6860bdf54f2c6bf8cddc34ddc91a1343e1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Sep 2021 14:18:51 -0700 Subject: rcu: Remove the RCU_FAST_NO_HZ Kconfig option All of the uses of CONFIG_RCU_FAST_NO_HZ=y that I have seen involve systems with RCU callbacks offloaded. In this situation, all that this Kconfig option does is slow down idle entry/exit with an additional always-taken early exit. If this is the only use case, then this Kconfig option is nothing but an attractive nuisance that needs to go away. This commit therefore removes the RCU_FAST_NO_HZ Kconfig option. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 4 ---- 1 file changed, 4 deletions(-) (limited to 'Documentation/admin-guide/kernel-parameters.txt') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9725c546a0d4..36b83a47e54a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4489,10 +4489,6 @@ on rcutree.qhimark at boot time and to zero to disable more aggressive help enlistment. - rcutree.rcu_idle_gp_delay= [KNL] - Set wakeup interval for idle CPUs that have - RCU callbacks (RCU_FAST_NO_HZ=y). - rcutree.rcu_kick_kthreads= [KNL] Cause the grace-period kthread to get an extra wake_up() if it sleeps three times longer than -- cgit v1.2.3-70-g09d2 From 82e310033d7c21a7a88427f14e0dad78d731a5cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Nov 2021 20:55:18 -0800 Subject: rcutorture: Enable multiple concurrent callback-flood kthreads This commit converts the rcutorture.fwd_progress module parameter from bool to int, so that it specifies the number of callback-flood kthreads. Values less than zero specify one kthread per CPU; however, the number of kthreads executing concurrently is limited to the number of online CPUs. This commit also reverses the order of the need-resched and callback-flood operations to cause the callback flooding to happen more nearly at the same time. Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 6 +- kernel/rcu/rcutorture.c | 114 ++++++++++++++++++------ 2 files changed, 91 insertions(+), 29 deletions(-) (limited to 'Documentation/admin-guide/kernel-parameters.txt') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9725c546a0d4..ba60da85d1c8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4603,8 +4603,12 @@ in seconds. rcutorture.fwd_progress= [KNL] - Enable RCU grace-period forward-progress testing + Specifies the number of kthreads to be used + for RCU grace-period forward-progress testing for the types of RCU supporting this notion. + Defaults to 1 kthread, values less than zero or + greater than the number of CPUs cause the number + of CPUs to be used.
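A quick restatement of the fwd_progress semantics above, as a standalone C sketch with invented names (not code from the patch; rcu_torture_fwd_prog_init() below applies the same policy to the module parameter):

	/* Effective number of callback-flood kthreads implied by
	 * rcutorture.fwd_progress; nr_cpus stands in for nr_cpu_ids. */
	static int effective_fwd_progress(int fwd_progress, int nr_cpus)
	{
		if (!fwd_progress)
			return 0;		/* forward-progress testing disabled */
		if (fwd_progress < 0 || fwd_progress >= nr_cpus)
			return nr_cpus;		/* one kthread per CPU */
		return fwd_progress;		/* use the requested count */
	}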
rcutorture.fwd_progress_div= [KNL] Specify the fraction of a CPU-stall-warning diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bc854f935548..9390f867ba6c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -79,7 +79,7 @@ torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); -torture_param(bool, fwd_progress, 1, "Test grace-period forward progress"); +torture_param(int, fwd_progress, 1, "Test grace-period forward progress"); torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait"); torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)"); @@ -146,7 +146,7 @@ static struct task_struct *stats_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; -static struct task_struct *fwd_prog_task; +static struct task_struct **fwd_prog_tasks; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; static struct task_struct *read_exit_task; @@ -2161,10 +2161,12 @@ struct rcu_fwd { unsigned long rcu_fwd_startat; struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; unsigned long rcu_launder_gp_seq_start; + int rcu_fwd_id; }; static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; +static unsigned long rcu_fwd_seq; static bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@ -2177,8 +2179,9 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) if (rfp->n_launders_hist[i].n_launders > 0) break; - pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rfp->rcu_fwd_startat); + mutex_lock(&rcu_fwd_mutex); // Serialize histograms. + pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):", + __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat); gps_old = rfp->rcu_launder_gp_seq_start; for (j = 0; j <= i; j++) { gps = rfp->n_launders_hist[j].launder_gp_seq; @@ -2189,6 +2192,7 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) gps_old = gps; } pr_cont("\n"); + mutex_unlock(&rcu_fwd_mutex); } /* Callback function for continuous-flood RCU callbacks. 
*/ @@ -2314,7 +2318,8 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); WARN_ON(!cver && gps < 2); - pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); + pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__, + rfp->rcu_fwd_id, dur, cver, gps); } if (selfpropcb) { WRITE_ONCE(fcs.stop, 1); @@ -2432,6 +2437,8 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + int i; + long ncbs; struct rcu_fwd *rfp; mutex_lock(&rcu_fwd_mutex); @@ -2442,18 +2449,26 @@ static int rcutorture_oom_notify(struct notifier_block *self, } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(rfp); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2); + for (i = 0; i < fwd_progress; i++) { + rcu_torture_fwd_cb_hist(&rfp[i]); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2); + } WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! 
*/ pr_info("%s returning after OOM processing.\n", __func__); @@ -2469,6 +2484,7 @@ static struct notifier_block rcutorture_oom_nb = { static int rcu_torture_fwd_prog(void *args) { int oldnice = task_nice(current); + unsigned long oldseq = READ_ONCE(rcu_fwd_seq); struct rcu_fwd *rfp = args; int tested = 0; int tested_tries = 0; @@ -2478,21 +2494,31 @@ static int rcu_torture_fwd_prog(void *args) if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); do { - schedule_timeout_interruptible(fwd_progress_holdoff * HZ); - WRITE_ONCE(rcu_fwd_emergency_stop, false); + if (!rfp->rcu_fwd_id) { + schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + WRITE_ONCE(rcu_fwd_emergency_stop, false); + WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); + } else { + while (READ_ONCE(rcu_fwd_seq) == oldseq) + schedule_timeout_interruptible(1); + oldseq = READ_ONCE(rcu_fwd_seq); + } + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); + if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id) + rcu_torture_fwd_prog_cr(rfp); if (!IS_ENABLED(CONFIG_TINY_RCU) || - rcu_inkernel_boot_has_ended()) + (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id)) rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); - if (rcu_inkernel_boot_has_ended()) - rcu_torture_fwd_prog_cr(rfp); /* Avoid slow periods, better to test when busy. */ if (stutter_wait("rcu_torture_fwd_prog")) sched_set_normal(current, oldnice); } while (!torture_must_stop()); /* Short runs might not contain a valid forward-progress attempt. */ - WARN_ON(!tested && tested_tries >= 5); - pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + if (!rfp->rcu_fwd_id) { + WARN_ON(!tested && tested_tries >= 5); + pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + } torture_kthread_stopping("rcu_torture_fwd_prog"); return 0; } @@ -2500,17 +2526,27 @@ static int rcu_torture_fwd_prog(void *args) /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { + int i; + int ret = 0; struct rcu_fwd *rfp; if (!fwd_progress) return 0; /* Not requested, so don't do it. */ + if (fwd_progress >= nr_cpu_ids) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n"); + fwd_progress = nr_cpu_ids; + } else if (fwd_progress < 0) { + fwd_progress = nr_cpu_ids; + } if ((!cur_ops->sync && !cur_ops->call) || !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); + fwd_progress = 0; return 0; } if (stall_cpu > 0) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); + fwd_progress = 0; if (IS_MODULE(CONFIG_RCU_TORTURE_TEST)) return -EINVAL; /* In module, can fail back to user. */ WARN_ON(1); /* Make sure rcutorture notices conflict. 
*/ @@ -2520,29 +2556,51 @@ static int __init rcu_torture_fwd_prog_init(void) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; - rfp = kzalloc(sizeof(*rfp), GFP_KERNEL); - if (!rfp) + rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL); + fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL); + if (!rfp || !fwd_prog_tasks) { + kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; + fwd_progress = 0; return -ENOMEM; - spin_lock_init(&rfp->rcu_fwd_lock); - rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + } + for (i = 0; i < fwd_progress; i++) { + spin_lock_init(&rfp[i].rcu_fwd_lock); + rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head; + rfp[i].rcu_fwd_id = i; + } mutex_lock(&rcu_fwd_mutex); rcu_fwds = rfp; mutex_unlock(&rcu_fwd_mutex); register_oom_notifier(&rcutorture_oom_nb); - return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); + for (i = 0; i < fwd_progress; i++) { + ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]); + if (ret) { + fwd_progress = i; + return ret; + } + } + return 0; } static void rcu_torture_fwd_prog_cleanup(void) { + int i; struct rcu_fwd *rfp; - torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); - rfp = rcu_fwds; + if (!rcu_fwds || !fwd_prog_tasks) + return; + for (i = 0; i < fwd_progress; i++) + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]); + unregister_oom_notifier(&rcutorture_oom_nb); mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; rcu_fwds = NULL; mutex_unlock(&rcu_fwd_mutex); - unregister_oom_notifier(&rcutorture_oom_nb); kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; } /* Callback function for RCU barrier testing. */ -- cgit v1.2.3-70-g09d2 From 8610b65680390a103b58f46282a1b05f7eebbba4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Nov 2021 07:33:40 -0800 Subject: rcu-tasks: Add rcupdate.rcu_task_enqueue_lim to set initial queueing This commit adds a rcupdate.rcu_task_enqueue_lim module parameter that sets the initial number of callback queues to use for the RCU Tasks family of RCU implementations. This parameter allows testing of various fanout values. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ kernel/rcu/tasks.h | 24 ++++++++++++++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) (limited to 'Documentation/admin-guide/kernel-parameters.txt') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9725c546a0d4..9b09fc5dfe66 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4805,6 +4805,13 @@ period to instead use normal non-expedited grace-period processing. + rcupdate.rcu_task_enqueue_lim= [KNL] + Set the number of callback queues to use for the + RCU Tasks family of RCU flavors. The default + of -1 allows this to be automatically (and + dynamically) adjusted. This parameter is intended + for use in testing. 
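To make the queue-count arithmetic concrete, here is a worked example assuming nr_cpu_ids is a power of two that the chosen limit divides evenly; cblist_init_generic() in the diff below performs the real computation:

	shift = ilog2(nr_cpu_ids / lim);	/* e.g. ilog2(64 / 8) = 3 */
	queue = smp_processor_id() >> shift;	/* CPUs 0-7 -> queue 0, CPUs 8-15 -> queue 1, ... */

So with 64 CPUs and rcupdate.rcu_task_enqueue_lim=8, CPUs 56-63 end up on queue 7 and exactly eight callback queues are in use, as requested.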
+ rcupdate.rcu_task_ipi_delay= [KNL] Set time in jiffies during which RCU tasks will avoid sending IPIs, starting with the beginning diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8e6601c5fd2e..9d3eddaecfde 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -130,6 +130,9 @@ module_param(rcu_task_ipi_delay, int, 0644); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +static int rcu_task_enqueue_lim __read_mostly = -1; +module_param(rcu_task_enqueue_lim, int, 0444); + /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 #define RTGS_WAIT_WAIT_CBS 1 @@ -192,10 +195,19 @@ static void cblist_init_generic(struct rcu_tasks *rtp) { int cpu; unsigned long flags; + int lim; raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); - rtp->percpu_enqueue_shift = ilog2(nr_cpu_ids); - rtp->percpu_enqueue_lim = 1; + if (rcu_task_enqueue_lim < 0) + rcu_task_enqueue_lim = nr_cpu_ids; + else if (rcu_task_enqueue_lim == 0) + rcu_task_enqueue_lim = 1; + lim = rcu_task_enqueue_lim; + + if (lim > nr_cpu_ids) + lim = nr_cpu_ids; + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids / lim)); + smp_store_release(&rtp->percpu_enqueue_lim, lim); for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); @@ -211,7 +223,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp) raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); - + pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); } // Enqueue a callback for the specified flavor of Tasks RCU. @@ -307,7 +319,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) unsigned long flags; int needgpcb = 0; - for (cpu = 0; cpu < rtp->percpu_enqueue_lim; cpu++) { + for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_enqueue_lim); cpu++) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ @@ -338,11 +350,11 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu cpu = rtpcp->cpu; cpunext = cpu * 2 + 1; - if (cpunext < rtp->percpu_enqueue_lim) { + if (cpunext < smp_load_acquire(&rtp->percpu_enqueue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); cpunext++; - if (cpunext < rtp->percpu_enqueue_lim) { + if (cpunext < smp_load_acquire(&rtp->percpu_enqueue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); } -- cgit v1.2.3-70-g09d2 From ab97152f88a4d580b89f0b7cc3028ffac438216f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 24 Nov 2021 15:12:15 -0800 Subject: rcu-tasks: Use more callback queues if contention encountered The rcupdate.rcu_task_enqueue_lim module parameter allows system administrators to tune the number of callback queues used by the RCU Tasks flavors. However if callback storms are infrequent, it would be better to operate with a single queue on a given system unless and until that system actually needed more queues. Systems not needing more queues can then avoid the overhead of checking the extra queues and especially avoid the overhead of fanning workqueue handlers out to all CPUs to invoke callbacks. This commit therefore switches to using all the CPUs' callback queues if call_rcu_tasks_generic() encounters too much lock contention. 
The amount of lock contention to tolerate defaults to 100 contended lock acquisitions per jiffy, and can be adjusted using the new rcupdate.rcu_task_contend_lim module parameter. Such switching is undertaken only if the rcupdate.rcu_task_enqueue_lim module parameter is negative, which is its default value (-1). This allows savvy systems administrators to set the number of queues to some known good value and to not have to worry about the kernel doing any second guessing. [ paulmck: Apply feedback from Guillaume Tucker and kernelci. ] Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ kernel/rcu/tasks.h | 27 +++++++++++++++++++++---- 2 files changed, 31 insertions(+), 4 deletions(-) (limited to 'Documentation/admin-guide/kernel-parameters.txt') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9b09fc5dfe66..089f4c5f8225 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4805,6 +4805,14 @@ period to instead use normal non-expedited grace-period processing. + rcupdate.rcu_task_contend_lim= [KNL] + Set the minimum number of callback-queuing-time + lock-contention events per jiffy required to + cause the RCU Tasks flavors to switch to per-CPU + callback queuing. This switching only occurs + when rcupdate.rcu_task_enqueue_lim is set to + the default value of -1. + rcupdate.rcu_task_enqueue_lim= [KNL] Set the number of callback queues to use for the RCU Tasks family of RCU flavors. The default diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6e8e4cf6fff3..d37ab69b9db8 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -142,6 +142,10 @@ module_param(rcu_task_stall_timeout, int, 0644); static int rcu_task_enqueue_lim __read_mostly = -1; module_param(rcu_task_enqueue_lim, int, 0444); +static bool rcu_task_cb_adjust; +static int rcu_task_contend_lim __read_mostly = 100; +module_param(rcu_task_contend_lim, int, 0444); + /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 #define RTGS_WAIT_WAIT_CBS 1 @@ -207,10 +211,13 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int lim; raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); - if (rcu_task_enqueue_lim < 0) - rcu_task_enqueue_lim = nr_cpu_ids; - else if (rcu_task_enqueue_lim == 0) + if (rcu_task_enqueue_lim < 0) { + rcu_task_enqueue_lim = 1; + rcu_task_cb_adjust = true; + pr_info("%s: Setting adjustable number of callback queues.\n", __func__); + } else if (rcu_task_enqueue_lim == 0) { rcu_task_enqueue_lim = 1; + } lim = rcu_task_enqueue_lim; if (lim > nr_cpu_ids) @@ -251,6 +258,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, { unsigned long flags; unsigned long j; + bool needadjust = false; bool needwake; struct rcu_tasks_percpu *rtpcp; @@ -266,7 +274,9 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rtpcp->rtp_jiffies = j; rtpcp->rtp_n_lock_retries = 0; } - rtpcp->rtp_n_lock_retries++; + if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim && + READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) + needadjust = true; // Defer adjustment to avoid deadlock. } if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) { raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. 
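The contention test added in the hunk above can be read in isolation as the following standalone C sketch (invented names; the patch itself keeps the per-CPU count in rtpcp->rtp_jiffies and rtpcp->rtp_n_lock_retries and gates the check with rcu_task_cb_adjust):

	#include <stdbool.h>

	#define RCU_TASK_CONTEND_LIM 100	/* default of rcupdate.rcu_task_contend_lim */

	struct contend_state {
		unsigned long jiffies_stamp;	/* jiffy in which failures are being counted */
		int n_lock_retries;		/* contended trylock failures seen in that jiffy */
	};

	/* Record one contended (failed) enqueue-time trylock at time @now and
	 * return true once more than the limit have occurred within a single
	 * jiffy, the condition under which the patch requests per-CPU queuing. */
	static bool note_contention(struct contend_state *cs, unsigned long now)
	{
		if (cs->jiffies_stamp != now) {
			cs->jiffies_stamp = now;	/* new jiffy: restart the count */
			cs->n_lock_retries = 0;
		}
		return ++cs->n_lock_retries > RCU_TASK_CONTEND_LIM;
	}

Resetting the count whenever the jiffy changes is what makes rcu_task_contend_lim a per-jiffy rate rather than a cumulative total.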
@@ -276,6 +286,15 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, needwake = rcu_segcblist_empty(&rtpcp->cblist); rcu_segcblist_enqueue(&rtpcp->cblist, rhp); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + if (unlikely(needadjust)) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim != nr_cpu_ids) { + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); + pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } /* We can't create the thread unless interrupts are enabled. */ if (needwake && READ_ONCE(rtp->kthread_ptr)) irq_work_queue(&rtpcp->rtp_irq_work); -- cgit v1.2.3-70-g09d2 From fd796e4139b481733a701c4d406056538f4c73cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Nov 2021 16:52:31 -0800 Subject: rcu-tasks: Use fewer callbacks queues if callback flood ends By default, when lock contention is encountered, the RCU Tasks flavors of RCU switch to using per-CPU queueing. However, if the callback flood ends, per-CPU queueing continues to be used, which introduces significant additional overhead, especially for callback invocation, which fans out a series of workqueue handlers. This commit therefore switches back to single-queue operation if at the beginning of a grace period there are very few callbacks. The definition of "very few" is set by the rcupdate.rcu_task_collapse_lim module parameter, which defaults to 10. This switch happens in two phases, with the first phase causing future callbacks to be enqueued on CPU 0's queue, but with all queues continuing to be checked for grace periods and callback invocation. The second phase checks to see if an RCU grace period has elapsed and if all remaining RCU-Tasks callbacks are queued on CPU 0. If so, only CPU 0 is checked for future grace periods and callback operation. Of course, the return of contention anywhere during this process will result in returning to per-CPU callback queueing. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 8 +++++ kernel/rcu/tasks.h | 48 +++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 2 deletions(-) (limited to 'Documentation/admin-guide/kernel-parameters.txt') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 089f4c5f8225..d1b0542b8564 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4805,6 +4805,14 @@ period to instead use normal non-expedited grace-period processing. + rcupdate.rcu_task_collapse_lim= [KNL] + Set the maximum number of callbacks present + at the beginning of a grace period that allows + the RCU Tasks flavors to collapse back to using + a single callback queue. This switching only + occurs when rcupdate.rcu_task_enqueue_lim is + set to the default value of -1. + rcupdate.rcu_task_contend_lim= [KNL] Set the minimum number of callback-queuing-time lock-contention events per jiffy required to diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b4a2cab6985a..84f1d91604cc 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -68,6 +68,7 @@ struct rcu_tasks_percpu { * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. 
* @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing. + * @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers. * @barrier_q_mutex: Serialize barrier operations. * @barrier_q_count: Number of queues being waited on. * @barrier_q_completion: Barrier wait/wakeup mechanism. @@ -98,6 +99,7 @@ struct rcu_tasks { int percpu_enqueue_shift; int percpu_enqueue_lim; int percpu_dequeue_lim; + unsigned long percpu_dequeue_gpseq; struct mutex barrier_q_mutex; atomic_t barrier_q_count; struct completion barrier_q_completion; @@ -148,6 +150,8 @@ module_param(rcu_task_enqueue_lim, int, 0444); static bool rcu_task_cb_adjust; static int rcu_task_contend_lim __read_mostly = 100; module_param(rcu_task_contend_lim, int, 0444); +static int rcu_task_collapse_lim __read_mostly = 10; +module_param(rcu_task_collapse_lim, int, 0444); /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 @@ -269,6 +273,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rhp->next = NULL; rhp->func = func; local_irq_save(flags); + rcu_read_lock(); rtpcp = per_cpu_ptr(rtp->rtpcpu, smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. @@ -294,12 +299,13 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim != nr_cpu_ids) { WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); - WRITE_ONCE(rtp->percpu_enqueue_lim, nr_cpu_ids); + WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); } + rcu_read_unlock(); /* We can't create the thread unless interrupts are enabled. */ if (needwake && READ_ONCE(rtp->kthread_ptr)) irq_work_queue(&rtpcp->rtp_irq_work); @@ -369,15 +375,25 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) { int cpu; unsigned long flags; + long n; + long ncbs = 0; + long ncbsnz = 0; int needgpcb = 0; for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ - if (rcu_segcblist_empty(&rtpcp->cblist)) + if (!rcu_segcblist_n_cbs(&rtpcp->cblist)) continue; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + // Should we shrink down to a single callback queue? + n = rcu_segcblist_n_cbs(&rtpcp->cblist); + if (n) { + ncbs += n; + if (cpu > 0) + ncbsnz += n; + } rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); if (rcu_segcblist_pend_cbs(&rtpcp->cblist)) @@ -386,6 +402,34 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) needgpcb |= 0x1; raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } + + // Shrink down to a single callback queue if appropriate. + // This is done in two stages: (1) If there are no more than + // rcu_task_collapse_lim callbacks on CPU 0 and none on any other + // CPU, limit enqueueing to CPU 0. (2) After an RCU grace period, + // if there has not been an increase in callbacks, limit dequeuing + // to CPU 0. Note the matching RCU read-side critical section in + // call_rcu_tasks_generic(). 
+ if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim > 1) { + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + smp_store_release(&rtp->percpu_enqueue_lim, 1); + rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); + pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + if (rcu_task_cb_adjust && !ncbsnz && + poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) { + WRITE_ONCE(rtp->percpu_dequeue_lim, 1); + pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + return needgpcb; } -- cgit v1.2.3-70-g09d2 From d2cf0854d728c42524efc169edb3505de8c1a9dc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Nov 2021 01:37:07 +0100 Subject: rcu/nocb: Allow empty "rcu_nocbs" kernel parameter Allow the rcu_nocbs kernel parameter to be specified just by itself, without specifying any CPUs. This allows systems administrators to use "rcu_nocbs" to specify that none of the CPUs are to be offloaded at boot time, but that any of them may be offloaded at runtime via cpusets. In contrast, if the "rcu_nocbs" or "nohz_full" kernel parameters are not specified at all, then not only are none of the CPUs offloaded at boot, but none of them can be offloaded at runtime, either. While in the area, modernize the description of the "rcuo" kthreads' naming scheme. Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Josh Triplett Cc: Joel Fernandes Tested-by: Juri Lelli Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 37 ++++++++++++++++--------- kernel/rcu/tree_nocb.h | 10 ++++--- 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'Documentation/admin-guide/kernel-parameters.txt') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9725c546a0d4..8b5da10f932e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4343,19 +4343,30 @@ Disable the Correctable Errors Collector, see CONFIG_RAS_CEC help text. - rcu_nocbs= [KNL] - The argument is a cpu list, as described above. - - In kernels built with CONFIG_RCU_NOCB_CPU=y, set - the specified list of CPUs to be no-callback CPUs. - Invocation of these CPUs' RCU callbacks will be - offloaded to "rcuox/N" kthreads created for that - purpose, where "x" is "p" for RCU-preempt, and - "s" for RCU-sched, and "N" is the CPU number. - This reduces OS jitter on the offloaded CPUs, - which can be useful for HPC and real-time - workloads. It can also improve energy efficiency - for asymmetric multiprocessors. + rcu_nocbs[=cpu-list] + [KNL] The optional argument is a cpu list, + as described above. + + In kernels built with CONFIG_RCU_NOCB_CPU=y, + enable the no-callback CPU mode, which prevents + such CPUs' callbacks from being invoked in + softirq context. Invocation of such CPUs' RCU + callbacks will instead be offloaded to "rcuox/N" + kthreads created for that purpose, where "x" is + "p" for RCU-preempt, "s" for RCU-sched, and "g" + for the kthreads that mediate grace periods; and + "N" is the CPU number.
This reduces OS jitter on + the offloaded CPUs, which can be useful for HPC + and real-time workloads. It can also improve + energy efficiency for asymmetric multiprocessors. + + If a cpulist is passed as an argument, the specified + list of CPUs is set to no-callback mode from boot. + + Otherwise, if the '=' sign and the cpulist + arguments are omitted, no CPU will be set to + no-callback mode from boot but the mode may be + toggled at runtime via cpusets. rcu_nocb_poll [KNL] Rather than requiring that offloaded CPUs diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index f580a6b2e74e..0c1802ce4764 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -66,14 +66,16 @@ static bool rcu_nocb_is_setup; static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - if (cpulist_parse(str, rcu_nocb_mask)) { - pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); - cpumask_setall(rcu_nocb_mask); + if (*str == '=') { + if (cpulist_parse(++str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } } rcu_nocb_is_setup = true; return 1; } -__setup("rcu_nocbs=", rcu_nocb_setup); +__setup("rcu_nocbs", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { -- cgit v1.2.3-70-g09d2
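Taken together, the rcu-tasks patches above leave the number of callback queues self-adjusting whenever rcupdate.rcu_task_enqueue_lim keeps its default of -1. A compressed restatement of that policy, as an illustrative C enum rather than code from any of the patches:

	/* Adaptive queue-count policy of the RCU Tasks flavors
	 * (names invented here for illustration only). */
	enum tasks_rcu_queue_mode {
		QUEUE_SINGLE,		/* boot default: enqueue and dequeue on CPU 0 only */
		QUEUE_PERCPU,		/* more than rcu_task_contend_lim contended enqueues
					 * in one jiffy: spread enqueue and dequeue across
					 * all CPUs' queues */
		QUEUE_COLLAPSING,	/* at most rcu_task_collapse_lim callbacks remain:
					 * enqueue on CPU 0 again but keep draining every
					 * queue until an RCU grace period has elapsed and
					 * only CPU 0 still holds callbacks, then return to
					 * QUEUE_SINGLE; renewed contention at any point
					 * switches back to QUEUE_PERCPU */
	};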