summaryrefslogtreecommitdiff
path: root/kernel/sched/rt.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-10-30 13:12:15 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-10-30 13:12:15 -1000
commit63ce50fff9240d66cf3b59663f458f55ba6dcfcc (patch)
treec2613289116d6f80a50f534e9c5e4afc26064574 /kernel/sched/rt.c
parent3cf3fabccb9dc821ffaec3ad6bf0cd6b278bd012 (diff)
parent984ffb6a4366752c949f7b39640aecdce222607f (diff)
Merge tag 'sched-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "Fair scheduler (SCHED_OTHER) improvements: - Remove the old and now unused SIS_PROP code & option - Scan cluster before LLC in the wake-up path - Use candidate prev/recent_used CPU if scanning failed for cluster wakeup NUMA scheduling improvements: - Improve the VMA access-PID code to better skip/scan VMAs - Extend tracing to cover VMA-skipping decisions - Improve/fix the recently introduced sched_numa_find_nth_cpu() code - Generalize numa_map_to_online_node() Energy scheduling improvements: - Remove the EM_MAX_COMPLEXITY limit - Add tracepoints to track energy computation - Make the behavior of the 'sched_energy_aware' sysctl more consistent - Consolidate and clean up access to a CPU's max compute capacity - Fix uclamp code corner cases RT scheduling improvements: - Drive dl_rq->overloaded with dl_rq->pushable_dl_tasks updates - Drive the ->rto_mask with rt_rq->pushable_tasks updates Scheduler scalability improvements: - Rate-limit updates to tg->load_avg - On x86 disable IBRS when CPU is offline to improve single-threaded performance - Micro-optimize in_task() and in_interrupt() - Micro-optimize the PSI code - Avoid updating PSI triggers and ->rtpoll_total when there are no state changes Core scheduler infrastructure improvements: - Use saved_state to reduce some spurious freezer wakeups - Bring in a handful of fast-headers improvements to scheduler headers - Make the scheduler UAPI headers more widely usable by user-space - Simplify the control flow of scheduler syscalls by using lock guards - Fix sched_setaffinity() vs. CPU hotplug race Scheduler debuggability improvements: - Disallow writing invalid values to sched_rt_period_us - Fix a race in the rq-clock debugging code triggering warnings - Fix a warning in the bandwidth distribution code - Micro-optimize in_atomic_preempt_off() checks - Enforce that the tasklist_lock is held in for_each_thread() - Print the TGID in sched_show_task() - Remove the /proc/sys/kernel/sched_child_runs_first sysctl ... and misc cleanups & fixes" * tag 'sched-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (82 commits) sched/fair: Remove SIS_PROP sched/fair: Use candidate prev/recent_used CPU if scanning failed for cluster wakeup sched/fair: Scan cluster before scanning LLC in wake-up path sched: Add cpus_share_resources API sched/core: Fix RQCF_ACT_SKIP leak sched/fair: Remove unused 'curr' argument from pick_next_entity() sched/nohz: Update comments about NEWILB_KICK sched/fair: Remove duplicate #include sched/psi: Update poll => rtpoll in relevant comments sched: Make PELT acronym definition searchable sched: Fix stop_one_cpu_nowait() vs hotplug sched/psi: Bail out early from irq time accounting sched/topology: Rename 'DIE' domain to 'PKG' sched/psi: Delete the 'update_total' function parameter from update_triggers() sched/psi: Avoid updating PSI triggers and ->rtpoll_total when there are no state changes sched/headers: Remove comment referring to rq::cpu_load, since this has been removed sched/numa: Complete scanning of inactive VMAs when there is no alternative sched/numa: Complete scanning of partial VMAs regardless of PID activity sched/numa: Move up the access pid reset logic sched/numa: Trace decisions related to skipping VMAs ...
Diffstat (limited to 'kernel/sched/rt.c')
-rw-r--r--kernel/sched/rt.c95
1 files changed, 25 insertions, 70 deletions
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0597ba0f85ff..6aaf0a3d6081 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -16,7 +16,7 @@ struct rt_bandwidth def_rt_bandwidth;
* period over which we measure -rt task CPU usage in us.
* default: 1s
*/
-unsigned int sysctl_sched_rt_period = 1000000;
+int sysctl_sched_rt_period = 1000000;
/*
* part of the period that we allow rt tasks to run in us.
@@ -34,9 +34,11 @@ static struct ctl_table sched_rt_sysctls[] = {
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
- .maxlen = sizeof(unsigned int),
+ .maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "sched_rt_runtime_us",
@@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_NEG_ONE,
+ .extra2 = (void *)&sysctl_sched_rt_period,
},
{
.procname = "sched_rr_timeslice_ms",
@@ -143,7 +147,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
- rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
@@ -358,53 +361,6 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
-static void update_rt_migration(struct rt_rq *rt_rq)
-{
- if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
- if (!rt_rq->overloaded) {
- rt_set_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 1;
- }
- } else if (rt_rq->overloaded) {
- rt_clear_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 0;
- }
-}
-
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total++;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory++;
-
- update_rt_migration(rt_rq);
-}
-
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total--;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory--;
-
- update_rt_migration(rt_rq);
-}
-
static inline int has_pushable_tasks(struct rq *rq)
{
return !plist_head_empty(&rq->rt.pushable_tasks);
@@ -438,6 +394,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
/* Update the highest prio pushable task */
if (p->prio < rq->rt.highest_prio.next)
rq->rt.highest_prio.next = p->prio;
+
+ if (!rq->rt.overloaded) {
+ rt_set_overload(rq);
+ rq->rt.overloaded = 1;
+ }
}
static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -451,6 +412,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
rq->rt.highest_prio.next = p->prio;
} else {
rq->rt.highest_prio.next = MAX_RT_PRIO-1;
+
+ if (rq->rt.overloaded) {
+ rt_clear_overload(rq);
+ rq->rt.overloaded = 0;
+ }
}
}
@@ -464,16 +430,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}
-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
static inline void rt_queue_push_tasks(struct rq *rq)
{
}
@@ -515,7 +471,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
min_cap = uclamp_eff_value(p, UCLAMP_MIN);
max_cap = uclamp_eff_value(p, UCLAMP_MAX);
- cpu_cap = capacity_orig_of(cpu);
+ cpu_cap = arch_scale_cpu_capacity(cpu);
return cpu_cap >= min(min_cap, max_cap);
}
@@ -953,7 +909,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
/*
* When we're idle and a woken (rt) task is
- * throttled check_preempt_curr() will set
+ * throttled wakeup_preempt() will set
* skip_update and the time between the wakeup
* and this unthrottle will get accounted as
* 'runtime'.
@@ -1281,7 +1237,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
- inc_rt_migration(rt_se, rt_rq);
inc_rt_group(rt_se, rt_rq);
}
@@ -1294,7 +1249,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
- dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}
@@ -1715,7 +1669,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->prio < rq->curr->prio) {
resched_curr(rq);
@@ -2109,9 +2063,11 @@ retry:
*/
push_task = get_push_task(rq);
if (push_task) {
+ preempt_disable();
raw_spin_rq_unlock(rq);
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
push_task, &rq->push_work);
+ preempt_enable();
raw_spin_rq_lock(rq);
}
@@ -2448,9 +2404,11 @@ skip:
double_unlock_balance(this_rq, src_rq);
if (push_task) {
+ preempt_disable();
raw_spin_rq_unlock(this_rq);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
+ preempt_enable();
raw_spin_rq_lock(this_rq);
}
}
@@ -2702,7 +2660,7 @@ DEFINE_SCHED_CLASS(rt) = {
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
- .check_preempt_curr = check_preempt_curr_rt,
+ .wakeup_preempt = wakeup_preempt_rt,
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
@@ -2985,9 +2943,6 @@ static int sched_rt_global_constraints(void)
#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
{
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
((u64)sysctl_sched_rt_runtime *
@@ -3018,7 +2973,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
ret = sched_rt_global_validate();