diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-08-28 16:43:39 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-08-28 16:43:39 -0700 |
commit | 3ca9a836ff53db8eb76d559764c07fb3b015886a (patch) | |
tree | edbd56fb3069895e1d8306df53aaa3d742a9bec8 /kernel/sched/core.c | |
parent | 1a7c611546e552193180941ecf6b191e659db979 (diff) | |
parent | 2f88c8e802c8b128a155976631f4eb2ce4f3c805 (diff) |
Merge tag 'sched-core-2023-08-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- The biggest change is introduction of a new iteration of the
SCHED_FAIR interactivity code: the EEVDF ("Earliest Eligible Virtual
Deadline First") scheduler
EEVDF too is a virtual-time scheduler, with two parameters (weight
and relative deadline), compared to CFS that had weight only. It
completely reworks the base scheduler: placement, preemption, picking
-- everything
LWN.net, as usual, has a terrific writeup about EEVDF:
https://lwn.net/Articles/925371/
Preemption (both tick and wakeup) is driven by testing against a
fresh pick. Because the tree is now effectively an interval tree, and
the selection is no longer the 'leftmost' task, over-scheduling is
less of a problem. A lot of the CFS heuristics are removed or
replaced by more natural latency-space parameters & constructs
In terms of expected performance regressions: we will and can fix
everything where a 'good' workload misbehaves with the new scheduler,
but EEVDF inevitably changes workload scheduling in a binary fashion,
hopefully for the better in the overwhelming majority of cases, but
in some cases it won't, especially in adversarial loads that got
lucky with the previous code, such as some variants of hackbench. We
are trying hard to err on the side of fixing all performance
regressions, but we expect some inevitable post-release iterations of
that process
- Improve load-balancing on hybrid x86 systems: enable cluster
scheduling (again)
- Improve & fix bandwidth-scheduling on nohz systems
- Improve bandwidth-throttling
- Use lock guards to simplify and de-goto-ify control flow
- Misc improvements, cleanups and fixes
* tag 'sched-core-2023-08-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (43 commits)
sched/eevdf/doc: Modify the documented knob to base_slice_ns as well
sched/eevdf: Curb wakeup-preemption
sched: Simplify sched_core_cpu_{starting,deactivate}()
sched: Simplify try_steal_cookie()
sched: Simplify sched_tick_remote()
sched: Simplify sched_exec()
sched: Simplify ttwu()
sched: Simplify wake_up_if_idle()
sched: Simplify: migrate_swap_stop()
sched: Simplify sysctl_sched_uclamp_handler()
sched: Simplify get_nohz_timer_target()
sched/rt: sysctl_sched_rr_timeslice show default timeslice after reset
sched/rt: Fix sysctl_sched_rr_timeslice intial value
sched/fair: Block nohz tick_stop when cfs bandwidth in use
sched, cgroup: Restore meaning to hierarchical_quota
MAINTAINERS: Add Peter explicitly to the psi section
sched/psi: Select KERNFS as needed
sched/topology: Align group flags when removing degenerate domain
sched/fair: remove util_est boosting
sched/fair: Propagate enqueue flags into place_entity()
...
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 496 |
1 files changed, 272 insertions, 224 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d63e063608a..2299a5cfbfb9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1097,25 +1097,22 @@ int get_nohz_timer_target(void) hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); - rcu_read_lock(); + guard(rcu)(); + for_each_domain(cpu, sd) { for_each_cpu_and(i, sched_domain_span(sd), hk_mask) { if (cpu == i) continue; - if (!idle_cpu(i)) { - cpu = i; - goto unlock; - } + if (!idle_cpu(i)) + return i; } } if (default_cpu == -1) default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); - cpu = default_cpu; -unlock: - rcu_read_unlock(); - return cpu; + + return default_cpu; } /* @@ -1194,6 +1191,20 @@ static void nohz_csd_func(void *info) #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL +static inline bool __need_bw_check(struct rq *rq, struct task_struct *p) +{ + if (rq->nr_running != 1) + return false; + + if (p->sched_class != &fair_sched_class) + return false; + + if (!task_on_rq_queued(p)) + return false; + + return true; +} + bool sched_can_stop_tick(struct rq *rq) { int fifo_nr_running; @@ -1229,6 +1240,18 @@ bool sched_can_stop_tick(struct rq *rq) if (rq->nr_running > 1) return false; + /* + * If there is one task and it has CFS runtime bandwidth constraints + * and it's on the cpu now we don't want to stop the tick. + * This check prevents clearing the bit if a newly enqueued task here is + * dequeued by migrating while the constrained task continues to run. + * E.g. going from 2->1 without going through pick_next_task(). + */ + if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { + if (cfs_task_bw_constrained(rq->curr)) + return false; + } + return true; } #endif /* CONFIG_NO_HZ_FULL */ @@ -1804,7 +1827,8 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, int old_min, old_max, old_min_rt; int result; - mutex_lock(&uclamp_mutex); + guard(mutex)(&uclamp_mutex); + old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; old_min_rt = sysctl_sched_uclamp_util_min_rt_default; @@ -1813,7 +1837,7 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, if (result) goto undo; if (!write) - goto done; + return 0; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || @@ -1849,16 +1873,12 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, * Otherwise, keep it simple and do just a lazy update at each next * task enqueue time. */ - - goto done; + return 0; undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; sysctl_sched_uclamp_util_min_rt_default = old_min_rt; -done: - mutex_unlock(&uclamp_mutex); - return result; } #endif @@ -3413,7 +3433,6 @@ static int migrate_swap_stop(void *data) { struct migration_swap_arg *arg = data; struct rq *src_rq, *dst_rq; - int ret = -EAGAIN; if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) return -EAGAIN; @@ -3421,33 +3440,25 @@ static int migrate_swap_stop(void *data) src_rq = cpu_rq(arg->src_cpu); dst_rq = cpu_rq(arg->dst_cpu); - double_raw_lock(&arg->src_task->pi_lock, - &arg->dst_task->pi_lock); - double_rq_lock(src_rq, dst_rq); + guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock); + guard(double_rq_lock)(src_rq, dst_rq); if (task_cpu(arg->dst_task) != arg->dst_cpu) - goto unlock; + return -EAGAIN; if (task_cpu(arg->src_task) != arg->src_cpu) - goto unlock; + return -EAGAIN; if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) - goto unlock; + return -EAGAIN; if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) - goto unlock; + return -EAGAIN; __migrate_swap_task(arg->src_task, arg->dst_cpu); __migrate_swap_task(arg->dst_task, arg->src_cpu); - ret = 0; - -unlock: - double_rq_unlock(src_rq, dst_rq); - raw_spin_unlock(&arg->dst_task->pi_lock); - raw_spin_unlock(&arg->src_task->pi_lock); - - return ret; + return 0; } /* @@ -3722,14 +3733,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) struct sched_domain *sd; __schedstat_inc(p->stats.nr_wakeups_remote); - rcu_read_lock(); + + guard(rcu)(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { __schedstat_inc(sd->ttwu_wake_remote); break; } } - rcu_read_unlock(); } if (wake_flags & WF_MIGRATED) @@ -3928,21 +3939,13 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags void wake_up_if_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; - - rcu_read_lock(); - if (!is_idle_task(rcu_dereference(rq->curr))) - goto out; - - rq_lock_irqsave(rq, &rf); - if (is_idle_task(rq->curr)) - resched_curr(rq); - /* Else CPU is not idle, do nothing here: */ - rq_unlock_irqrestore(rq, &rf); - -out: - rcu_read_unlock(); + guard(rcu)(); + if (is_idle_task(rcu_dereference(rq->curr))) { + guard(rq_lock_irqsave)(rq); + if (is_idle_task(rq->curr)) + resched_curr(rq); + } } bool cpus_share_cache(int this_cpu, int that_cpu) @@ -4195,10 +4198,9 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) */ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - unsigned long flags; + guard(preempt)(); int cpu, success = 0; - preempt_disable(); if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -4225,129 +4227,127 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * reordered with p->state check below. This pairs with smp_store_mb() * in set_current_state() that the waiting thread does. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - smp_mb__after_spinlock(); - if (!ttwu_state_match(p, state, &success)) - goto unlock; + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { + smp_mb__after_spinlock(); + if (!ttwu_state_match(p, state, &success)) + break; - trace_sched_waking(p); + trace_sched_waking(p); - /* - * Ensure we load p->on_rq _after_ p->state, otherwise it would - * be possible to, falsely, observe p->on_rq == 0 and get stuck - * in smp_cond_load_acquire() below. - * - * sched_ttwu_pending() try_to_wake_up() - * STORE p->on_rq = 1 LOAD p->state - * UNLOCK rq->lock - * - * __schedule() (switch to task 'p') - * LOCK rq->lock smp_rmb(); - * smp_mb__after_spinlock(); - * UNLOCK rq->lock - * - * [task p] - * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq - * - * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in - * __schedule(). See the comment for smp_mb__after_spinlock(). - * - * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). - */ - smp_rmb(); - if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) - goto unlock; + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * STORE p->on_rq = 1 LOAD p->state + * UNLOCK rq->lock + * + * __schedule() (switch to task 'p') + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * UNLOCK rq->lock + * + * [task p] + * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq + * + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). + */ + smp_rmb(); + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) + break; #ifdef CONFIG_SMP - /* - * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be - * possible to, falsely, observe p->on_cpu == 0. - * - * One must be running (->on_cpu == 1) in order to remove oneself - * from the runqueue. - * - * __schedule() (switch to task 'p') try_to_wake_up() - * STORE p->on_cpu = 1 LOAD p->on_rq - * UNLOCK rq->lock - * - * __schedule() (put 'p' to sleep) - * LOCK rq->lock smp_rmb(); - * smp_mb__after_spinlock(); - * STORE p->on_rq = 0 LOAD p->on_cpu - * - * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in - * __schedule(). See the comment for smp_mb__after_spinlock(). - * - * Form a control-dep-acquire with p->on_rq == 0 above, to ensure - * schedule()'s deactivate_task() has 'happened' and p will no longer - * care about it's own p->state. See the comment in __schedule(). - */ - smp_acquire__after_ctrl_dep(); + /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * __schedule() (switch to task 'p') try_to_wake_up() + * STORE p->on_cpu = 1 LOAD p->on_rq + * UNLOCK rq->lock + * + * __schedule() (put 'p' to sleep) + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * STORE p->on_rq = 0 LOAD p->on_cpu + * + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). + */ + smp_acquire__after_ctrl_dep(); - /* - * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq - * == 0), which means we need to do an enqueue, change p->state to - * TASK_WAKING such that we can unlock p->pi_lock before doing the - * enqueue, such as ttwu_queue_wakelist(). - */ - WRITE_ONCE(p->__state, TASK_WAKING); + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + WRITE_ONCE(p->__state, TASK_WAKING); - /* - * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, considering queueing p on the remote CPUs wake_list - * which potentially sends an IPI instead of spinning on p->on_cpu to - * let the waker make forward progress. This is safe because IRQs are - * disabled and the IPI will deliver after on_cpu is cleared. - * - * Ensure we load task_cpu(p) after p->on_cpu: - * - * set_task_cpu(p, cpu); - * STORE p->cpu = @cpu - * __schedule() (switch to task 'p') - * LOCK rq->lock - * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) - * STORE p->on_cpu = 1 LOAD p->cpu - * - * to ensure we observe the correct CPU on which the task is currently - * scheduling. - */ - if (smp_load_acquire(&p->on_cpu) && - ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) - goto unlock; + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. + */ + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) + break; - /* - * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, wait until it's done referencing the task. - * - * Pairs with the smp_store_release() in finish_task(). - * - * This ensures that tasks getting woken will be fully ordered against - * their previous state and preserve Program Order. - */ - smp_cond_load_acquire(&p->on_cpu, !VAL); + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, wait until it's done referencing the task. + * + * Pairs with the smp_store_release() in finish_task(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. + */ + smp_cond_load_acquire(&p->on_cpu, !VAL); - cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); - if (task_cpu(p) != cpu) { - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } + cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); + if (task_cpu(p) != cpu) { + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } - wake_flags |= WF_MIGRATED; - psi_ttwu_dequeue(p); - set_task_cpu(p, cpu); - } + wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); + set_task_cpu(p, cpu); + } #else - cpu = task_cpu(p); + cpu = task_cpu(p); #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu, wake_flags); -unlock: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + ttwu_queue(p, cpu, wake_flags); + } out: if (success) ttwu_stat(p, task_cpu(p), wake_flags); - preempt_enable(); return success; } @@ -4500,6 +4500,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + p->se.vlag = 0; + p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -5495,23 +5497,20 @@ unsigned int nr_iowait(void) void sched_exec(void) { struct task_struct *p = current; - unsigned long flags; + struct migration_arg arg; int dest_cpu; - raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); - if (dest_cpu == smp_processor_id()) - goto unlock; + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); + if (dest_cpu == smp_processor_id()) + return; - if (likely(cpu_active(dest_cpu))) { - struct migration_arg arg = { p, dest_cpu }; + if (unlikely(!cpu_active(dest_cpu))) + return; - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); - return; + arg = (struct migration_arg){ p, dest_cpu }; } -unlock: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } #endif @@ -5721,9 +5720,6 @@ static void sched_tick_remote(struct work_struct *work) struct tick_work *twork = container_of(dwork, struct tick_work, work); int cpu = twork->cpu; struct rq *rq = cpu_rq(cpu); - struct task_struct *curr; - struct rq_flags rf; - u64 delta; int os; /* @@ -5733,30 +5729,26 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (!tick_nohz_tick_stopped_cpu(cpu)) - goto out_requeue; + if (tick_nohz_tick_stopped_cpu(cpu)) { + guard(rq_lock_irq)(rq); + struct task_struct *curr = rq->curr; - rq_lock_irq(rq, &rf); - curr = rq->curr; - if (cpu_is_offline(cpu)) - goto out_unlock; + if (cpu_online(cpu)) { + update_rq_clock(rq); - update_rq_clock(rq); + if (!is_idle_task(curr)) { + /* + * Make sure the next tick runs within a + * reasonable amount of time. + */ + u64 delta = rq_clock_task(rq) - curr->se.exec_start; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } + curr->sched_class->task_tick(rq, curr, 0); - if (!is_idle_task(curr)) { - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - delta = rq_clock_task(rq) - curr->se.exec_start; - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + calc_load_nohz_remote(rq); + } } - curr->sched_class->task_tick(rq, curr, 0); - - calc_load_nohz_remote(rq); -out_unlock: - rq_unlock_irq(rq, &rf); -out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary @@ -6305,19 +6297,19 @@ static bool try_steal_cookie(int this, int that) unsigned long cookie; bool success = false; - local_irq_disable(); - double_rq_lock(dst, src); + guard(irq)(); + guard(double_rq_lock)(dst, src); cookie = dst->core->core_cookie; if (!cookie) - goto unlock; + return false; if (dst->curr != dst->idle) - goto unlock; + return false; p = sched_core_find(src, cookie); if (!p) - goto unlock; + return false; do { if (p == src->core_pick || p == src->curr) @@ -6329,9 +6321,10 @@ static bool try_steal_cookie(int this, int that) if (p->core_occupation > dst->idle->core_occupation) goto next; /* - * sched_core_find() and sched_core_next() will ensure that task @p - * is not throttled now, we also need to check whether the runqueue - * of the destination CPU is being throttled. + * sched_core_find() and sched_core_next() will ensure + * that task @p is not throttled now, we also need to + * check whether the runqueue of the destination CPU is + * being throttled. */ if (sched_task_is_throttled(p, this)) goto next; @@ -6349,10 +6342,6 @@ next: p = sched_core_next(p, cookie); } while (p); -unlock: - double_rq_unlock(dst, src); - local_irq_enable(); - return success; } @@ -6410,20 +6399,24 @@ static void queue_core_balance(struct rq *rq) queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); } +DEFINE_LOCK_GUARD_1(core_lock, int, + sched_core_lock(*_T->lock, &_T->flags), + sched_core_unlock(*_T->lock, &_T->flags), + unsigned long flags) + static void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); struct rq *rq = cpu_rq(cpu), *core_rq = NULL; - unsigned long flags; int t; - sched_core_lock(cpu, &flags); + guard(core_lock)(&cpu); WARN_ON_ONCE(rq->core != rq); /* if we're the first, we'll be our own leader */ if (cpumask_weight(smt_mask) == 1) - goto unlock; + return; /* find the leader */ for_each_cpu(t, smt_mask) { @@ -6437,7 +6430,7 @@ static void sched_core_cpu_starting(unsigned int cpu) } if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ - goto unlock; + return; /* install and validate core_rq */ for_each_cpu(t, smt_mask) { @@ -6448,29 +6441,25 @@ static void sched_core_cpu_starting(unsigned int cpu) WARN_ON_ONCE(rq->core != core_rq); } - -unlock: - sched_core_unlock(cpu, &flags); } static void sched_core_cpu_deactivate(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); struct rq *rq = cpu_rq(cpu), *core_rq = NULL; - unsigned long flags; int t; - sched_core_lock(cpu, &flags); + guard(core_lock)(&cpu); /* if we're the last man standing, nothing to do */ if (cpumask_weight(smt_mask) == 1) { WARN_ON_ONCE(rq->core != rq); - goto unlock; + return; } /* if we're not the leader, nothing to do */ if (rq->core != rq) - goto unlock; + return; /* find a new leader */ for_each_cpu(t, smt_mask) { @@ -6481,7 +6470,7 @@ static void sched_core_cpu_deactivate(unsigned int cpu) } if (WARN_ON_ONCE(!core_rq)) /* impossible */ - goto unlock; + return; /* copy the shared state to the new leader */ core_rq->core_task_seq = rq->core_task_seq; @@ -6503,9 +6492,6 @@ static void sched_core_cpu_deactivate(unsigned int cpu) rq = cpu_rq(t); rq->core = core_rq; } - -unlock: - sched_core_unlock(cpu, &flags); } static inline void sched_core_cpu_dying(unsigned int cpu) @@ -7382,6 +7368,19 @@ struct task_struct *idle_task(int cpu) return cpu_rq(cpu)->idle; } +#ifdef CONFIG_SCHED_CORE +int sched_core_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (sched_core_enabled(rq) && rq->curr == rq->idle) + return 1; + + return idle_cpu(cpu); +} + +#endif + #ifdef CONFIG_SMP /* * This function computes an effective utilization for the given CPU, to be @@ -9939,7 +9938,7 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); + init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -11073,11 +11072,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) /* * Ensure max(child_quota) <= parent_quota. On cgroup2, - * always take the min. On cgroup1, only inherit when no - * limit is set: + * always take the non-RUNTIME_INF min. On cgroup1, only + * inherit when no limit is set. In both cases this is used + * by the scheduler to determine if a given CFS task has a + * bandwidth constraint at some higher level. */ if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { - quota = min(quota, parent_quota); + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF) + quota = min(quota, parent_quota); } else { if (quota == RUNTIME_INF) quota = parent_quota; @@ -11138,6 +11142,27 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) return 0; } + +static u64 throttled_time_self(struct task_group *tg) +{ + int i; + u64 total = 0; + + for_each_possible_cpu(i) { + total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); + } + + return total; +} + +static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + + seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg)); + + return 0; +} #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -11214,6 +11239,10 @@ static struct cftype cpu_legacy_files[] = { .name = "stat", .seq_show = cpu_cfs_stat_show, }, + { + .name = "stat.local", + .seq_show = cpu_cfs_local_stat_show, + }, #endif #ifdef CONFIG_RT_GROUP_SCHED { @@ -11270,6 +11299,24 @@ static int cpu_extra_stat_show(struct seq_file *sf, return 0; } +static int cpu_local_stat_show(struct seq_file *sf, + struct cgroup_subsys_state *css) +{ +#ifdef CONFIG_CFS_BANDWIDTH + { + struct task_group *tg = css_tg(css); + u64 throttled_self_usec; + + throttled_self_usec = throttled_time_self(tg); + do_div(throttled_self_usec, NSEC_PER_USEC); + + seq_printf(sf, "throttled_usec %llu\n", + throttled_self_usec); + } +#endif + return 0; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) @@ -11448,6 +11495,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, + .css_local_stat_show = cpu_local_stat_show, #ifdef CONFIG_RT_GROUP_SCHED .can_attach = cpu_cgroup_can_attach, #endif |