author     Linus Torvalds <torvalds@linux-foundation.org>   2023-02-20 17:41:08 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2023-02-20 17:41:08 -0800
commit     1f2d9ffc7a5f916935749ffc6e93fb33bfe94d2f (patch)
tree       a5dabaa924d50867cbe347e20a7643b2850f11c0 /kernel
parent     a2f0e7eee1344eb9f91b22bc72d9eb0a52b849c9 (diff)
parent     7c4a5b89a0b5a57a64b601775b296abf77a9fe97 (diff)
Merge tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Improve the scalability of the CFS bandwidth unthrottling logic with a
large number of CPUs.
- Fix & rework various cpuidle routines, simplify interaction with the
generic scheduler code. Add __cpuidle methods as noinstr to objtool's
noinstr detection and fix boatloads of cpuidle bugs & quirks.
- Add new ABI: introduce MEMBARRIER_CMD_GET_REGISTRATIONS, to query
previously issued registrations (a userspace usage sketch follows this
list).
- Limit scheduler slice duration to the sysctl_sched_latency period, to
improve scheduling granularity with a large number of SCHED_IDLE
tasks.
- Debuggability enhancement on sys_exit(): warn about disabled IRQs,
but also enable them to prevent a cascade of followup problems and
repeat warnings.
- Fix the rescheduling logic in prio_changed_dl().
- Micro-optimize cpufreq and sched-util methods.
- Micro-optimize ttwu_runnable()
- Micro-optimize the idle-scanning in update_numa_stats(),
select_idle_capacity() and steal_cookie_task().
- Update the RSEQ code & self-tests (a sketch of reading the extended
struct rseq fields follows this list)
- Constify various scheduler methods
- Remove unused methods
- Refine __init tags
- Documentation updates
- Misc other cleanups, fixes
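
As a rough illustration of the new membarrier query command mentioned
above, the sketch below asks the kernel which membarrier registrations
the calling process has issued. It is only a sketch: it assumes a kernel
containing this merge and UAPI headers that already define
MEMBARRIER_CMD_GET_REGISTRATIONS, and the local membarrier() helper is
just a thin syscall(2) wrapper since libc provides none. The returned
value is the bitmask of MEMBARRIER_CMD_REGISTER_* commands previously
issued, matching what membarrier_get_registrations() in the
kernel/sched/membarrier.c hunk further down computes.

  #include <stdio.h>
  #include <errno.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/membarrier.h>

  /* No libc wrapper exists for membarrier(2); call it directly. */
  static int membarrier(int cmd, unsigned int flags, int cpu_id)
  {
          return syscall(__NR_membarrier, cmd, flags, cpu_id);
  }

  int main(void)
  {
          int regs;

          /* Register one flavour first so the query has something to report. */
          if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0))
                  perror("MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED");

          /* New in this merge: query previously issued registrations. */
          regs = membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0);
          if (regs < 0) {
                  fprintf(stderr, "MEMBARRIER_CMD_GET_REGISTRATIONS: %s\n",
                          strerror(errno));
                  return 1;
          }

          /* The result is a bitmask of MEMBARRIER_CMD_REGISTER_* commands. */
          if (regs & MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
                  printf("private expedited membarrier is registered\n");
          return 0;
  }

On kernels that predate this merge the query fails with EINVAL, which
also makes it a cheap feature probe.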
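
Likewise, a minimal sketch of consuming the extended rseq ABI referenced
in the RSEQ bullet above: the node_id and mm_cid fields and the
AT_RSEQ_FEATURE_SIZE / AT_RSEQ_ALIGN auxiliary-vector entries come from
the kernel/rseq.c hunks below, but the program itself is an
assumption-laden illustration. It assumes 6.3-era UAPI headers (the
fallback AT_* defines mirror <linux/auxvec.h>), that no rseq area was
already registered by the C library for this thread (recent glibc does
so at startup; run with GLIBC_TUNABLES=glibc.pthread.rseq=0 or an older
libc, otherwise a second registration with a different address fails
with EINVAL), and the RSEQ_SIG value is arbitrary.

  #include <stdio.h>
  #include <unistd.h>
  #include <sys/auxv.h>
  #include <sys/syscall.h>
  #include <linux/rseq.h>

  #ifndef AT_RSEQ_FEATURE_SIZE
  #define AT_RSEQ_FEATURE_SIZE  27      /* values from <linux/auxvec.h>, new in this cycle */
  #define AT_RSEQ_ALIGN         28
  #endif

  /* Arbitrary signature; it only matters for RSEQ_CS abort handling. */
  #define RSEQ_SIG      0x53053053

  /*
   * One rseq area is needed per thread (__thread in real code); a
   * single-threaded example gets away with a file-scope static. The
   * explicit alignment matches what the kernel reports via AT_RSEQ_ALIGN.
   */
  static struct rseq rseq_area __attribute__((aligned(32)));

  int main(void)
  {
          /*
           * sizeof(struct rseq) is 32 with these headers, i.e. the original
           * ABI size; node_id and mm_cid fit inside it and are filled in by
           * the kernel before the syscall returns to userspace.
           */
          if (syscall(__NR_rseq, &rseq_area, sizeof(rseq_area), 0, RSEQ_SIG)) {
                  perror("rseq registration");
                  return 1;
          }

          printf("cpu_id=%u node_id=%u mm_cid=%u (feature size %lu, align %lu)\n",
                 rseq_area.cpu_id, rseq_area.node_id, rseq_area.mm_cid,
                 getauxval(AT_RSEQ_FEATURE_SIZE), getauxval(AT_RSEQ_ALIGN));
          return 0;
  }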
* tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (110 commits)
sched/rt: pick_next_rt_entity(): check list_entry
sched/deadline: Add more reschedule cases to prio_changed_dl()
sched/fair: sanitize vruntime of entity being placed
sched/fair: Remove capacity inversion detection
sched/fair: unlink misfit task from cpu overutilized
objtool: mem*() are not uaccess safe
cpuidle: Fix poll_idle() noinstr annotation
sched/clock: Make local_clock() noinstr
sched/clock/x86: Mark sched_clock() noinstr
x86/pvclock: Improve atomic update of last_value in pvclock_clocksource_read()
x86/atomics: Always inline arch_atomic64*()
cpuidle: tracing, preempt: Squash _rcuidle tracing
cpuidle: tracing: Warn about !rcu_is_watching()
cpuidle: lib/bug: Disable rcu_is_watching() during WARN/BUG
cpuidle: drivers: firmware: psci: Dont instrument suspend code
KVM: selftests: Fix build of rseq test
exit: Detect and fix irq disabled state in oops
cpuidle, arm64: Fix the ARM64 cpuidle logic
cpuidle: mvebu: Fix duplicate flags assignment
sched/fair: Limit sched slice duration
...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/context_tracking.c            |  12
-rw-r--r--  kernel/cpu_pm.c                      |   9
-rw-r--r--  kernel/exit.c                        |   7
-rw-r--r--  kernel/fork.c                        |   8
-rw-r--r--  kernel/locking/lockdep.c             |   3
-rw-r--r--  kernel/panic.c                       |   5
-rw-r--r--  kernel/printk/printk.c               |   2
-rw-r--r--  kernel/ptrace.c                      |   2
-rw-r--r--  kernel/rseq.c                        |  65
-rw-r--r--  kernel/sched/clock.c                 |  27
-rw-r--r--  kernel/sched/core.c                  | 134
-rw-r--r--  kernel/sched/cpufreq_schedutil.c     |  43
-rw-r--r--  kernel/sched/cputime.c               |   4
-rw-r--r--  kernel/sched/deadline.c              |  42
-rw-r--r--  kernel/sched/fair.c                  | 389
-rw-r--r--  kernel/sched/idle.c                  |  47
-rw-r--r--  kernel/sched/membarrier.c            |  39
-rw-r--r--  kernel/sched/rt.c                    |   5
-rw-r--r--  kernel/sched/sched.h                 | 107
-rw-r--r--  kernel/sched/topology.c              |   4
-rw-r--r--  kernel/signal.c                      |   2
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c |  29
-rw-r--r--  kernel/time/tick-broadcast.c         |   6
-rw-r--r--  kernel/trace/trace.c                 |   3
-rw-r--r--  kernel/trace/trace_preemptirq.c      |  61
25 files changed, 689 insertions, 366 deletions
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 77978e372377..a09f1c19336a 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -510,7 +510,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - atomic_set(&ct->state, state); + arch_atomic_set(&ct->state, state); } else { /* * Even if context tracking is disabled on this CPU, because it's outside @@ -527,7 +527,7 @@ void noinstr __ct_user_enter(enum ctx_state state) */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - atomic_set(&ct->state, state); + arch_atomic_set(&ct->state, state); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -535,7 +535,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. */ - atomic_add(state, &ct->state); + arch_atomic_add(state, &ct->state); } } } @@ -630,12 +630,12 @@ void noinstr __ct_user_exit(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - atomic_set(&ct->state, CONTEXT_KERNEL); + arch_atomic_set(&ct->state, CONTEXT_KERNEL); } else { if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - atomic_set(&ct->state, CONTEXT_KERNEL); + arch_atomic_set(&ct->state, CONTEXT_KERNEL); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -643,7 +643,7 @@ void noinstr __ct_user_exit(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. */ - atomic_sub(state, &ct->state); + arch_atomic_sub(state, &ct->state); } } } diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index ba4ba71facf9..b0f0d15085db 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -30,16 +30,9 @@ static int cpu_pm_notify(enum cpu_pm_event event) { int ret; - /* - * This introduces a RCU read critical section, which could be - * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know - * this. 
- */ - ct_irq_enter_irqson(); rcu_read_lock(); ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL); rcu_read_unlock(); - ct_irq_exit_irqson(); return notifier_to_errno(ret); } @@ -49,11 +42,9 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev unsigned long flags; int ret; - ct_irq_enter_irqson(); raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL); raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); - ct_irq_exit_irqson(); return notifier_to_errno(ret); } diff --git a/kernel/exit.c b/kernel/exit.c index 15dc2ec80c46..bccfa4218356 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -807,6 +807,8 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; + WARN_ON(irqs_disabled()); + synchronize_group_exit(tsk, code); WARN_ON(tsk->plug); @@ -938,6 +940,11 @@ void __noreturn make_task_dead(int signr) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); + if (unlikely(irqs_disabled())) { + pr_info("note: %s[%d] exited with irqs disabled\n", + current->comm, task_pid_nr(current)); + local_irq_enable(); + } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), diff --git a/kernel/fork.c b/kernel/fork.c index d9c97704b7c9..038b898dad52 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1060,6 +1060,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->reported_split_lock = 0; #endif +#ifdef CONFIG_SCHED_MM_CID + tsk->mm_cid = -1; + tsk->mm_cid_active = 0; +#endif return tsk; free_stack: @@ -1169,6 +1173,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); + mm_init_cid(mm); return mm; fail_pcpu: @@ -1601,6 +1606,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) tsk->mm = mm; tsk->active_mm = mm; + sched_mm_cid_fork(tsk); return 0; } @@ -3034,7 +3040,7 @@ void __init mm_cache_init(void) * dynamically sized based on the maximum CPU number this system * can have, taking hotplug into account (nr_cpu_ids). */ - mm_size = sizeof(struct mm_struct) + cpumask_size(); + mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size(); mm_cachep = kmem_cache_create_usercopy("mm_struct", mm_size, ARCH_MIN_MMSTRUCT_ALIGN, diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e3375bc40dad..50d4863974e7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,6 +55,7 @@ #include <linux/rcupdate.h> #include <linux/kprobes.h> #include <linux/lockdep.h> +#include <linux/context_tracking.h> #include <asm/sections.h> @@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; int dl = READ_ONCE(debug_locks); + bool rcu = warn_rcu_enter(); /* Note: the following can be executed concurrently, so be careful. 
*/ pr_warn("\n"); @@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/panic.c b/kernel/panic.c index 463c9295bc28..487f5b03bf83 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,6 +34,7 @@ #include <linux/ratelimit.h> #include <linux/debugfs.h> #include <linux/sysfs.h> +#include <linux/context_tracking.h> #include <trace/events/error_report.h> #include <asm/sections.h> @@ -679,6 +680,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) { + bool rcu = warn_rcu_enter(); struct warn_args args; pr_warn(CUT_HERE); @@ -693,11 +695,13 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); + warn_rcu_exit(rcu); } EXPORT_SYMBOL(warn_slowpath_fmt); #else void __warn_printk(const char *fmt, ...) { + bool rcu = warn_rcu_enter(); va_list args; pr_warn(CUT_HERE); @@ -705,6 +709,7 @@ void __warn_printk(const char *fmt, ...) va_start(args, fmt); vprintk(fmt, args); va_end(args); + warn_rcu_exit(rcu); } EXPORT_SYMBOL(__warn_printk); #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a5ed2e53547c..94f136b25f6a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2196,7 +2196,7 @@ static u16 printk_sprint(char *text, u16 size, int facility, } } - trace_console_rcuidle(text, text_len); + trace_console(text, text_len); return text_len; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54482193e1ed..0786450074c1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -813,7 +813,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, { struct ptrace_rseq_configuration conf = { .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, - .rseq_abi_size = sizeof(*task->rseq), + .rseq_abi_size = task->rseq_len, .signature = task->rseq_sig, .flags = 0, }; diff --git a/kernel/rseq.c b/kernel/rseq.c index d38ab944105d..9de6e35fe679 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -18,6 +18,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/rseq.h> +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE 32 + #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) @@ -82,15 +85,25 @@ * F1. <failure> */ -static int rseq_update_cpu_id(struct task_struct *t) +static int rseq_update_cpu_node_id(struct task_struct *t) { - u32 cpu_id = raw_smp_processor_id(); struct rseq __user *rseq = t->rseq; + u32 cpu_id = raw_smp_processor_id(); + u32 node_id = cpu_to_node(cpu_id); + u32 mm_cid = task_mm_cid(t); - if (!user_write_access_begin(rseq, sizeof(*rseq))) + WARN_ON_ONCE((int) mm_cid < 0); + if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); + unsafe_put_user(node_id, &rseq->node_id, efault_end); + unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); + /* + * Additional feature fields added after ORIG_RSEQ_SIZE + * need to be conditionally updated only if + * t->rseq_len != ORIG_RSEQ_SIZE. 
+ */ user_write_access_end(); trace_rseq_update(t); return 0; @@ -101,9 +114,10 @@ efault: return -EFAULT; } -static int rseq_reset_rseq_cpu_id(struct task_struct *t) +static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; + u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, + mm_cid = 0; /* * Reset cpu_id_start to its initial state (0). @@ -117,6 +131,21 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) */ if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; + /* + * Reset node_id to its initial state (0). + */ + if (put_user(node_id, &t->rseq->node_id)) + return -EFAULT; + /* + * Reset mm_cid to its initial state (0). + */ + if (put_user(mm_cid, &t->rseq->mm_cid)) + return -EFAULT; + /* + * Additional feature fields added after ORIG_RSEQ_SIZE + * need to be conditionally reset only if + * t->rseq_len != ORIG_RSEQ_SIZE. + */ return 0; } @@ -301,7 +330,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(ret < 0)) goto error; } - if (unlikely(rseq_update_cpu_id(t))) + if (unlikely(rseq_update_cpu_node_id(t))) goto error; return; @@ -344,15 +373,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; - if (rseq_len != sizeof(*rseq)) + if (rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; - ret = rseq_reset_rseq_cpu_id(current); + ret = rseq_reset_rseq_cpu_node_id(current); if (ret) return ret; current->rseq = NULL; current->rseq_sig = 0; + current->rseq_len = 0; return 0; } @@ -365,7 +395,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, * the provided address differs from the prior * one. */ - if (current->rseq != rseq || rseq_len != sizeof(*rseq)) + if (current->rseq != rseq || rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -374,15 +404,24 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, } /* - * If there was no rseq previously registered, - * ensure the provided rseq is properly aligned and valid. + * If there was no rseq previously registered, ensure the provided rseq + * is properly aligned, as communcated to user-space through the ELF + * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq + * size, the required alignment is the original struct rseq alignment. + * + * In order to be valid, rseq_len is either the original rseq size, or + * large enough to contain all supported fields, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. 
*/ - if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || - rseq_len != sizeof(*rseq)) + if (rseq_len < ORIG_RSEQ_SIZE || + (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || + (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || + rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; + current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e374c0c923da..5732fa75ebab 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -93,7 +93,7 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -notrace static inline struct sched_clock_data *this_scd(void) +static __always_inline struct sched_clock_data *this_scd(void) { return this_cpu_ptr(&sched_clock_data); } @@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late); * min, max except they take wrapping into account */ -notrace static inline u64 wrap_min(u64 x, u64 y) +static __always_inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -notrace static inline u64 wrap_max(u64 x, u64 y) +static __always_inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -notrace static u64 sched_clock_local(struct sched_clock_data *scd) +static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) { u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; @@ -287,13 +287,28 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (!try_cmpxchg64(&scd->clock, &old_clock, clock)) + if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; } -notrace static u64 sched_clock_remote(struct sched_clock_data *scd) +noinstr u64 local_clock(void) +{ + u64 clock; + + if (static_branch_likely(&__sched_clock_stable)) + return sched_clock() + __sched_clock_offset; + + preempt_disable_notrace(); + clock = sched_clock_local(this_scd()); + preempt_enable_notrace(); + + return clock; +} +EXPORT_SYMBOL_GPL(local_clock); + +static notrace u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a4918a1faa9..fb49dbf61273 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -152,7 +152,7 @@ __read_mostly int scheduler_running; DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); /* kernel prio, less is more */ -static inline int __task_prio(struct task_struct *p) +static inline int __task_prio(const struct task_struct *p) { if (p->sched_class == &stop_sched_class) /* trumps deadline */ return -2; @@ -174,7 +174,8 @@ static inline int __task_prio(struct task_struct *p) */ /* real prio, less is less */ -static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +static inline bool prio_less(const struct task_struct *a, + const struct task_struct *b, bool in_fi) { int pa = __task_prio(a), pb = __task_prio(b); @@ -194,7 +195,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool return false; } -static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b) 
+static inline bool __sched_core_less(const struct task_struct *a, + const struct task_struct *b) { if (a->core_cookie < b->core_cookie) return true; @@ -3675,14 +3677,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } /* - * Mark the task runnable and perform wakeup-preemption. + * Mark the task runnable. */ -static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - struct rq_flags *rf) +static inline void ttwu_do_wakeup(struct task_struct *p) { - check_preempt_curr(rq, p, wake_flags); WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct rq_flags *rf) +{ + int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; + + lockdep_assert_rq_held(rq); + + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; + +#ifdef CONFIG_SMP + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; + else +#endif + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + + activate_task(rq, p, en_flags); + check_preempt_curr(rq, p, wake_flags); + + ttwu_do_wakeup(p); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { @@ -3712,31 +3739,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, #endif } -static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, - struct rq_flags *rf) -{ - int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; - - lockdep_assert_rq_held(rq); - - if (p->sched_contributes_to_load) - rq->nr_uninterruptible--; - -#ifdef CONFIG_SMP - if (wake_flags & WF_MIGRATED) - en_flags |= ENQUEUE_MIGRATED; - else -#endif - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - - activate_task(rq, p, en_flags); - ttwu_do_wakeup(rq, p, wake_flags, rf); -} - /* * Consider @p being inside a wait loop: * @@ -3770,9 +3772,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { - /* check_preempt_curr() may use rq clock */ - update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags, &rf); + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. 
+ */ + update_rq_clock(rq); + check_preempt_curr(rq, p, wake_flags); + } + ttwu_do_wakeup(p); ret = 1; } __task_rq_unlock(rq, &rf); @@ -4138,8 +4146,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) goto out; trace_sched_waking(p); - WRITE_ONCE(p->__state, TASK_RUNNING); - trace_sched_wakeup(p); + ttwu_do_wakeup(p); goto out; } @@ -5104,6 +5111,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); rseq_preempt(prev); + switch_mm_cid(prev, next); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -6260,7 +6268,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd) { int i; - for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { + for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) { if (i == cpu) continue; @@ -11365,3 +11373,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) { trace_sched_update_nr_running_tp(rq, count); } + +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_exit_signals(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + mm_cid_put(mm, t->mm_cid); + t->mm_cid = -1; + t->mm_cid_active = 0; + local_irq_restore(flags); +} + +void sched_mm_cid_before_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + mm_cid_put(mm, t->mm_cid); + t->mm_cid = -1; + t->mm_cid_active = 0; + local_irq_restore(flags); +} + +void sched_mm_cid_after_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + t->mm_cid = mm_cid_get(mm); + t->mm_cid_active = 1; + local_irq_restore(flags); + rseq_set_notify_resume(t); +} + +void sched_mm_cid_fork(struct task_struct *t) +{ + WARN_ON_ONCE(!t->mm || t->mm_cid != -1); + t->mm_cid_active = 1; +} +#endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1207c78f85c1..5c840151f3bb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -48,7 +48,6 @@ struct sugov_cpu { unsigned long util; unsigned long bw_dl; - unsigned long max; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->bw_dl = cpu_bw_dl(rq); sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), FREQUENCY_UTIL, NULL); @@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * sugov_iowait_apply() - Apply the IO boost to a CPU. * @sg_cpu: the sugov data for the cpu to boost * @time: the update time from the caller + * @max_cap: the max CPU capacity * * A CPU running a task which woken up after an IO operation can have its * utilization boosted to speed up the completion of those IO operations. @@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. 
*/ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long max_cap) { unsigned long boost; @@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; + boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); if (sg_cpu->util < boost) sg_cpu->util = boost; @@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, - u64 time, unsigned int flags) + u64 time, unsigned long max_cap, + unsigned int flags) { sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, return false; sugov_get_util(sg_cpu); - sugov_iowait_apply(sg_cpu, time); + sugov_iowait_apply(sg_cpu, time, max_cap); return true; } @@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned long max_cap; unsigned int next_f; - if (!sugov_update_single_common(sg_cpu, time, flags)) + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; - next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); + next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. 
@@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); unsigned long prev_util = sg_cpu->util; + unsigned long max_cap; /* * Fall back to the "frequency" path if frequency invariance is not @@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, return; } - if (!sugov_update_single_common(sg_cpu, time, flags)) + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; /* @@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, sg_cpu->util = prev_util; cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), - map_util_perf(sg_cpu->util), sg_cpu->max); + map_util_perf(sg_cpu->util), max_cap); sg_cpu->sg_policy->last_freq_update_time = time; } @@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned long util = 0, max = 1; + unsigned long util = 0, max_cap; unsigned int j; + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long j_util, j_max; sugov_get_util(j_sg_cpu); - sugov_iowait_apply(j_sg_cpu, time); - j_util = j_sg_cpu->util; - j_max = j_sg_cpu->max; + sugov_iowait_apply(j_sg_cpu, time, max_cap); - if (j_util * max > j_max * util) { - util = j_util; - max = j_max; - } + util = max(j_sg_cpu->util, util); } - return get_next_freq(sg_policy, util, max); + return get_next_freq(sg_policy, util, max_cap); } static void diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 95fc77853743..af7952f12e6c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -3,6 +3,10 @@ * Simple CPU accounting cgroup controller */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + #include <asm/cputime.h> +#endif + #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0d97d54276cc..71b24371a6f7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2663,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (task_on_rq_queued(p) || task_current(rq, p)) { + if (!task_on_rq_queued(p)) + return; + #ifdef CONFIG_SMP - /* - * This might be too much, but unfortunately - * we don't have the old deadline value, and - * we can't argue if the task is increasing - * or lowering its prio, so... - */ - if (!rq->dl.overloaded) - deadline_queue_pull_task(rq); + /* + * This might be too much, but unfortunately + * we don't have the old deadline value, and + * we can't argue if the task is increasing + * or lowering its prio, so... + */ + if (!rq->dl.overloaded) + deadline_queue_pull_task(rq); + if (task_current(rq, p)) { /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this @@ -2681,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline)) resched_curr(rq); -#else + } else { /* - * Again, we don't know if p has a earlier - * or later deadline, so let's blindly set a - * (maybe not needed) rescheduling point. 
+ * Current may not be deadline in case p was throttled but we + * have just replenished it (e.g. rt_mutex_setprio()). + * + * Otherwise, if p was given an earlier deadline, reschedule. */ - resched_curr(rq); -#endif /* CONFIG_SMP */ + if (!dl_task(rq->curr) || + dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) + resched_curr(rq); } +#else + /* + * We don't know if p has a earlier or later deadline, so let's blindly + * set a (maybe not needed) rescheduling point. + */ + resched_curr(rq); +#endif } DEFINE_SCHED_CLASS(dl) = { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0f8736991427..ff4dbbae3b10 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -468,7 +468,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse) return NULL; } -static inline struct sched_entity *parent_entity(struct sched_entity *se) +static inline struct sched_entity *parent_entity(const struct sched_entity *se) { return se->parent; } @@ -595,8 +595,8 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline bool entity_before(struct sched_entity *a, - struct sched_entity *b) +static inline bool entity_before(const struct sched_entity *a, + const struct sched_entity *b) { return (s64)(a->vruntime - b->vruntime) < 0; } @@ -1804,7 +1804,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->nr_running += rq->cfs.h_nr_running; ns->compute_capacity += capacity_of(cpu); - if (find_idle && !rq->nr_running && idle_cpu(cpu)) { + if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { if (READ_ONCE(rq->numa_migrate_on) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) continue; @@ -1836,7 +1836,7 @@ static void task_numa_assign(struct task_numa_env *env, int start = env->dst_cpu; /* Find alternative idle CPU. */ - for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) { if (cpu == env->best_cpu || !idle_cpu(cpu) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { continue; @@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util, * * For uclamp_max, we can tolerate a drop in performance level as the * goal is to cap the task. So it's okay if it's getting less. - * - * In case of capacity inversion we should honour the inverted capacity - * for both uclamp_min and uclamp_max all the time. */ - capacity_orig = cpu_in_capacity_inversion(cpu); - if (capacity_orig) { - capacity_orig_thermal = capacity_orig; - } else { - capacity_orig = capacity_orig_of(cpu); - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); - } + capacity_orig = capacity_orig_of(cpu); + capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* * We want to force a task to fit a cpu as implied by uclamp_max. @@ -4561,8 +4553,8 @@ static inline int util_fits_cpu(unsigned long util, * handle the case uclamp_min > uclamp_max. 
*/ uclamp_min = min(uclamp_min, uclamp_max); - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) - fits = fits && (uclamp_min <= capacity_orig_thermal); + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) + return -1; return fits; } @@ -4572,7 +4564,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); unsigned long util = task_util_est(p); - return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); + /* + * Return true only if the cpu fully fits the task requirements, which + * include the utilization but also the performance hints. + */ + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -4656,6 +4652,7 @@ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime = cfs_rq->min_vruntime; + u64 sleep_time; /* * The 'current' period is already promised to the current tasks, @@ -4685,8 +4682,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime -= thresh; } - /* ensure we never gain time by being placed backwards. */ - se->vruntime = max_vruntime(se->vruntime, vruntime); + /* + * Pull vruntime of the entity being placed to the base level of + * cfs_rq, to prevent boosting it if placed backwards. If the entity + * slept for a long time, don't even try to compare its vruntime with + * the base as it may be too far off and the comparison may get + * inversed due to s64 overflow. + */ + sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; + if ((s64)sleep_time > 60LL * NSEC_PER_SEC) + se->vruntime = vruntime; + else + se->vruntime = max_vruntime(se->vruntime, vruntime); } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -4896,7 +4903,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) struct sched_entity *se; s64 delta; - ideal_runtime = sched_slice(cfs_rq, curr); + /* + * When many tasks blow up the sched_period; it is possible that + * sched_slice() reports unusually large results (when many tasks are + * very light for example). Therefore impose a maximum. + */ + ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { resched_curr(rq_of(cfs_rq)); @@ -5461,22 +5474,105 @@ unthrottle_throttle: resched_curr(rq); } -static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +#ifdef CONFIG_SMP +static void __cfsb_csd_unthrottle(void *arg) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cursor, *tmp; + struct rq *rq = arg; + struct rq_flags rf; + + rq_lock(rq, &rf); + + /* + * Since we hold rq lock we're safe from concurrent manipulation of + * the CSD list. However, this RCU critical section annotates the + * fact that we pair with sched_free_group_rcu(), so that we cannot + * race with group being freed in the window between removing it + * from the list and advancing to the next entry in the list. 
+ */ + rcu_read_lock(); + + list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, + throttled_csd_list) { + list_del_init(&cursor->throttled_csd_list); + + if (cfs_rq_throttled(cursor)) + unthrottle_cfs_rq(cursor); + } + + rcu_read_unlock(); + + rq_unlock(rq, &rf); +} + +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + bool first; + + if (rq == this_rq()) { + unthrottle_cfs_rq(cfs_rq); + return; + } + + /* Already enqueued */ + if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) + return; + + first = list_empty(&rq->cfsb_csd_list); + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); + if (first) + smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); +} +#else +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + unthrottle_cfs_rq(cfs_rq); +} +#endif + +static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + lockdep_assert_rq_held(rq_of(cfs_rq)); + + if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || + cfs_rq->runtime_remaining <= 0)) + return; + + __unthrottle_cfs_rq_async(cfs_rq); +} + +static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +{ + struct cfs_rq *local_unthrottle = NULL; + int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; + bool throttled = false; + struct cfs_rq *cfs_rq; + struct rq_flags rf; + struct rq *rq; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { - struct rq *rq = rq_of(cfs_rq); - struct rq_flags rf; + rq = rq_of(cfs_rq); + + if (!remaining) { + throttled = true; + break; + } rq_lock_irqsave(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; - /* By the above check, this should never be true */ +#ifdef CONFIG_SMP + /* Already queued for async unthrottle */ + if (!list_empty(&cfs_rq->throttled_csd_list)) + goto next; +#endif + + /* By the above checks, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); raw_spin_lock(&cfs_b->lock); @@ -5490,16 +5586,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) cfs_rq->runtime_remaining += runtime; /* we check whether we're throttled above */ - if (cfs_rq->runtime_remaining > 0) - unthrottle_cfs_rq(cfs_rq); + if (cfs_rq->runtime_remaining > 0) { + if (cpu_of(rq) != this_cpu || + SCHED_WARN_ON(local_unthrottle)) + unthrottle_cfs_rq_async(cfs_rq); + else + local_unthrottle = cfs_rq; + } else { + throttled = true; + } next: rq_unlock_irqrestore(rq, &rf); - - if (!remaining) - break; } rcu_read_unlock(); + + if (local_unthrottle) { + rq = cpu_rq(this_cpu); + rq_lock_irqsave(rq, &rf); + if (cfs_rq_throttled(local_unthrottle)) + unthrottle_cfs_rq(local_unthrottle); + rq_unlock_irqrestore(rq, &rf); + } + + return throttled; } /* @@ -5544,10 +5654,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - distribute_cfs_runtime(cfs_b); + throttled = distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - - throttled = !list_empty(&cfs_b->throttled_cfs_rq); } /* @@ -5824,6 +5932,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); +#ifdef CONFIG_SMP + INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); +#endif } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -5840,12 +5951,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + int __maybe_unused i; + /* init_cfs_bandwidth() was not called */ if (!cfs_b->throttled_cfs_rq.next) return; hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); + + /* + * It is possible that we still have some cfs_rq's pending on a CSD + * list, though this race is very rare. In order for this to occur, we + * must have raced with the last task leaving the group while there + * exist throttled cfs_rq(s), and the period_timer must have queued the + * CSD item but the remote cpu has not yet processed it. To handle this, + * we can simply flush all pending CSD work inline here. We're + * guaranteed at this point that no additional cfs_rq of this group can + * join a CSD list. + */ +#ifdef CONFIG_SMP + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + if (list_empty(&rq->cfsb_csd_list)) + continue; + + local_irq_save(flags); + __cfsb_csd_unthrottle(rq); + local_irq_restore(flags); + } +#endif } /* @@ -6008,6 +6145,7 @@ static inline bool cpu_overutilized(int cpu) unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + /* Return true only if the utilization doesn't fit CPU's capacity */ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } @@ -6801,6 +6939,7 @@ static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { unsigned long task_util, util_min, util_max, best_cap = 0; + int fits, best_fits = 0; int cpu, best_cpu = -1; struct cpumask *cpus; @@ -6811,17 +6950,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) util_min = uclamp_eff_value(p, UCLAMP_MIN); util_max = uclamp_eff_value(p, UCLAMP_MAX); - for_each_cpu_wrap(cpu, cpus, target) { + for_each_cpu_wrap(cpu, cpus, target + 1) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (util_fits_cpu(task_util, util_min, util_max, cpu)) + + fits = util_fits_cpu(task_util, util_min, util_max, cpu); + + /* This CPU fits with all requirements */ + if (fits > 0) return cpu; + /* + * Only the min performance hint (i.e. uclamp_min) doesn't fit. + * Look for the CPU with best capacity. + */ + else if (fits < 0) + cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); - if (cpu_cap > best_cap) { + /* + * First, select CPU which fits better (-1 being better than 0). + * Then, select the one with best capacity at same level. + */ + if ((fits < best_fits) || + ((fits == best_fits) && (cpu_cap > best_cap))) { best_cap = cpu_cap; best_cpu = cpu; + best_fits = fits; } } @@ -6834,7 +6989,11 @@ static inline bool asym_fits_cpu(unsigned long util, int cpu) { if (sched_asym_cpucap_active()) - return util_fits_cpu(util, util_min, util_max, cpu); + /* + * Return true only if the cpu fully fits the task requirements + * which include the utilization and the performance hints. + */ + return (util_fits_cpu(util, util_min, util_max, cpu) > 0); return true; } @@ -7201,6 +7360,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) unsigned long p_util_max = uclamp_is_used() ? 
uclamp_eff_value(p, UCLAMP_MAX) : 1024; struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; + int prev_fits = -1, best_fits = -1; + unsigned long best_thermal_cap = 0; + unsigned long prev_thermal_cap = 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; @@ -7236,6 +7398,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) unsigned long prev_spare_cap = 0; int max_spare_cap_cpu = -1; unsigned long base_energy; + int fits, max_fits = -1; cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -7285,7 +7448,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) util_min = max(rq_util_min, p_util_min); util_max = max(rq_util_max, p_util_max); } - if (!util_fits_cpu(util, util_min, util_max, cpu)) + + fits = util_fits_cpu(util, util_min, util_max, cpu); + if (!fits) continue; lsub_positive(&cpu_cap, util); @@ -7293,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (cpu == prev_cpu) { /* Always use prev_cpu as a candidate. */ prev_spare_cap = cpu_cap; - } else if (cpu_cap > max_spare_cap) { + prev_fits = fits; + } else if ((fits > max_fits) || + ((fits == max_fits) && (cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -7301,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) */ max_spare_cap = cpu_cap; max_spare_cap_cpu = cpu; + max_fits = fits; } } @@ -7319,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (prev_delta < base_energy) goto unlock; prev_delta -= base_energy; + prev_thermal_cap = cpu_thermal_cap; best_delta = min(best_delta, prev_delta); } /* Evaluate the energy impact of using max_spare_cap_cpu. */ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { + /* Current best energy cpu fits better */ + if (max_fits < best_fits) + continue; + + /* + * Both don't fit performance hint (i.e. uclamp_min) + * but best energy cpu has better capacity. + */ + if ((max_fits < 0) && + (cpu_thermal_cap <= best_thermal_cap)) + continue; + cur_delta = compute_energy(&eenv, pd, cpus, p, max_spare_cap_cpu); /* CPU utilization has changed */ if (cur_delta < base_energy) goto unlock; cur_delta -= base_energy; - if (cur_delta < best_delta) { - best_delta = cur_delta; - best_energy_cpu = max_spare_cap_cpu; - } + + /* + * Both fit for the task but best energy cpu has lower + * energy impact. + */ + if ((max_fits > 0) && (best_fits > 0) && + (cur_delta >= best_delta)) + continue; + + best_delta = cur_delta; + best_energy_cpu = max_spare_cap_cpu; + best_fits = max_fits; + best_thermal_cap = cpu_thermal_cap; } } rcu_read_unlock(); - if (best_delta < prev_delta) + if ((best_fits > prev_fits) || + ((best_fits > 0) && (best_delta < prev_delta)) || + ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) target = best_energy_cpu; return target; @@ -8838,82 +9030,16 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity_orig = arch_scale_cpu_capacity(cpu); unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - struct rq *rq = cpu_rq(cpu); - rq->cpu_capacity_orig = capacity_orig; + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); if (!capacity) capacity = 1; - rq->cpu_capacity = capacity; - - /* - * Detect if the performance domain is in capacity inversion state. 
- * - * Capacity inversion happens when another perf domain with equal or - * lower capacity_orig_of() ends up having higher capacity than this - * domain after subtracting thermal pressure. - * - * We only take into account thermal pressure in this detection as it's - * the only metric that actually results in *real* reduction of - * capacity due to performance points (OPPs) being dropped/become - * unreachable due to thermal throttling. - * - * We assume: - * * That all cpus in a perf domain have the same capacity_orig - * (same uArch). - * * Thermal pressure will impact all cpus in this perf domain - * equally. - */ - if (sched_energy_enabled()) { - unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); - struct perf_domain *pd; - - rcu_read_lock(); - - pd = rcu_dereference(rq->rd->pd); - rq->cpu_capacity_inverted = 0; - - for (; pd; pd = pd->next) { - struct cpumask *pd_span = perf_domain_span(pd); - unsigned long pd_cap_orig, pd_cap; - - /* We can't be inverted against our own pd */ - if (cpumask_test_cpu(cpu_of(rq), pd_span)) - continue; - - cpu = cpumask_any(pd_span); - pd_cap_orig = arch_scale_cpu_capacity(cpu); - - if (capacity_orig < pd_cap_orig) - continue; - - /* - * handle the case of multiple perf domains have the - * same capacity_orig but one of them is under higher - * thermal pressure. We record it as capacity - * inversion. - */ - if (capacity_orig == pd_cap_orig) { - pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu)); - - if (pd_cap > inv_cap) { - rq->cpu_capacity_inverted = inv_cap; - break; - } - } else if (pd_cap_orig > inv_cap) { - rq->cpu_capacity_inverted = inv_cap; - break; - } - } - - rcu_read_unlock(); - } - - trace_sched_cpu_capacity_tp(rq); + cpu_rq(cpu)->cpu_capacity = capacity; + trace_sched_cpu_capacity_tp(cpu_rq(cpu)); sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; @@ -10141,24 +10267,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds); - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) - goto out_balanced; - } - - local = &sds.local_stat; - busiest = &sds.busiest_stat; - /* There is no busy sibling group to pull tasks from */ if (!sds.busiest) goto out_balanced; + busiest = &sds.busiest_stat; + /* Misfit tasks should be dealt with regardless of the avg load */ if (busiest->group_type == group_misfit_task) goto force_balance; + if (sched_energy_enabled()) { + struct root_domain *rd = env->dst_rq->rd; + + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) + goto out_balanced; + } + /* ASYM feature bypasses nice load balance check */ if (busiest->group_type == group_asym_packing) goto force_balance; @@ -10171,6 +10296,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; + local = &sds.local_stat; /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -11734,7 +11860,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) /* * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. 
*/ -static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle) +static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, + bool forceidle) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -11759,11 +11886,12 @@ void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) se_fi_update(se, rq->core->core_forceidle_seq, in_fi); } -bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) { struct rq *rq = task_rq(a); - struct sched_entity *sea = &a->se; - struct sched_entity *seb = &b->se; + const struct sched_entity *sea = &a->se; + const struct sched_entity *seb = &b->se; struct cfs_rq *cfs_rqa; struct cfs_rq *cfs_rqb; s64 delta; @@ -12480,6 +12608,11 @@ __init void init_sched_fair_class(void) for_each_possible_cpu(i) { zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + +#ifdef CONFIG_CFS_BANDWIDTH + INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); + INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list); +#endif } open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f26ab2675f7d..e9ef66be2870 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -51,18 +51,22 @@ __setup("hlt", cpu_idle_nopoll_setup); static noinline int __cpuidle cpu_idle_poll(void) { + instrumentation_begin(); trace_cpu_idle(0, smp_processor_id()); stop_critical_timings(); - ct_idle_enter(); - local_irq_enable(); + ct_cpuidle_enter(); + raw_local_irq_enable(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); + raw_local_irq_disable(); - ct_idle_exit(); + ct_cpuidle_exit(); start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + local_irq_enable(); + instrumentation_end(); return 1; } @@ -75,7 +79,6 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; - raw_local_irq_enable(); } /** @@ -85,44 +88,20 @@ void __weak arch_cpu_idle(void) */ void __cpuidle default_idle_call(void) { - if (current_clr_polling_and_test()) { - local_irq_enable(); - } else { - + instrumentation_begin(); + if (!current_clr_polling_and_test()) { trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); - /* - * arch_cpu_idle() is supposed to enable IRQs, however - * we can't do that because of RCU and tracing. - * - * Trace IRQs enable here, then switch off RCU, and have - * arch_cpu_idle() use raw_local_irq_enable(). Note that - * ct_idle_enter() relies on lockdep IRQ state, so switch that - * last -- this is very similar to the entry code. - */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - ct_idle_enter(); - lockdep_hardirqs_on(_THIS_IP_); - + ct_cpuidle_enter(); arch_cpu_idle(); - - /* - * OK, so IRQs are enabled here, but RCU needs them disabled to - * turn itself back on.. funny thing is that disabling IRQs - * will cause tracing, which needs RCU. Jump through hoops to - * make it 'work'. 
-		 */
-		raw_local_irq_disable();
-		lockdep_hardirqs_off(_THIS_IP_);
-		ct_idle_exit();
-		lockdep_hardirqs_on(_THIS_IP_);
-		raw_local_irq_enable();
+		ct_cpuidle_exit();
 
 		start_critical_timings();
 		trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
 	}
+	local_irq_enable();
+	instrumentation_end();
 }
 
 static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 0c5be7ebb1dc..2ad881d07752 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -159,7 +159,8 @@
 	| MEMBARRIER_CMD_PRIVATE_EXPEDITED			\
 	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED		\
 	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	\
-	| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
+	| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
+	| MEMBARRIER_CMD_GET_REGISTRATIONS)
 
 static void ipi_mb(void *info)
 {
@@ -540,6 +541,40 @@ static int membarrier_register_private_expedited(int flags)
 	return 0;
 }
 
+static int membarrier_get_registrations(void)
+{
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+	int registrations_mask = 0, membarrier_state, i;
+	static const int states[] = {
+		MEMBARRIER_STATE_GLOBAL_EXPEDITED |
+			MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+		MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+		MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE |
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY,
+		MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ |
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY
+	};
+	static const int registration_cmds[] = {
+		MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED,
+		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
+		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
+		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
+	};
+	BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds));
+
+	membarrier_state = atomic_read(&mm->membarrier_state);
+	for (i = 0; i < ARRAY_SIZE(states); ++i) {
+		if (membarrier_state & states[i]) {
+			registrations_mask |= registration_cmds[i];
+			membarrier_state &= ~states[i];
+		}
+	}
+	WARN_ON_ONCE(membarrier_state != 0);
+	return registrations_mask;
+}
+
 /**
  * sys_membarrier - issue memory barriers on a set of threads
  * @cmd:   Takes command values defined in enum membarrier_cmd.
@@ -623,6 +658,8 @@ SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
 		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
 		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
+	case MEMBARRIER_CMD_GET_REGISTRATIONS:
+		return membarrier_get_registrations();
 	default:
 		return -EINVAL;
 	}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ed2a47e4ddae..0a11f44adee5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1777,6 +1777,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
 	BUG_ON(idx >= MAX_RT_PRIO);
 
 	queue = array->queue + idx;
+	if (SCHED_WARN_ON(list_empty(queue)))
+		return NULL;
 	next = list_entry(queue->next, struct sched_rt_entity, run_list);
 
 	return next;
@@ -1789,7 +1791,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 
 	do {
 		rt_se = pick_next_rt_entity(rt_rq);
-		BUG_ON(!rt_se);
+		if (unlikely(!rt_se))
+			return NULL;
 		rt_rq = group_rt_rq(rt_se);
 	} while (rt_rq);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8ddb7053..3e8df6d31c1e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -248,7 +248,7 @@ static inline void update_avg(u64 *avg, u64 sample)
 
 #define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
 
-static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
+static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se)
 {
 #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
 	return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
@@ -260,8 +260,8 @@ static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
 /*
  * Tells if entity @a should preempt entity @b.
  */
-static inline bool
-dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+static inline bool dl_entity_preempt(const struct sched_dl_entity *a,
+				     const struct sched_dl_entity *b)
 {
 	return dl_entity_is_special(a) ||
 	       dl_time_before(a->deadline, b->deadline);
@@ -645,6 +645,9 @@ struct cfs_rq {
 	int throttled;
 	int throttle_count;
 	struct list_head throttled_list;
+#ifdef CONFIG_SMP
+	struct list_head throttled_csd_list;
+#endif
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
@@ -1041,7 +1044,6 @@ struct rq {
 
 	unsigned long cpu_capacity;
 	unsigned long cpu_capacity_orig;
-	unsigned long cpu_capacity_inverted;
 
 	struct balance_callback *balance_callback;
 
@@ -1154,6 +1156,11 @@ struct rq {
 
 	/* Scratch cpumask to be temporarily used under rq_lock */
 	cpumask_var_t scratch_mask;
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+	call_single_data_t cfsb_csd;
+	struct list_head cfsb_csd_list;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1236,7 +1243,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
 	return &rq->__lock;
 }
 
-bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
+bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
+		   bool fi);
 
 /*
  * Helpers to check if the CPU's core cookie matches with the task's cookie
@@ -1415,7 +1423,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 }
 
 /* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
 {
 	return se->cfs_rq;
 }
@@ -1428,19 +1436,16 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 
 #else
 
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-	return container_of(se, struct task_struct, se);
-}
+#define task_of(_se)	container_of(_se, struct task_struct, se)
 
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p)
 {
 	return &task_rq(p)->cfs;
 }
 
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
 {
-	struct task_struct *p = task_of(se);
+	const struct task_struct *p = task_of(se);
 	struct rq *rq = task_rq(p);
 
 	return &rq->cfs;
@@ -2893,24 +2898,6 @@ static inline unsigned long capacity_orig_of(int cpu)
 	return cpu_rq(cpu)->cpu_capacity_orig;
 }
 
-/*
- * Returns inverted capacity if the CPU is in capacity inversion state.
- * 0 otherwise.
- *
- * Capacity inversion detection only considers thermal impact where actual
- * performance points (OPPs) gets dropped.
- *
- * Capacity inversion state happens when another performance domain that has
- * equal or lower capacity_orig_of() becomes effectively larger than the perf
- * domain this CPU belongs to due to thermal pressure throttling it hard.
- *
- * See comment in update_cpu_capacity().
- */
-static inline unsigned long cpu_in_capacity_inversion(int cpu)
-{
-	return cpu_rq(cpu)->cpu_capacity_inverted;
-}
-
 /**
  * enum cpu_util_type - CPU utilization type
  * @FREQUENCY_UTIL: Utilization used to select frequency
@@ -3261,4 +3248,62 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
 	cgroup_account_cputime(curr, delta_exec);
 }
 
+#ifdef CONFIG_SCHED_MM_CID
+static inline int __mm_cid_get(struct mm_struct *mm)
+{
+	struct cpumask *cpumask;
+	int cid;
+
+	cpumask = mm_cidmask(mm);
+	cid = cpumask_first_zero(cpumask);
+	if (cid >= nr_cpu_ids)
+		return -1;
+	__cpumask_set_cpu(cid, cpumask);
+	return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm, int cid)
+{
+	lockdep_assert_irqs_disabled();
+	if (cid < 0)
+		return;
+	raw_spin_lock(&mm->cid_lock);
+	__cpumask_clear_cpu(cid, mm_cidmask(mm));
+	raw_spin_unlock(&mm->cid_lock);
+}
+
+static inline int mm_cid_get(struct mm_struct *mm)
+{
+	int ret;
+
+	lockdep_assert_irqs_disabled();
+	raw_spin_lock(&mm->cid_lock);
+	ret = __mm_cid_get(mm);
+	raw_spin_unlock(&mm->cid_lock);
+	return ret;
+}
+
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
+{
+	if (prev->mm_cid_active) {
+		if (next->mm_cid_active && next->mm == prev->mm) {
+			/*
+			 * Context switch between threads in same mm, hand over
+			 * the mm_cid from prev to next.
+			 */
+			next->mm_cid = prev->mm_cid;
+			prev->mm_cid = -1;
+			return;
+		}
+		mm_cid_put(prev->mm, prev->mm_cid);
+		prev->mm_cid = -1;
+	}
+	if (next->mm_cid_active)
+		next->mm_cid = mm_cid_get(next->mm);
+}
+
+#else
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+#endif
+
 #endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54e..d93c3379e901 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -578,7 +578,7 @@ out:
  */
 struct root_domain def_root_domain;
 
-void init_defrootdomain(void)
+void __init init_defrootdomain(void)
 {
 	init_rootdomain(&def_root_domain);
 
@@ -2451,7 +2451,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
  * Set up scheduler domains and groups. For now this just excludes isolated
  * CPUs, but could be used to exclude other special cases in the future.
  */
-int sched_init_domains(const struct cpumask *cpu_map)
+int __init sched_init_domains(const struct cpumask *cpu_map)
 {
 	int err;
 
diff --git a/kernel/signal.c b/kernel/signal.c
index ae26da61c4d9..8cb28f1df294 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2951,6 +2951,7 @@ void exit_signals(struct task_struct *tsk)
 	cgroup_threadgroup_change_begin(tsk);
 
 	if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+		sched_mm_cid_exit_signals(tsk);
 		tsk->flags |= PF_EXITING;
 		cgroup_threadgroup_change_end(tsk);
 		return;
@@ -2961,6 +2962,7 @@ void exit_signals(struct task_struct *tsk)
 	 * From now this task is not visible for group-wide signals,
 	 * see wants_signal(), do_signal_stop().
 	 */
+	sched_mm_cid_exit_signals(tsk);
 	tsk->flags |= PF_EXITING;
 	cgroup_threadgroup_change_end(tsk);
 
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 797eb93103ad..e28f9210f8a1 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -56,25 +56,20 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
 	 * hrtimer callback function is currently running, then
 	 * hrtimer_start() cannot move it and the timer stays on the CPU on
 	 * which it is assigned at the moment.
+	 */
+	hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
+	/*
+	 * The core tick broadcast mode expects bc->bound_on to be set
+	 * correctly to prevent a CPU which has the broadcast hrtimer
+	 * armed from going deep idle.
 	 *
-	 * As this can be called from idle code, the hrtimer_start()
-	 * invocation has to be wrapped with RCU_NONIDLE() as
-	 * hrtimer_start() can call into tracing.
+	 * As tick_broadcast_lock is held, nothing can change the cpu
+	 * base which was just established in hrtimer_start() above. So
+	 * the below access is safe even without holding the hrtimer
+	 * base lock.
 	 */
-	RCU_NONIDLE( {
-		hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
-		/*
-		 * The core tick broadcast mode expects bc->bound_on to be set
-		 * correctly to prevent a CPU which has the broadcast hrtimer
-		 * armed from going deep idle.
-		 *
-		 * As tick_broadcast_lock is held, nothing can change the cpu
-		 * base which was just established in hrtimer_start() above. So
-		 * the below access is safe even without holding the hrtimer
-		 * base lock.
-		 */
-		bc->bound_on = bctimer.base->cpu_base->cpu;
-	} );
+	bc->bound_on = bctimer.base->cpu_base->cpu;
+
 	return 0;
 }
 
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f7fe6fe36173..93bf2b4e47e5 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -622,9 +622,13 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void)
  * to avoid a deep idle transition as we are about to get the
  * broadcast IPI right away.
  */
-int tick_check_broadcast_expired(void)
+noinstr int tick_check_broadcast_expired(void)
 {
+#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
+	return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask));
+#else
 	return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
+#endif
 }
 
 /*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c9e40f692650..54a163ae4815 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3128,6 +3128,9 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
 		return;
 	}
 
+	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY)))
+		return;
+
 	/*
 	 * When an NMI triggers, RCU is enabled via ct_nmi_enter(),
 	 * but if the above rcu_is_watching() failed, then the NMI
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 1e130da1b742..e37446f7916e 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -15,6 +15,20 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/preemptirq.h>
 
+/*
+ * Use regular trace points on architectures that implement noinstr
+ * tooling: these calls will only happen with RCU enabled, which can
+ * use a regular tracepoint.
+ *
+ * On older architectures, use the rcuidle tracing methods (which
+ * aren't NMI-safe - so exclude NMI contexts):
+ */
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+#define trace(point)	trace_##point
+#else
+#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
+#endif
+
 #ifdef CONFIG_TRACE_IRQFLAGS
 /* Per-cpu variable to prevent redundant calls when IRQs already off */
 static DEFINE_PER_CPU(int, tracing_irq_cpu);
@@ -28,8 +42,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
 void trace_hardirqs_on_prepare(void)
 {
 	if (this_cpu_read(tracing_irq_cpu)) {
-		if (!in_nmi())
-			trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
 		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
@@ -40,8 +53,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
 void trace_hardirqs_on(void)
 {
 	if (this_cpu_read(tracing_irq_cpu)) {
-		if (!in_nmi())
-			trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
 		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
@@ -63,8 +75,7 @@ void trace_hardirqs_off_finish(void)
 	if (!this_cpu_read(tracing_irq_cpu)) {
 		this_cpu_write(tracing_irq_cpu, 1);
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
-		if (!in_nmi())
-			trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
 	}
 }
 
@@ -78,56 +89,24 @@ void trace_hardirqs_off(void)
 	if (!this_cpu_read(tracing_irq_cpu)) {
 		this_cpu_write(tracing_irq_cpu, 1);
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
-		if (!in_nmi())
-			trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
 	}
 }
 EXPORT_SYMBOL(trace_hardirqs_off);
 NOKPROBE_SYMBOL(trace_hardirqs_off);
-
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
-{
-	if (this_cpu_read(tracing_irq_cpu)) {
-		if (!in_nmi())
-			trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
-		tracer_hardirqs_on(CALLER_ADDR0, caller_addr);
-		this_cpu_write(tracing_irq_cpu, 0);
-	}
-
-	lockdep_hardirqs_on_prepare();
-	lockdep_hardirqs_on(caller_addr);
-}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-NOKPROBE_SYMBOL(trace_hardirqs_on_caller);
-
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
-{
-	lockdep_hardirqs_off(caller_addr);
-
-	if (!this_cpu_read(tracing_irq_cpu)) {
-		this_cpu_write(tracing_irq_cpu, 1);
-		tracer_hardirqs_off(CALLER_ADDR0, caller_addr);
-		if (!in_nmi())
-			trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
-	}
-}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
-NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
 #endif /* CONFIG_TRACE_IRQFLAGS */
 
 #ifdef CONFIG_TRACE_PREEMPT_TOGGLE
 
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
-	if (!in_nmi())
-		trace_preempt_enable_rcuidle(a0, a1);
+	trace(preempt_enable)(a0, a1);
 	tracer_preempt_on(a0, a1);
 }
 
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
-	if (!in_nmi())
-		trace_preempt_disable_rcuidle(a0, a1);
+	trace(preempt_disable)(a0, a1);
 	tracer_preempt_off(a0, a1);
 }
 #endif
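Note (illustrative, not part of the patch above): the trace() wrapper added in trace_preemptirq.c is a plain token-pasting macro. The standalone sketch below reproduces the pattern with stub functions so the two expansions can be seen in isolation; ARCH_WANTS_NO_INSTR and the stubs are invented stand-ins for the kernel's CONFIG_ARCH_WANTS_NO_INSTR, in_nmi() and tracepoints, not real APIs.

/*
 * Standalone sketch of the trace(point) macro pattern from
 * kernel/trace/trace_preemptirq.c. Everything here is a stand-in.
 */
#include <stdbool.h>
#include <stdio.h>

static bool in_nmi(void) { return false; }	/* stand-in */
static void trace_irq_enable(void) { puts("regular tracepoint"); }
static void trace_irq_enable_rcuidle(void) { puts("rcuidle tracepoint"); }

#ifdef ARCH_WANTS_NO_INSTR	/* stand-in for CONFIG_ARCH_WANTS_NO_INSTR */
#define trace(point)	trace_##point
#else
#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
#endif

int main(void)
{
	/*
	 * Expands either to trace_irq_enable() or, guarded by !in_nmi(),
	 * to trace_irq_enable_rcuidle(), matching the call sites above.
	 */
	trace(irq_enable)();
	return 0;
}

Built with -DARCH_WANTS_NO_INSTR the call site compiles to the regular tracepoint; without it, the rcuidle variant is used and NMI context is excluded, which is the difference the patch hides behind a single macro.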
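Usage sketch (not part of the diff): the new MEMBARRIER_CMD_GET_REGISTRATIONS command added to kernel/sched/membarrier.c above returns a mask of MEMBARRIER_CMD_REGISTER_* bits for the calling process. The small userspace program below shows roughly how it can be queried; the membarrier() wrapper is a local helper (libc may not ship one), and it assumes UAPI headers new enough to define the command.

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

/* Local helper around the raw syscall: membarrier(cmd, flags, cpu_id). */
static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	int regs;

	/* Register one membarrier type so the query has something to show. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0))
		perror("MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED");

	/* New command: returns a mask of MEMBARRIER_CMD_REGISTER_* bits. */
	regs = membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0);
	if (regs < 0) {
		perror("MEMBARRIER_CMD_GET_REGISTRATIONS");
		return 1;
	}
	if (regs & MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
		printf("private expedited membarrier is registered\n");
	return 0;
}

On kernels that predate this command the query fails with EINVAL, which is a convenient way to detect support.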