Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c | 703
1 file changed, 566 insertions(+), 137 deletions(-)
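
A note before the diff: the affinity hunks below revolve around a saved copy of the user-requested CPU mask, p->user_cpus_ptr. force_compatible_cpus_allowed_ptr() stashes that mask and narrows the effective affinity to task_cpu_possible_mask(p) (falling back to the cpuset hierarchy if the intersection is empty), and relax_compatible_cpus_allowed_ptr() later tries to restore it and drops the copy. What follows is a minimal standalone model of that bookkeeping only, not kernel code: the task_model struct, the *_model helpers and the 8-bit mask are inventions for illustration, and the locking, cpuset and error handling of the real patch are omitted.

#include <stdio.h>
#include <stdlib.h>

typedef struct { unsigned long bits; } cpumask_model;

struct task_model {
	cpumask_model cpus_mask;       /* effective affinity (models p->cpus_mask) */
	cpumask_model *user_cpus_ptr;  /* saved user-requested mask, if any */
};

/* Model of force_compatible_cpus_allowed_ptr(): save the user mask once,
 * then restrict the effective mask to the CPUs the task can possibly use. */
static void force_compatible_model(struct task_model *p, unsigned long possible)
{
	if (!p->user_cpus_ptr) {
		p->user_cpus_ptr = malloc(sizeof(*p->user_cpus_ptr));
		if (!p->user_cpus_ptr)
			return;                    /* models the -ENOMEM bail-out */
		*p->user_cpus_ptr = p->cpus_mask;
	}

	if (p->cpus_mask.bits & possible)
		p->cpus_mask.bits &= possible;
	else
		p->cpus_mask.bits = possible;      /* empty intersection: override */
}

/* Model of relax_compatible_cpus_allowed_ptr(): restore and drop the copy. */
static void relax_compatible_model(struct task_model *p)
{
	if (p->user_cpus_ptr) {
		p->cpus_mask = *p->user_cpus_ptr;
		free(p->user_cpus_ptr);
		p->user_cpus_ptr = NULL;
	}
}

int main(void)
{
	/* 8 CPUs, task initially allowed on all of them */
	struct task_model t = { .cpus_mask = { 0xff }, .user_cpus_ptr = NULL };

	force_compatible_model(&t, 0x0f);      /* pretend only CPUs 0-3 are compatible */
	printf("restricted: %#lx (saved %#lx)\n",
	       t.cpus_mask.bits, t.user_cpus_ptr ? t.user_cpus_ptr->bits : 0UL);

	relax_compatible_model(&t);
	printf("restored:   %#lx\n", t.cpus_mask.bits);
	return 0;
}

The patch itself additionally adds dup_user_cpus_ptr() and release_user_cpus_ptr() so the saved mask can be duplicated for a new task and freed again later, and it threads the SCA_USER flag through __set_cpus_allowed_ptr() so an explicit sched_setaffinity() clears any stale saved copy.
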
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20ffcc044134..c4462c454ab9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -237,9 +237,30 @@ static DEFINE_MUTEX(sched_core_mutex);  static atomic_t sched_core_count;  static struct cpumask sched_core_mask; +static void sched_core_lock(int cpu, unsigned long *flags) +{ +	const struct cpumask *smt_mask = cpu_smt_mask(cpu); +	int t, i = 0; + +	local_irq_save(*flags); +	for_each_cpu(t, smt_mask) +		raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); +} + +static void sched_core_unlock(int cpu, unsigned long *flags) +{ +	const struct cpumask *smt_mask = cpu_smt_mask(cpu); +	int t; + +	for_each_cpu(t, smt_mask) +		raw_spin_unlock(&cpu_rq(t)->__lock); +	local_irq_restore(*flags); +} +  static void __sched_core_flip(bool enabled)  { -	int cpu, t, i; +	unsigned long flags; +	int cpu, t;  	cpus_read_lock(); @@ -250,19 +271,12 @@ static void __sched_core_flip(bool enabled)  	for_each_cpu(cpu, &sched_core_mask) {  		const struct cpumask *smt_mask = cpu_smt_mask(cpu); -		i = 0; -		local_irq_disable(); -		for_each_cpu(t, smt_mask) { -			/* supports up to SMT8 */ -			raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); -		} +		sched_core_lock(cpu, &flags);  		for_each_cpu(t, smt_mask)  			cpu_rq(t)->core_enabled = enabled; -		for_each_cpu(t, smt_mask) -			raw_spin_unlock(&cpu_rq(t)->__lock); -		local_irq_enable(); +		sched_core_unlock(cpu, &flags);  		cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);  	} @@ -993,6 +1007,7 @@ int get_nohz_timer_target(void)  {  	int i, cpu = smp_processor_id(), default_cpu = -1;  	struct sched_domain *sd; +	const struct cpumask *hk_mask;  	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {  		if (!idle_cpu(cpu)) @@ -1000,10 +1015,11 @@ int get_nohz_timer_target(void)  		default_cpu = cpu;  	} +	hk_mask = housekeeping_cpumask(HK_FLAG_TIMER); +  	rcu_read_lock();  	for_each_domain(cpu, sd) { -		for_each_cpu_and(i, sched_domain_span(sd), -			housekeeping_cpumask(HK_FLAG_TIMER)) { +		for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {  			if (cpu == i)  				continue; @@ -1619,6 +1635,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)  		uclamp_rq_dec_id(rq, p, clamp_id);  } +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, +				      enum uclamp_id clamp_id) +{ +	if (!p->uclamp[clamp_id].active) +		return; + +	uclamp_rq_dec_id(rq, p, clamp_id); +	uclamp_rq_inc_id(rq, p, clamp_id); + +	/* +	 * Make sure to clear the idle flag if we've transiently reached 0 +	 * active tasks on rq. +	 */ +	if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) +		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} +  static inline void  uclamp_update_active(struct task_struct *p)  { @@ -1642,12 +1675,8 @@ uclamp_update_active(struct task_struct *p)  	 * affecting a valid clamp bucket, the next time it's enqueued,  	 * it will already see the updated clamp bucket value.  	 */ -	for_each_clamp_id(clamp_id) { -		if (p->uclamp[clamp_id].active) { -			uclamp_rq_dec_id(rq, p, clamp_id); -			uclamp_rq_inc_id(rq, p, clamp_id); -		} -	} +	for_each_clamp_id(clamp_id) +		uclamp_rq_reinc_id(rq, p, clamp_id);  	task_rq_unlock(rq, p, &rf);  } @@ -2161,7 +2190,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)  	/* Non kernel threads are not allowed during either online or offline. */  	if (!(p->flags & PF_KTHREAD)) -		return cpu_active(cpu); +		return cpu_active(cpu) && task_cpu_possible(cpu, p);  	/* KTHREAD_IS_PER_CPU is always allowed. 
*/  	if (kthread_is_per_cpu(p)) @@ -2468,6 +2497,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)  	__do_set_cpus_allowed(p, new_mask, 0);  } +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, +		      int node) +{ +	if (!src->user_cpus_ptr) +		return 0; + +	dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); +	if (!dst->user_cpus_ptr) +		return -ENOMEM; + +	cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); +	return 0; +} + +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) +{ +	struct cpumask *user_mask = NULL; + +	swap(p->user_cpus_ptr, user_mask); + +	return user_mask; +} + +void release_user_cpus_ptr(struct task_struct *p) +{ +	kfree(clear_user_cpus_ptr(p)); +} +  /*   * This function is wildly self concurrent; here be dragons.   * @@ -2685,28 +2742,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag  }  /* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. + * Called with both p->pi_lock and rq->lock held; drops both before returning.   */ -static int __set_cpus_allowed_ptr(struct task_struct *p, -				  const struct cpumask *new_mask, -				  u32 flags) +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, +					 const struct cpumask *new_mask, +					 u32 flags, +					 struct rq *rq, +					 struct rq_flags *rf) +	__releases(rq->lock) +	__releases(p->pi_lock)  { +	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);  	const struct cpumask *cpu_valid_mask = cpu_active_mask; +	bool kthread = p->flags & PF_KTHREAD; +	struct cpumask *user_mask = NULL;  	unsigned int dest_cpu; -	struct rq_flags rf; -	struct rq *rq;  	int ret = 0; -	rq = task_rq_lock(p, &rf);  	update_rq_clock(rq); -	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { +	if (kthread || is_migration_disabled(p)) {  		/*  		 * Kernel threads are allowed on online && !active CPUs,  		 * however, during cpu-hot-unplug, even these might get pushed @@ -2720,6 +2775,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,  		cpu_valid_mask = cpu_online_mask;  	} +	if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { +		ret = -EINVAL; +		goto out; +	} +  	/*  	 * Must re-check here, to close a race against __kthread_bind(),  	 * sched_setaffinity() is not guaranteed to observe the flag. @@ -2754,20 +2814,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,  	__do_set_cpus_allowed(p, new_mask, flags); -	return affine_move_task(rq, p, &rf, dest_cpu, flags); +	if (flags & SCA_USER) +		user_mask = clear_user_cpus_ptr(p); + +	ret = affine_move_task(rq, p, rf, dest_cpu, flags); + +	kfree(user_mask); + +	return ret;  out: -	task_rq_unlock(rq, p, &rf); +	task_rq_unlock(rq, p, rf);  	return ret;  } +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. 
+ */ +static int __set_cpus_allowed_ptr(struct task_struct *p, +				  const struct cpumask *new_mask, u32 flags) +{ +	struct rq_flags rf; +	struct rq *rq; + +	rq = task_rq_lock(p, &rf); +	return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf); +} +  int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)  {  	return __set_cpus_allowed_ptr(p, new_mask, 0);  }  EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +/* + * Change a given task's CPU affinity to the intersection of its current + * affinity mask and @subset_mask, writing the resulting mask to @new_mask + * and pointing @p->user_cpus_ptr to a copy of the old mask. + * If the resulting mask is empty, leave the affinity unchanged and return + * -EINVAL. + */ +static int restrict_cpus_allowed_ptr(struct task_struct *p, +				     struct cpumask *new_mask, +				     const struct cpumask *subset_mask) +{ +	struct cpumask *user_mask = NULL; +	struct rq_flags rf; +	struct rq *rq; +	int err; + +	if (!p->user_cpus_ptr) { +		user_mask = kmalloc(cpumask_size(), GFP_KERNEL); +		if (!user_mask) +			return -ENOMEM; +	} + +	rq = task_rq_lock(p, &rf); + +	/* +	 * Forcefully restricting the affinity of a deadline task is +	 * likely to cause problems, so fail and noisily override the +	 * mask entirely. +	 */ +	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { +		err = -EPERM; +		goto err_unlock; +	} + +	if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { +		err = -EINVAL; +		goto err_unlock; +	} + +	/* +	 * We're about to butcher the task affinity, so keep track of what +	 * the user asked for in case we're able to restore it later on. +	 */ +	if (user_mask) { +		cpumask_copy(user_mask, p->cpus_ptr); +		p->user_cpus_ptr = user_mask; +	} + +	return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf); + +err_unlock: +	task_rq_unlock(rq, p, &rf); +	kfree(user_mask); +	return err; +} + +/* + * Restrict the CPU affinity of task @p so that it is a subset of + * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the + * old affinity mask. If the resulting mask is empty, we warn and walk + * up the cpuset hierarchy until we find a suitable mask. + */ +void force_compatible_cpus_allowed_ptr(struct task_struct *p) +{ +	cpumask_var_t new_mask; +	const struct cpumask *override_mask = task_cpu_possible_mask(p); + +	alloc_cpumask_var(&new_mask, GFP_KERNEL); + +	/* +	 * __migrate_task() can fail silently in the face of concurrent +	 * offlining of the chosen destination CPU, so take the hotplug +	 * lock to ensure that the migration succeeds. +	 */ +	cpus_read_lock(); +	if (!cpumask_available(new_mask)) +		goto out_set_mask; + +	if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) +		goto out_free_mask; + +	/* +	 * We failed to find a valid subset of the affinity mask for the +	 * task, so override it based on its cpuset hierarchy. +	 */ +	cpuset_cpus_allowed(p, new_mask); +	override_mask = new_mask; + +out_set_mask: +	if (printk_ratelimit()) { +		printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", +				task_pid_nr(p), p->comm, +				cpumask_pr_args(override_mask)); +	} + +	WARN_ON(set_cpus_allowed_ptr(p, override_mask)); +out_free_mask: +	cpus_read_unlock(); +	free_cpumask_var(new_mask); +} + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); + +/* + * Restore the affinity of a task @p which was previously restricted by a + * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) + * @p->user_cpus_ptr. 
+ * + * It is the caller's responsibility to serialise this with any calls to + * force_compatible_cpus_allowed_ptr(@p). + */ +void relax_compatible_cpus_allowed_ptr(struct task_struct *p) +{ +	struct cpumask *user_mask = p->user_cpus_ptr; +	unsigned long flags; + +	/* +	 * Try to restore the old affinity mask. If this fails, then +	 * we free the mask explicitly to avoid it being inherited across +	 * a subsequent fork(). +	 */ +	if (!user_mask || !__sched_setaffinity(p, user_mask)) +		return; + +	raw_spin_lock_irqsave(&p->pi_lock, flags); +	user_mask = clear_user_cpus_ptr(p); +	raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +	kfree(user_mask); +} +  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  {  #ifdef CONFIG_SCHED_DEBUG @@ -3112,9 +3330,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  		/* Look for allowed, online CPU in same node. */  		for_each_cpu(dest_cpu, nodemask) { -			if (!cpu_active(dest_cpu)) -				continue; -			if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) +			if (is_cpu_allowed(p, dest_cpu))  				return dest_cpu;  		}  	} @@ -3131,8 +3347,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  		/* No more Mr. Nice Guy. */  		switch (state) {  		case cpuset: -			if (IS_ENABLED(CONFIG_CPUSETS)) { -				cpuset_cpus_allowed_fallback(p); +			if (cpuset_cpus_allowed_fallback(p)) {  				state = possible;  				break;  			} @@ -3144,10 +3359,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  			 *  			 * More yuck to audit.  			 */ -			do_set_cpus_allowed(p, cpu_possible_mask); +			do_set_cpus_allowed(p, task_cpu_possible_mask(p));  			state = fail;  			break; -  		case fail:  			BUG();  			break; @@ -3562,6 +3776,55 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)  }  /* + * Invoked from try_to_wake_up() to check whether the task can be woken up. + * + * The caller holds p::pi_lock if p != current or has preemption + * disabled when p == current. + * + * The rules of PREEMPT_RT saved_state: + * + *   The related locking code always holds p::pi_lock when updating + *   p::saved_state, which means the code is fully serialized in both cases. + * + *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other + *   bits set. This allows to distinguish all wakeup scenarios. + */ +static __always_inline +bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) +{ +	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { +		WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && +			     state != TASK_RTLOCK_WAIT); +	} + +	if (READ_ONCE(p->__state) & state) { +		*success = 1; +		return true; +	} + +#ifdef CONFIG_PREEMPT_RT +	/* +	 * Saved state preserves the task state across blocking on +	 * an RT lock.  If the state matches, set p::saved_state to +	 * TASK_RUNNING, but do not wake the task because it waits +	 * for a lock wakeup. Also indicate success because from +	 * the regular waker's point of view this has succeeded. +	 * +	 * After acquiring the lock the task will restore p::__state +	 * from p::saved_state which ensures that the regular +	 * wakeup is not lost. The restore will also set +	 * p::saved_state to TASK_RUNNING so any further tests will +	 * not result in false positives vs. @success +	 */ +	if (p->saved_state & state) { +		p->saved_state = TASK_RUNNING; +		*success = 1; +	} +#endif +	return false; +} + +/*   * Notes on Program-Order guarantees on SMP systems.   
*   *  MIGRATION @@ -3700,10 +3963,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  		 *  - we're serialized against set_special_state() by virtue of  		 *    it disabling IRQs (this allows not taking ->pi_lock).  		 */ -		if (!(READ_ONCE(p->__state) & state)) +		if (!ttwu_state_match(p, state, &success))  			goto out; -		success = 1;  		trace_sched_waking(p);  		WRITE_ONCE(p->__state, TASK_RUNNING);  		trace_sched_wakeup(p); @@ -3718,14 +3980,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	 */  	raw_spin_lock_irqsave(&p->pi_lock, flags);  	smp_mb__after_spinlock(); -	if (!(READ_ONCE(p->__state) & state)) +	if (!ttwu_state_match(p, state, &success))  		goto unlock;  	trace_sched_waking(p); -	/* We're going to change ->state: */ -	success = 1; -  	/*  	 * Ensure we load p->on_rq _after_ p->state, otherwise it would  	 * be possible to, falsely, observe p->on_rq == 0 and get stuck @@ -5660,11 +5919,9 @@ static bool try_steal_cookie(int this, int that)  		if (p->core_occupation > dst->idle->core_occupation)  			goto next; -		p->on_rq = TASK_ON_RQ_MIGRATING;  		deactivate_task(src, p, 0);  		set_task_cpu(p, this);  		activate_task(dst, p, 0); -		p->on_rq = TASK_ON_RQ_QUEUED;  		resched_curr(dst); @@ -5736,35 +5993,109 @@ void queue_core_balance(struct rq *rq)  	queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);  } -static inline void sched_core_cpu_starting(unsigned int cpu) +static void sched_core_cpu_starting(unsigned int cpu)  {  	const struct cpumask *smt_mask = cpu_smt_mask(cpu); -	struct rq *rq, *core_rq = NULL; -	int i; +	struct rq *rq = cpu_rq(cpu), *core_rq = NULL; +	unsigned long flags; +	int t; -	core_rq = cpu_rq(cpu)->core; +	sched_core_lock(cpu, &flags); -	if (!core_rq) { -		for_each_cpu(i, smt_mask) { -			rq = cpu_rq(i); -			if (rq->core && rq->core == rq) -				core_rq = rq; +	WARN_ON_ONCE(rq->core != rq); + +	/* if we're the first, we'll be our own leader */ +	if (cpumask_weight(smt_mask) == 1) +		goto unlock; + +	/* find the leader */ +	for_each_cpu(t, smt_mask) { +		if (t == cpu) +			continue; +		rq = cpu_rq(t); +		if (rq->core == rq) { +			core_rq = rq; +			break;  		} +	} -		if (!core_rq) -			core_rq = cpu_rq(cpu); +	if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ +		goto unlock; -		for_each_cpu(i, smt_mask) { -			rq = cpu_rq(i); +	/* install and validate core_rq */ +	for_each_cpu(t, smt_mask) { +		rq = cpu_rq(t); -			WARN_ON_ONCE(rq->core && rq->core != core_rq); +		if (t == cpu)  			rq->core = core_rq; -		} + +		WARN_ON_ONCE(rq->core != core_rq); +	} + +unlock: +	sched_core_unlock(cpu, &flags); +} + +static void sched_core_cpu_deactivate(unsigned int cpu) +{ +	const struct cpumask *smt_mask = cpu_smt_mask(cpu); +	struct rq *rq = cpu_rq(cpu), *core_rq = NULL; +	unsigned long flags; +	int t; + +	sched_core_lock(cpu, &flags); + +	/* if we're the last man standing, nothing to do */ +	if (cpumask_weight(smt_mask) == 1) { +		WARN_ON_ONCE(rq->core != rq); +		goto unlock; +	} + +	/* if we're not the leader, nothing to do */ +	if (rq->core != rq) +		goto unlock; + +	/* find a new leader */ +	for_each_cpu(t, smt_mask) { +		if (t == cpu) +			continue; +		core_rq = cpu_rq(t); +		break;  	} + +	if (WARN_ON_ONCE(!core_rq)) /* impossible */ +		goto unlock; + +	/* copy the shared state to the new leader */ +	core_rq->core_task_seq      = rq->core_task_seq; +	core_rq->core_pick_seq      = rq->core_pick_seq; +	core_rq->core_cookie        = rq->core_cookie; +	core_rq->core_forceidle     = 
rq->core_forceidle; +	core_rq->core_forceidle_seq = rq->core_forceidle_seq; + +	/* install new leader */ +	for_each_cpu(t, smt_mask) { +		rq = cpu_rq(t); +		rq->core = core_rq; +	} + +unlock: +	sched_core_unlock(cpu, &flags);  } + +static inline void sched_core_cpu_dying(unsigned int cpu) +{ +	struct rq *rq = cpu_rq(cpu); + +	if (rq->core != rq) +		rq->core = rq; +} +  #else /* !CONFIG_SCHED_CORE */  static inline void sched_core_cpu_starting(unsigned int cpu) {} +static inline void sched_core_cpu_deactivate(unsigned int cpu) {} +static inline void sched_core_cpu_dying(unsigned int cpu) {}  static struct task_struct *  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) @@ -5775,6 +6106,24 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  #endif /* CONFIG_SCHED_CORE */  /* + * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a + * preemption from blocking on an 'sleeping' spin/rwlock. Note that + * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to + * optimize the AND operation out and just check for zero. + */ +#define SM_NONE			0x0 +#define SM_PREEMPT		0x1 +#define SM_RTLOCK_WAIT		0x2 + +#ifndef CONFIG_PREEMPT_RT +# define SM_MASK_PREEMPT	(~0U) +#else +# define SM_MASK_PREEMPT	SM_PREEMPT +#endif + +/*   * __schedule() is the main scheduler function.   *   * The main means of driving the scheduler and thus entering this function are: @@ -5813,7 +6162,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)   *   * WARNING: must be called with preemption disabled!   */ -static void __sched notrace __schedule(bool preempt) +static void __sched notrace __schedule(unsigned int sched_mode)  {  	struct task_struct *prev, *next;  	unsigned long *switch_count; @@ -5826,13 +6175,13 @@ static void __sched notrace __schedule(bool preempt)  	rq = cpu_rq(cpu);  	prev = rq->curr; -	schedule_debug(prev, preempt); +	schedule_debug(prev, !!sched_mode);  	if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))  		hrtick_clear(rq);  	local_irq_disable(); -	rcu_note_context_switch(preempt); +	rcu_note_context_switch(!!sched_mode);  	/*  	 * Make sure that signal_pending_state()->signal_pending() below @@ -5866,7 +6215,7 @@ static void __sched notrace __schedule(bool preempt)  	 *  - ptrace_{,un}freeze_traced() can change ->state underneath us.  	 
*/  	prev_state = READ_ONCE(prev->__state); -	if (!preempt && prev_state) { +	if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {  		if (signal_pending_state(prev_state, prev)) {  			WRITE_ONCE(prev->__state, TASK_RUNNING);  		} else { @@ -5932,7 +6281,7 @@ static void __sched notrace __schedule(bool preempt)  		migrate_disable_switch(rq, prev);  		psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -		trace_sched_switch(preempt, prev, next); +		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);  		/* Also unlocks the rq: */  		rq = context_switch(rq, prev, next, &rf); @@ -5953,7 +6302,7 @@ void __noreturn do_task_dead(void)  	/* Tell freezer to ignore us: */  	current->flags |= PF_NOFREEZE; -	__schedule(false); +	__schedule(SM_NONE);  	BUG();  	/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ @@ -6014,7 +6363,7 @@ asmlinkage __visible void __sched schedule(void)  	sched_submit_work(tsk);  	do {  		preempt_disable(); -		__schedule(false); +		__schedule(SM_NONE);  		sched_preempt_enable_no_resched();  	} while (need_resched());  	sched_update_worker(tsk); @@ -6042,7 +6391,7 @@ void __sched schedule_idle(void)  	 */  	WARN_ON_ONCE(current->__state);  	do { -		__schedule(false); +		__schedule(SM_NONE);  	} while (need_resched());  } @@ -6077,6 +6426,18 @@ void __sched schedule_preempt_disabled(void)  	preempt_disable();  } +#ifdef CONFIG_PREEMPT_RT +void __sched notrace schedule_rtlock(void) +{ +	do { +		preempt_disable(); +		__schedule(SM_RTLOCK_WAIT); +		sched_preempt_enable_no_resched(); +	} while (need_resched()); +} +NOKPROBE_SYMBOL(schedule_rtlock); +#endif +  static void __sched notrace preempt_schedule_common(void)  {  	do { @@ -6095,7 +6456,7 @@ static void __sched notrace preempt_schedule_common(void)  		 */  		preempt_disable_notrace();  		preempt_latency_start(1); -		__schedule(true); +		__schedule(SM_PREEMPT);  		preempt_latency_stop(1);  		preempt_enable_no_resched_notrace(); @@ -6174,7 +6535,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)  		 * an infinite recursion.  		 */  		prev_ctx = exception_enter(); -		__schedule(true); +		__schedule(SM_PREEMPT);  		exception_exit(prev_ctx);  		preempt_latency_stop(1); @@ -6323,7 +6684,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)  	do {  		preempt_disable();  		local_irq_enable(); -		__schedule(true); +		__schedule(SM_PREEMPT);  		local_irq_disable();  		sched_preempt_enable_no_resched();  	} while (need_resched()); @@ -7300,6 +7661,16 @@ err_size:  	return -E2BIG;  } +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ +	if (task_has_dl_policy(p)) +		__getparam_dl(p, attr); +	else if (task_has_rt_policy(p)) +		attr->sched_priority = p->rt_priority; +	else +		attr->sched_nice = task_nice(p); +} +  /**   * sys_sched_setscheduler - set/change the scheduler policy and RT priority   * @pid: the pid in question. 
@@ -7361,6 +7732,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,  	rcu_read_unlock();  	if (likely(p)) { +		if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) +			get_params(p, &attr);  		retval = sched_setattr(p, &attr);  		put_task_struct(p);  	} @@ -7509,12 +7882,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,  	kattr.sched_policy = p->policy;  	if (p->sched_reset_on_fork)  		kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -	if (task_has_dl_policy(p)) -		__getparam_dl(p, &kattr); -	else if (task_has_rt_policy(p)) -		kattr.sched_priority = p->rt_priority; -	else -		kattr.sched_nice = task_nice(p); +	get_params(p, &kattr); +	kattr.sched_flags &= SCHED_FLAG_ALL;  #ifdef CONFIG_UCLAMP_TASK  	/* @@ -7535,9 +7904,76 @@ out_unlock:  	return retval;  } -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +#ifdef CONFIG_SMP +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)  { +	int ret = 0; + +	/* +	 * If the task isn't a deadline task or admission control is +	 * disabled then we don't care about affinity changes. +	 */ +	if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) +		return 0; + +	/* +	 * Since bandwidth control happens on root_domain basis, +	 * if admission test is enabled, we only admit -deadline +	 * tasks allowed to run on all the CPUs in the task's +	 * root_domain. +	 */ +	rcu_read_lock(); +	if (!cpumask_subset(task_rq(p)->rd->span, mask)) +		ret = -EBUSY; +	rcu_read_unlock(); +	return ret; +} +#endif + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) +{ +	int retval;  	cpumask_var_t cpus_allowed, new_mask; + +	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) +		return -ENOMEM; + +	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { +		retval = -ENOMEM; +		goto out_free_cpus_allowed; +	} + +	cpuset_cpus_allowed(p, cpus_allowed); +	cpumask_and(new_mask, mask, cpus_allowed); + +	retval = dl_task_check_affinity(p, new_mask); +	if (retval) +		goto out_free_new_mask; +again: +	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); +	if (retval) +		goto out_free_new_mask; + +	cpuset_cpus_allowed(p, cpus_allowed); +	if (!cpumask_subset(new_mask, cpus_allowed)) { +		/* +		 * We must have raced with a concurrent cpuset update. +		 * Just reset the cpumask to the cpuset's cpus_allowed. 
+		 */ +		cpumask_copy(new_mask, cpus_allowed); +		goto again; +	} + +out_free_new_mask: +	free_cpumask_var(new_mask); +out_free_cpus_allowed: +	free_cpumask_var(cpus_allowed); +	return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{  	struct task_struct *p;  	int retval; @@ -7557,68 +7993,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  		retval = -EINVAL;  		goto out_put_task;  	} -	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -		retval = -ENOMEM; -		goto out_put_task; -	} -	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -		retval = -ENOMEM; -		goto out_free_cpus_allowed; -	} -	retval = -EPERM; +  	if (!check_same_owner(p)) {  		rcu_read_lock();  		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {  			rcu_read_unlock(); -			goto out_free_new_mask; +			retval = -EPERM; +			goto out_put_task;  		}  		rcu_read_unlock();  	}  	retval = security_task_setscheduler(p);  	if (retval) -		goto out_free_new_mask; - - -	cpuset_cpus_allowed(p, cpus_allowed); -	cpumask_and(new_mask, in_mask, cpus_allowed); - -	/* -	 * Since bandwidth control happens on root_domain basis, -	 * if admission test is enabled, we only admit -deadline -	 * tasks allowed to run on all the CPUs in the task's -	 * root_domain. -	 */ -#ifdef CONFIG_SMP -	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { -		rcu_read_lock(); -		if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { -			retval = -EBUSY; -			rcu_read_unlock(); -			goto out_free_new_mask; -		} -		rcu_read_unlock(); -	} -#endif -again: -	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); +		goto out_put_task; -	if (!retval) { -		cpuset_cpus_allowed(p, cpus_allowed); -		if (!cpumask_subset(new_mask, cpus_allowed)) { -			/* -			 * We must have raced with a concurrent cpuset -			 * update. Just reset the cpus_allowed to the -			 * cpuset's cpus_allowed -			 */ -			cpumask_copy(new_mask, cpus_allowed); -			goto again; -		} -	} -out_free_new_mask: -	free_cpumask_var(new_mask); -out_free_cpus_allowed: -	free_cpumask_var(cpus_allowed); +	retval = __sched_setaffinity(p, in_mask);  out_put_task:  	put_task_struct(p);  	return retval; @@ -7761,6 +8151,17 @@ int __sched __cond_resched(void)  		preempt_schedule_common();  		return 1;  	} +	/* +	 * In preemptible kernels, ->rcu_read_lock_nesting tells the tick +	 * whether the current CPU is in an RCU read-side critical section, +	 * so the tick can report quiescent states even for CPUs looping +	 * in kernel context.  In contrast, in non-preemptible kernels, +	 * RCU readers leave no in-memory hints, which means that CPU-bound +	 * processes executing in kernel context might never report an +	 * RCU quiescent state.  Therefore, the following code causes +	 * cond_resched() to report a quiescent state, but only when RCU +	 * is in urgent need of one. 
+	 */  #ifndef CONFIG_PREEMPT_RCU  	rcu_all_qs();  #endif @@ -8707,6 +9108,8 @@ int sched_cpu_deactivate(unsigned int cpu)  	 */  	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)  		static_branch_dec_cpuslocked(&sched_smt_present); + +	sched_core_cpu_deactivate(cpu);  #endif  	if (!sched_smp_initialized) @@ -8811,6 +9214,7 @@ int sched_cpu_dying(unsigned int cpu)  	calc_load_migrate(rq);  	update_max_interval();  	hrtick_clear(rq); +	sched_core_cpu_dying(cpu);  	return 0;  }  #endif @@ -9022,7 +9426,7 @@ void __init sched_init(void)  		atomic_set(&rq->nr_iowait, 0);  #ifdef CONFIG_SCHED_CORE -		rq->core = NULL; +		rq->core = rq;  		rq->core_pick = NULL;  		rq->core_enabled = 0;  		rq->core_tree = RB_ROOT; @@ -9804,7 +10208,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,  	 * Prevent race between setting of cfs_rq->runtime_enabled and  	 * unthrottle_offline_cfs_rqs().  	 */ -	get_online_cpus(); +	cpus_read_lock();  	mutex_lock(&cfs_constraints_mutex);  	ret = __cfs_schedulable(tg, period, quota);  	if (ret) @@ -9848,7 +10252,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,  		cfs_bandwidth_usage_dec();  out_unlock:  	mutex_unlock(&cfs_constraints_mutex); -	put_online_cpus(); +	cpus_read_unlock();  	return ret;  } @@ -10099,6 +10503,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,  }  #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, +			       struct cftype *cft) +{ +	return css_tg(css)->idle; +} + +static int cpu_idle_write_s64(struct cgroup_subsys_state *css, +				struct cftype *cft, s64 idle) +{ +	return sched_group_set_idle(css_tg(css), idle); +} +#endif +  static struct cftype cpu_legacy_files[] = {  #ifdef CONFIG_FAIR_GROUP_SCHED  	{ @@ -10106,6 +10524,11 @@ static struct cftype cpu_legacy_files[] = {  		.read_u64 = cpu_shares_read_u64,  		.write_u64 = cpu_shares_write_u64,  	}, +	{ +		.name = "idle", +		.read_s64 = cpu_idle_read_s64, +		.write_s64 = cpu_idle_write_s64, +	},  #endif  #ifdef CONFIG_CFS_BANDWIDTH  	{ @@ -10313,6 +10736,12 @@ static struct cftype cpu_files[] = {  		.read_s64 = cpu_weight_nice_read_s64,  		.write_s64 = cpu_weight_nice_write_s64,  	}, +	{ +		.name = "idle", +		.flags = CFTYPE_NOT_ON_ROOT, +		.read_s64 = cpu_idle_read_s64, +		.write_s64 = cpu_idle_write_s64, +	},  #endif  #ifdef CONFIG_CFS_BANDWIDTH  	{  | 
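
An aside on the new __schedule() mode argument in the hunks above: the comment notes that on !PREEMPT_RT SM_MASK_PREEMPT has all bits set, so the AND degenerates into a test for a non-zero mode and the compiler can drop it, while a PREEMPT_RT build uses SM_PREEMPT so that blocking on an rtlock (SM_RTLOCK_WAIT) is not treated as a preemption. The snippet below is a standalone sketch of just that masking behaviour using the constants from the patch; counts_as_preemption() and the main() harness are illustrative additions, not kernel code.

#include <stdio.h>

/* Constants as added by the patch */
#define SM_NONE		0x0
#define SM_PREEMPT	0x1
#define SM_RTLOCK_WAIT	0x2

/* !PREEMPT_RT configuration: all bits set, so "mode & mask" reduces to
 * "mode != 0".  A PREEMPT_RT build defines this as SM_PREEMPT instead. */
#define SM_MASK_PREEMPT	(~0U)

static int counts_as_preemption(unsigned int sched_mode)
{
	return !!(sched_mode & SM_MASK_PREEMPT);
}

int main(void)
{
	printf("SM_NONE        -> %d\n", counts_as_preemption(SM_NONE));        /* 0 */
	printf("SM_PREEMPT     -> %d\n", counts_as_preemption(SM_PREEMPT));     /* 1 */
	printf("SM_RTLOCK_WAIT -> %d\n", counts_as_preemption(SM_RTLOCK_WAIT)); /* 1 on !RT, 0 on RT */
	return 0;
}
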

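In the same spirit, ttwu_state_match() in the wakeup hunks only reports a genuine wakeup when the requested state overlaps p->__state; on PREEMPT_RT a match against p->saved_state merely records the wakeup and leaves the task parked on its rtlock, to be replayed when the lock is reacquired. Below is a standalone model of that decision, using simplified stand-in state values rather than the kernel's real masks.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in state values for this sketch only */
#define TASK_RUNNING		0x0
#define TASK_INTERRUPTIBLE	0x1
#define TASK_RTLOCK_WAIT	0x8

struct task_model {
	unsigned int state;        /* models p->__state */
	unsigned int saved_state;  /* models p->saved_state (PREEMPT_RT only) */
};

static bool ttwu_state_match_model(struct task_model *p, unsigned int state,
				   int *success)
{
	if (p->state & state) {
		*success = 1;
		return true;               /* proceed with a real wakeup */
	}

	/* PREEMPT_RT: the task is blocked on an rtlock; remember the wakeup
	 * in saved_state so it is replayed once the lock is reacquired. */
	if (p->saved_state & state) {
		p->saved_state = TASK_RUNNING;
		*success = 1;
	}
	return false;                      /* no real wakeup */
}

int main(void)
{
	struct task_model p = {
		.state = TASK_RTLOCK_WAIT,          /* parked on an rtlock... */
		.saved_state = TASK_INTERRUPTIBLE,  /* ...while sleeping interruptibly */
	};
	int success = 0;
	bool wake = ttwu_state_match_model(&p, TASK_INTERRUPTIBLE, &success);

	printf("wake=%d success=%d saved_state=%#x\n", wake, success, p.saved_state);
	return 0;
}

This prints wake=0 success=1 saved_state=0: the waker sees success while the task stays blocked; once the task gets the lock it restores __state from saved_state (now TASK_RUNNING), so the regular wakeup is not lost, exactly as the comment block in the hunk describes.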