| author | Peter Zijlstra <peterz@infradead.org> | 2021-08-20 12:33:05 +0200 |
|---|---|---|
| committer | Peter Zijlstra <peterz@infradead.org> | 2021-08-20 12:33:05 +0200 |
| commit | c94d89fafa49ba70fedbb01cb52dfbbdd7dc0986 | |
| tree | ca4af4efcc3ac38dcb26ccf99acf62f7d30ebcc3 /kernel/sched/core.c | |
| parent | 7c60610d476766e128cc4284bb6349732cbd6606 | |
| parent | 234b8ab6476c5edd5262e2ff563de9498d60044a | |
Merge branch 'sched/core'
Diffstat (limited to 'kernel/sched/core.c')
| -rw-r--r-- | kernel/sched/core.c | 440 |
|---|---|---|

1 file changed, 346 insertions(+), 94 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 20ffcc044134..8dc67166aa6c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -993,6 +993,7 @@ int get_nohz_timer_target(void)
 {
 	int i, cpu = smp_processor_id(), default_cpu = -1;
 	struct sched_domain *sd;
+	const struct cpumask *hk_mask;
 
 	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
 		if (!idle_cpu(cpu))
@@ -1000,10 +1001,11 @@ int get_nohz_timer_target(void)
 		default_cpu = cpu;
 	}
 
+	hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
+
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
-		for_each_cpu_and(i, sched_domain_span(sd),
-			housekeeping_cpumask(HK_FLAG_TIMER)) {
+		for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
 			if (cpu == i)
 				continue;
 
@@ -1619,6 +1621,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 		uclamp_rq_dec_id(rq, p, clamp_id);
 }
 
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+				      enum uclamp_id clamp_id)
+{
+	if (!p->uclamp[clamp_id].active)
+		return;
+
+	uclamp_rq_dec_id(rq, p, clamp_id);
+	uclamp_rq_inc_id(rq, p, clamp_id);
+
+	/*
+	 * Make sure to clear the idle flag if we've transiently reached 0
+	 * active tasks on rq.
+	 */
+	if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
 static inline void
 uclamp_update_active(struct task_struct *p)
 {
@@ -1642,12 +1661,8 @@ uclamp_update_active(struct task_struct *p)
 	 * affecting a valid clamp bucket, the next time it's enqueued,
 	 * it will already see the updated clamp bucket value.
 	 */
-	for_each_clamp_id(clamp_id) {
-		if (p->uclamp[clamp_id].active) {
-			uclamp_rq_dec_id(rq, p, clamp_id);
-			uclamp_rq_inc_id(rq, p, clamp_id);
-		}
-	}
+	for_each_clamp_id(clamp_id)
+		uclamp_rq_reinc_id(rq, p, clamp_id);
 
 	task_rq_unlock(rq, p, &rf);
 }
@@ -2161,7 +2176,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 
 	/* Non kernel threads are not allowed during either online or offline. */
 	if (!(p->flags & PF_KTHREAD))
-		return cpu_active(cpu);
+		return cpu_active(cpu) && task_cpu_possible(cpu, p);
 
 	/* KTHREAD_IS_PER_CPU is always allowed. */
 	if (kthread_is_per_cpu(p))
@@ -2468,6 +2483,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	__do_set_cpus_allowed(p, new_mask, 0);
 }
 
+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+		      int node)
+{
+	if (!src->user_cpus_ptr)
+		return 0;
+
+	dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+	if (!dst->user_cpus_ptr)
+		return -ENOMEM;
+
+	cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+	return 0;
+}
+
+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+{
+	struct cpumask *user_mask = NULL;
+
+	swap(p->user_cpus_ptr, user_mask);
+
+	return user_mask;
+}
+
+void release_user_cpus_ptr(struct task_struct *p)
+{
+	kfree(clear_user_cpus_ptr(p));
+}
+
 /*
  * This function is wildly self concurrent; here be dragons.
  *
@@ -2685,28 +2728,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 }
 
 /*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
  */
-static int __set_cpus_allowed_ptr(struct task_struct *p,
-				  const struct cpumask *new_mask,
-				  u32 flags)
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+					 const struct cpumask *new_mask,
+					 u32 flags,
+					 struct rq *rq,
+					 struct rq_flags *rf)
+	__releases(rq->lock)
+	__releases(p->pi_lock)
 {
+	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
 	const struct cpumask *cpu_valid_mask = cpu_active_mask;
+	bool kthread = p->flags & PF_KTHREAD;
+	struct cpumask *user_mask = NULL;
 	unsigned int dest_cpu;
-	struct rq_flags rf;
-	struct rq *rq;
 	int ret = 0;
 
-	rq = task_rq_lock(p, &rf);
 	update_rq_clock(rq);
 
-	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+	if (kthread || is_migration_disabled(p)) {
 		/*
 		 * Kernel threads are allowed on online && !active CPUs,
 		 * however, during cpu-hot-unplug, even these might get pushed
@@ -2720,6 +2761,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 		cpu_valid_mask = cpu_online_mask;
 	}
 
+	if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	/*
 	 * Must re-check here, to close a race against __kthread_bind(),
 	 * sched_setaffinity() is not guaranteed to observe the flag.
@@ -2754,20 +2800,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 
 	__do_set_cpus_allowed(p, new_mask, flags);
 
-	return affine_move_task(rq, p, &rf, dest_cpu, flags);
+	if (flags & SCA_USER)
+		user_mask = clear_user_cpus_ptr(p);
+
+	ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+
+	kfree(user_mask);
+
+	return ret;
 
 out:
-	task_rq_unlock(rq, p, &rf);
+	task_rq_unlock(rq, p, rf);
 
 	return ret;
 }
 
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, u32 flags)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+
+	rq = task_rq_lock(p, &rf);
+	return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+}
+
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
 	return __set_cpus_allowed_ptr(p, new_mask, 0);
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
+/*
+ * Change a given task's CPU affinity to the intersection of its current
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask
+ * and pointing @p->user_cpus_ptr to a copy of the old mask.
+ * If the resulting mask is empty, leave the affinity unchanged and return
+ * -EINVAL.
+ */
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
+				     struct cpumask *new_mask,
+				     const struct cpumask *subset_mask)
+{
+	struct cpumask *user_mask = NULL;
+	struct rq_flags rf;
+	struct rq *rq;
+	int err;
+
+	if (!p->user_cpus_ptr) {
+		user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+		if (!user_mask)
+			return -ENOMEM;
+	}
+
+	rq = task_rq_lock(p, &rf);
+
+	/*
+	 * Forcefully restricting the affinity of a deadline task is
+	 * likely to cause problems, so fail and noisily override the
+	 * mask entirely.
+	 */
+	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+		err = -EPERM;
+		goto err_unlock;
+	}
+
+	if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+		err = -EINVAL;
+		goto err_unlock;
+	}
+
+	/*
+	 * We're about to butcher the task affinity, so keep track of what
+	 * the user asked for in case we're able to restore it later on.
+	 */
+	if (user_mask) {
+		cpumask_copy(user_mask, p->cpus_ptr);
+		p->user_cpus_ptr = user_mask;
+	}
+
+	return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+
+err_unlock:
+	task_rq_unlock(rq, p, &rf);
+	kfree(user_mask);
+	return err;
+}
+
+/*
+ * Restrict the CPU affinity of task @p so that it is a subset of
+ * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
+ * old affinity mask. If the resulting mask is empty, we warn and walk
+ * up the cpuset hierarchy until we find a suitable mask.
+ */
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+	cpumask_var_t new_mask;
+	const struct cpumask *override_mask = task_cpu_possible_mask(p);
+
+	alloc_cpumask_var(&new_mask, GFP_KERNEL);
+
+	/*
+	 * __migrate_task() can fail silently in the face of concurrent
+	 * offlining of the chosen destination CPU, so take the hotplug
+	 * lock to ensure that the migration succeeds.
+	 */
+	cpus_read_lock();
+	if (!cpumask_available(new_mask))
+		goto out_set_mask;
+
+	if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+		goto out_free_mask;
+
+	/*
+	 * We failed to find a valid subset of the affinity mask for the
+	 * task, so override it based on its cpuset hierarchy.
+	 */
+	cpuset_cpus_allowed(p, new_mask);
+	override_mask = new_mask;
+
+out_set_mask:
+	if (printk_ratelimit()) {
+		printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+				task_pid_nr(p), p->comm,
+				cpumask_pr_args(override_mask));
+	}
+
+	WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+out_free_mask:
+	cpus_read_unlock();
+	free_cpumask_var(new_mask);
+}
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+
+/*
+ * Restore the affinity of a task @p which was previously restricted by a
+ * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
+ * @p->user_cpus_ptr.
+ *
+ * It is the caller's responsibility to serialise this with any calls to
+ * force_compatible_cpus_allowed_ptr(@p).
+ */
+void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+	struct cpumask *user_mask = p->user_cpus_ptr;
+	unsigned long flags;
+
+	/*
+	 * Try to restore the old affinity mask. If this fails, then
+	 * we free the mask explicitly to avoid it being inherited across
+	 * a subsequent fork().
+	 */
+	if (!user_mask || !__sched_setaffinity(p, user_mask))
+		return;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	user_mask = clear_user_cpus_ptr(p);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	kfree(user_mask);
+}
+
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -3112,9 +3316,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 
 		/* Look for allowed, online CPU in same node. */
 		for_each_cpu(dest_cpu, nodemask) {
-			if (!cpu_active(dest_cpu))
-				continue;
-			if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+			if (is_cpu_allowed(p, dest_cpu))
 				return dest_cpu;
 		}
 	}
@@ -3131,8 +3333,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		/* No more Mr. Nice Guy. */
 		switch (state) {
 		case cpuset:
-			if (IS_ENABLED(CONFIG_CPUSETS)) {
-				cpuset_cpus_allowed_fallback(p);
+			if (cpuset_cpus_allowed_fallback(p)) {
 				state = possible;
 				break;
 			}
@@ -3144,10 +3345,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 			 *
 			 * More yuck to audit.
 			 */
-			do_set_cpus_allowed(p, cpu_possible_mask);
+			do_set_cpus_allowed(p, task_cpu_possible_mask(p));
 			state = fail;
 			break;
-
 		case fail:
 			BUG();
 			break;
@@ -5660,11 +5860,9 @@ static bool try_steal_cookie(int this, int that)
 		if (p->core_occupation > dst->idle->core_occupation)
 			goto next;
 
-		p->on_rq = TASK_ON_RQ_MIGRATING;
 		deactivate_task(src, p, 0);
 		set_task_cpu(p, this);
 		activate_task(dst, p, 0);
-		p->on_rq = TASK_ON_RQ_QUEUED;
 
 		resched_curr(dst);
 
@@ -7300,6 +7498,16 @@ err_size:
 	return -E2BIG;
 }
 
+static void get_params(struct task_struct *p, struct sched_attr *attr)
+{
+	if (task_has_dl_policy(p))
+		__getparam_dl(p, attr);
+	else if (task_has_rt_policy(p))
+		attr->sched_priority = p->rt_priority;
+	else
+		attr->sched_nice = task_nice(p);
+}
+
 /**
  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
  * @pid: the pid in question.
@@ -7361,6 +7569,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 	rcu_read_unlock();
 
 	if (likely(p)) {
+		if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+			get_params(p, &attr);
 		retval = sched_setattr(p, &attr);
 		put_task_struct(p);
 	}
@@ -7509,12 +7719,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	kattr.sched_policy = p->policy;
 	if (p->sched_reset_on_fork)
 		kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
-	if (task_has_dl_policy(p))
-		__getparam_dl(p, &kattr);
-	else if (task_has_rt_policy(p))
-		kattr.sched_priority = p->rt_priority;
-	else
-		kattr.sched_nice = task_nice(p);
+	get_params(p, &kattr);
+	kattr.sched_flags &= SCHED_FLAG_ALL;
 
 #ifdef CONFIG_UCLAMP_TASK
 	/*
@@ -7535,9 +7741,76 @@ out_unlock:
 	return retval;
 }
 
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+#ifdef CONFIG_SMP
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 {
+	int ret = 0;
+
+	/*
+	 * If the task isn't a deadline task or admission control is
+	 * disabled then we don't care about affinity changes.
+	 */
+	if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+		return 0;
+
+	/*
+	 * Since bandwidth control happens on root_domain basis,
+	 * if admission test is enabled, we only admit -deadline
+	 * tasks allowed to run on all the CPUs in the task's
+	 * root_domain.
+	 */
+	rcu_read_lock();
+	if (!cpumask_subset(task_rq(p)->rd->span, mask))
+		ret = -EBUSY;
+	rcu_read_unlock();
+	return ret;
+}
+#endif
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+{
+	int retval;
 	cpumask_var_t cpus_allowed, new_mask;
+
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_free_cpus_allowed;
+	}
+
+	cpuset_cpus_allowed(p, cpus_allowed);
+	cpumask_and(new_mask, mask, cpus_allowed);
+
+	retval = dl_task_check_affinity(p, new_mask);
+	if (retval)
+		goto out_free_new_mask;
+again:
+	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+	if (retval)
+		goto out_free_new_mask;
+
+	cpuset_cpus_allowed(p, cpus_allowed);
+	if (!cpumask_subset(new_mask, cpus_allowed)) {
+		/*
+		 * We must have raced with a concurrent cpuset update.
+		 * Just reset the cpumask to the cpuset's cpus_allowed.
+		 */
+		cpumask_copy(new_mask, cpus_allowed);
+		goto again;
+	}
+
+out_free_new_mask:
+	free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+	free_cpumask_var(cpus_allowed);
+	return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
 	struct task_struct *p;
 	int retval;
 
@@ -7557,68 +7830,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 		retval = -EINVAL;
 		goto out_put_task;
 	}
-	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
-		retval = -ENOMEM;
-		goto out_put_task;
-	}
-	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
-		retval = -ENOMEM;
-		goto out_free_cpus_allowed;
-	}
-	retval = -EPERM;
+
 	if (!check_same_owner(p)) {
 		rcu_read_lock();
 		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
 			rcu_read_unlock();
-			goto out_free_new_mask;
+			retval = -EPERM;
+			goto out_put_task;
 		}
 		rcu_read_unlock();
 	}
 
 	retval = security_task_setscheduler(p);
 	if (retval)
-		goto out_free_new_mask;
-
-
-	cpuset_cpus_allowed(p, cpus_allowed);
-	cpumask_and(new_mask, in_mask, cpus_allowed);
-
-	/*
-	 * Since bandwidth control happens on root_domain basis,
-	 * if admission test is enabled, we only admit -deadline
-	 * tasks allowed to run on all the CPUs in the task's
-	 * root_domain.
-	 */
-#ifdef CONFIG_SMP
-	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
-		rcu_read_lock();
-		if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
-			retval = -EBUSY;
-			rcu_read_unlock();
-			goto out_free_new_mask;
-		}
-		rcu_read_unlock();
-	}
-#endif
-again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+		goto out_put_task;
 
-	if (!retval) {
-		cpuset_cpus_allowed(p, cpus_allowed);
-		if (!cpumask_subset(new_mask, cpus_allowed)) {
-			/*
-			 * We must have raced with a concurrent cpuset
-			 * update. Just reset the cpus_allowed to the
-			 * cpuset's cpus_allowed
-			 */
-			cpumask_copy(new_mask, cpus_allowed);
-			goto again;
-		}
-	}
-out_free_new_mask:
-	free_cpumask_var(new_mask);
-out_free_cpus_allowed:
-	free_cpumask_var(cpus_allowed);
+	retval = __sched_setaffinity(p, in_mask);
 out_put_task:
 	put_task_struct(p);
 	return retval;
@@ -9804,7 +10031,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 	 * Prevent race between setting of cfs_rq->runtime_enabled and
 	 * unthrottle_offline_cfs_rqs().
 	 */
-	get_online_cpus();
+	cpus_read_lock();
 	mutex_lock(&cfs_constraints_mutex);
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
@@ -9848,7 +10075,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 		cfs_bandwidth_usage_dec();
 out_unlock:
 	mutex_unlock(&cfs_constraints_mutex);
-	put_online_cpus();
+	cpus_read_unlock();
 
 	return ret;
 }
@@ -10099,6 +10326,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	return css_tg(css)->idle;
+}
+
+static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+				struct cftype *cft, s64 idle)
+{
+	return sched_group_set_idle(css_tg(css), idle);
+}
+#endif
+
 static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -10106,6 +10347,11 @@ static struct cftype cpu_legacy_files[] = {
 		.read_u64 = cpu_shares_read_u64,
 		.write_u64 = cpu_shares_write_u64,
 	},
+	{
+		.name = "idle",
+		.read_s64 = cpu_idle_read_s64,
+		.write_s64 = cpu_idle_write_s64,
+	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
@@ -10313,6 +10559,12 @@ static struct cftype cpu_files[] = {
 		.read_s64 = cpu_weight_nice_read_s64,
 		.write_s64 = cpu_weight_nice_write_s64,
 	},
+	{
+		.name = "idle",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_idle_read_s64,
+		.write_s64 = cpu_idle_write_s64,
+	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
