Diffstat (limited to 'kernel/sched/ext.c')
 kernel/sched/ext.c | 222
 1 files changed, 123 insertions, 99 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3cd7c50a51c5..5900b06fd036 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,7 +9,6 @@
 #define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
 
 enum scx_consts {
-	SCX_SLICE_BYPASS		= SCX_SLICE_DFL / 4,
 	SCX_DSP_DFL_MAX_BATCH		= 32,
 	SCX_DSP_MAX_LOOPS		= 32,
 	SCX_WATCHDOG_MAX_TIMEOUT	= 30 * HZ,
@@ -19,6 +18,12 @@ enum scx_consts {
 	SCX_EXIT_DUMP_DFL_LEN		= 32768,
 
 	SCX_CPUPERF_ONE			= SCHED_CAPACITY_SCALE,
+
+	/*
+	 * Iterating all tasks may take a while. Periodically drop
+	 * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+	 */
+	SCX_OPS_TASK_ITER_BATCH		= 32,
 };
 
 enum scx_exit_kind {
@@ -625,6 +630,10 @@ struct sched_ext_ops {
 	/**
 	 * exit - Clean up after the BPF scheduler
 	 * @info: Exit info
+	 *
+	 * ops.exit() is also called on ops.init() failure, which is a bit
+	 * unusual. This is to allow rich reporting through @info on how
+	 * ops.init() failed.
 	 */
 	void (*exit)(struct scx_exit_info *info);
 
@@ -692,6 +701,7 @@ enum scx_enq_flags {
 	/* expose select ENQUEUE_* flags as enums */
 	SCX_ENQ_WAKEUP		= ENQUEUE_WAKEUP,
 	SCX_ENQ_HEAD		= ENQUEUE_HEAD,
+	SCX_ENQ_CPU_SELECTED	= ENQUEUE_RQ_SELECTED,
 
 	/* high 32bits are SCX specific */
 
@@ -1269,86 +1279,104 @@ struct scx_task_iter {
 	struct task_struct		*locked;
 	struct rq			*rq;
 	struct rq_flags			rf;
+	u32				cnt;
 };
 
 /**
- * scx_task_iter_init - Initialize a task iterator
+ * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
  * @iter: iterator to init
  *
- * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized,
- * @iter must eventually be exited with scx_task_iter_exit().
+ * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
+ * must eventually be stopped with scx_task_iter_stop().
  *
- * scx_tasks_lock may be released between this and the first next() call or
- * between any two next() calls. If scx_tasks_lock is released between two
- * next() calls, the caller is responsible for ensuring that the task being
- * iterated remains accessible either through RCU read lock or obtaining a
- * reference count.
+ * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
+ * between this and the first next() call or between any two next() calls. If
+ * the locks are released between two next() calls, the caller is responsible
+ * for ensuring that the task being iterated remains accessible either through
+ * RCU read lock or obtaining a reference count.
 *
 * All tasks which existed when the iteration started are guaranteed to be
 * visited as long as they still exist.
 */
-static void scx_task_iter_init(struct scx_task_iter *iter)
+static void scx_task_iter_start(struct scx_task_iter *iter)
 {
-	lockdep_assert_held(&scx_tasks_lock);
-
 	BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
 		     ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
 
+	spin_lock_irq(&scx_tasks_lock);
+
 	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
 	list_add(&iter->cursor.tasks_node, &scx_tasks);
 	iter->locked = NULL;
+	iter->cnt = 0;
+}
+
+static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
+{
+	if (iter->locked) {
+		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
+		iter->locked = NULL;
+	}
 }
 
 /**
- * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator
- * @iter: iterator to unlock rq for
+ * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
+ * @iter: iterator to unlock
 *
 * If @iter is in the middle of a locked iteration, it may be locking the rq of
- * the task currently being visited. Unlock the rq if so. This function can be
- * safely called anytime during an iteration.
+ * the task currently being visited in addition to scx_tasks_lock. Unlock both.
+ * This function can be safely called anytime during an iteration.
+ */
+static void scx_task_iter_unlock(struct scx_task_iter *iter)
+{
+	__scx_task_iter_rq_unlock(iter);
+	spin_unlock_irq(&scx_tasks_lock);
+}
+
+/**
+ * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
+ * @iter: iterator to re-lock
 *
- * Returns %true if the rq @iter was locking is unlocked. %false if @iter was
- * not locking an rq.
+ * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
+ * doesn't re-lock the rq lock. Must be called before other iterator operations.
 */
-static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
+static void scx_task_iter_relock(struct scx_task_iter *iter)
 {
-	if (iter->locked) {
-		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
-		iter->locked = NULL;
-		return true;
-	} else {
-		return false;
-	}
+	spin_lock_irq(&scx_tasks_lock);
 }
 
 /**
- * scx_task_iter_exit - Exit a task iterator
+ * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
 * @iter: iterator to exit
 *
- * Exit a previously initialized @iter. Must be called with scx_tasks_lock held.
- * If the iterator holds a task's rq lock, that rq lock is released. See
- * scx_task_iter_init() for details.
+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held
+ * which is released on return. If the iterator holds a task's rq lock, that rq
+ * lock is also released. See scx_task_iter_start() for details.
 */
-static void scx_task_iter_exit(struct scx_task_iter *iter)
+static void scx_task_iter_stop(struct scx_task_iter *iter)
 {
-	lockdep_assert_held(&scx_tasks_lock);
-
-	scx_task_iter_rq_unlock(iter);
 	list_del_init(&iter->cursor.tasks_node);
+	scx_task_iter_unlock(iter);
 }
 
 /**
 * scx_task_iter_next - Next task
 * @iter: iterator to walk
 *
- * Visit the next task. See scx_task_iter_init() for details.
+ * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
+ * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
+ * stalls by holding scx_tasks_lock for too long.
 */
 static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
 {
 	struct list_head *cursor = &iter->cursor.tasks_node;
 	struct sched_ext_entity *pos;
 
-	lockdep_assert_held(&scx_tasks_lock);
+	if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
+		scx_task_iter_unlock(iter);
+		cond_resched();
+		scx_task_iter_relock(iter);
+	}
 
 	list_for_each_entry(pos, cursor, tasks_node) {
 		if (&pos->tasks_node == &scx_tasks)
@@ -1369,14 +1397,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
 * @include_dead: Whether we should include dead tasks in the iteration
 *
 * Visit the non-idle task with its rq lock held. Allows callers to specify
- * whether they would like to filter out dead tasks. See scx_task_iter_init()
+ * whether they would like to filter out dead tasks. See scx_task_iter_start()
 * for details.
 */
 static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
 {
 	struct task_struct *p;
 
-	scx_task_iter_rq_unlock(iter);
+	__scx_task_iter_rq_unlock(iter);
 
 	while ((p = scx_task_iter_next(iter))) {
 		/*
@@ -1944,7 +1972,6 @@ static bool scx_rq_online(struct rq *rq)
 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 			    int sticky_cpu)
 {
-	bool bypassing = scx_rq_bypassing(rq);
 	struct task_struct **ddsp_taskp;
 	unsigned long qseq;
 
@@ -1962,7 +1989,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	if (!scx_rq_online(rq))
 		goto local;
 
-	if (bypassing)
+	if (scx_rq_bypassing(rq))
 		goto global;
 
 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2017,7 +2044,7 @@ local_norefill:
 
 global:
 	touch_core_sched(rq, p);	/* see the comment in local: */
-	p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL;
+	p->scx.slice = SCX_SLICE_DFL;
 	dispatch_enqueue(find_global_dsq(p), p, enq_flags);
 }
 
@@ -2953,8 +2980,8 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 		if (unlikely(!p->scx.slice)) {
 			if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
-				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
-						p->comm, p->pid);
+				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
+						p->comm, p->pid, __func__);
 				scx_warned_zero_slice = true;
 			}
 			p->scx.slice = SCX_SLICE_DFL;
@@ -3059,11 +3086,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 
 	*found = false;
 
-	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
-		scx_ops_error("built-in idle tracking is disabled");
-		return prev_cpu;
-	}
-
 	/*
 	 * If WAKE_SYNC, the waker's local DSQ is empty, and the system is
 	 * under utilized, wake up @p to the local DSQ of the waker. Checking
@@ -3128,7 +3150,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 	if (unlikely(wake_flags & WF_EXEC))
 		return prev_cpu;
 
-	if (SCX_HAS_OP(select_cpu)) {
+	if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
 		s32 cpu;
 		struct task_struct **ddsp_taskp;
 
@@ -3193,7 +3215,7 @@ void __scx_update_idle(struct rq *rq, bool idle)
 {
 	int cpu = cpu_of(rq);
 
-	if (SCX_HAS_OP(update_idle)) {
+	if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
 		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
 		if (!static_branch_unlikely(&scx_builtin_idle_enabled))
 			return;
@@ -4048,7 +4070,6 @@ static void scx_cgroup_exit(void)
 
 	percpu_rwsem_assert_held(&scx_cgroup_rwsem);
 
-	WARN_ON_ONCE(!scx_cgroup_enabled);
 	scx_cgroup_enabled = false;
 
 	/*
@@ -4117,6 +4138,7 @@ static int scx_cgroup_init(void)
 				      css->cgroup, &args);
 		if (ret) {
 			css_put(css);
+			scx_ops_error("ops.cgroup_init() failed (%d)", ret);
 			return ret;
 		}
 		tg->scx_flags |= SCX_TG_INITED;
@@ -4256,21 +4278,23 @@ bool task_should_scx(struct task_struct *p)
 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
 * to force global FIFO scheduling.
 *
- * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
- *    %SCX_OPS_ENQ_LAST is also ignored.
+ * - ops.select_cpu() is ignored and the default select_cpu() is used.
+ *
+ * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
+ *   %SCX_OPS_ENQ_LAST is also ignored.
 *
- * b. ops.dispatch() is ignored.
+ * - ops.dispatch() is ignored.
 *
- * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
- *    can't be trusted. Whenever a tick triggers, the running task is rotated to
- *    the tail of the queue with core_sched_at touched.
+ * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
+ *   can't be trusted. Whenever a tick triggers, the running task is rotated to
+ *   the tail of the queue with core_sched_at touched.
 *
- * d. pick_next_task() suppresses zero slice warning.
+ * - pick_next_task() suppresses zero slice warning.
 *
- * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
- *    operations.
+ * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
+ *   operations.
 *
- * f. scx_prio_less() reverts to the default core_sched_at order.
+ * - scx_prio_less() reverts to the default core_sched_at order.
 */
 static void scx_ops_bypass(bool bypass)
 {
@@ -4340,7 +4364,7 @@ static void scx_ops_bypass(bool bypass)
 
 		rq_unlock_irqrestore(rq, &rf);
 
-		/* kick to restore ticks */
+		/* resched to restore ticks and idle state */
 		resched_cpu(cpu);
 	}
 }
@@ -4462,16 +4486,14 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 	scx_ops_init_task_enabled = false;
 
-	spin_lock_irq(&scx_tasks_lock);
-	scx_task_iter_init(&sti);
+	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		const struct sched_class *old_class = p->sched_class;
 		struct sched_enq_and_set_ctx ctx;
 
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
-		p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
-		__setscheduler_prio(p, p->prio);
+		p->sched_class = __setscheduler_class(p, p->prio);
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
@@ -4479,8 +4501,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 		check_class_changed(task_rq(p), p, old_class, p->prio);
 		scx_ops_exit_task(p);
 	}
-	scx_task_iter_exit(&sti);
-	spin_unlock_irq(&scx_tasks_lock);
+	scx_task_iter_stop(&sti);
 	percpu_up_write(&scx_fork_rwsem);
 
 	/* no task is on scx, turn off all the switches and flush in-progress calls */
@@ -5041,6 +5062,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
 			cpus_read_unlock();
+			scx_ops_error("ops.init() failed (%d)", ret);
 			goto err_disable;
 		}
 	}
@@ -5130,8 +5152,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	if (ret)
 		goto err_disable_unlock_all;
 
-	spin_lock_irq(&scx_tasks_lock);
-	scx_task_iter_init(&sti);
+	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		/*
 		 * @p may already be dead, have lost all its usages counts and
@@ -5141,27 +5162,24 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		if (!tryget_task_struct(p))
 			continue;
 
-		scx_task_iter_rq_unlock(&sti);
-		spin_unlock_irq(&scx_tasks_lock);
+		scx_task_iter_unlock(&sti);
 
 		ret = scx_ops_init_task(p, task_group(p), false);
 		if (ret) {
 			put_task_struct(p);
-			spin_lock_irq(&scx_tasks_lock);
-			scx_task_iter_exit(&sti);
-			spin_unlock_irq(&scx_tasks_lock);
-			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
-			       ret, p->comm, p->pid);
+			scx_task_iter_relock(&sti);
+			scx_task_iter_stop(&sti);
+			scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
+				      ret, p->comm, p->pid);
 			goto err_disable_unlock_all;
 		}
 
 		scx_set_task_state(p, SCX_TASK_READY);
 		put_task_struct(p);
-		spin_lock_irq(&scx_tasks_lock);
+		scx_task_iter_relock(&sti);
 	}
-	scx_task_iter_exit(&sti);
-	spin_unlock_irq(&scx_tasks_lock);
+	scx_task_iter_stop(&sti);
 
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
@@ -5178,35 +5196,28 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	 * scx_tasks_lock.
 	 */
 	percpu_down_write(&scx_fork_rwsem);
-	spin_lock_irq(&scx_tasks_lock);
-	scx_task_iter_init(&sti);
+	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		const struct sched_class *old_class = p->sched_class;
 		struct sched_enq_and_set_ctx ctx;
 
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
-		__setscheduler_prio(p, p->prio);
+		p->scx.slice = SCX_SLICE_DFL;
+		p->sched_class = __setscheduler_class(p, p->prio);
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
 
 		check_class_changed(task_rq(p), p, old_class, p->prio);
 	}
-	scx_task_iter_exit(&sti);
-	spin_unlock_irq(&scx_tasks_lock);
+	scx_task_iter_stop(&sti);
 	percpu_up_write(&scx_fork_rwsem);
 
 	scx_ops_bypass(false);
 
-	/*
-	 * Returning an error code here would lose the recorded error
-	 * information. Exit indicating success so that the error is notified
-	 * through ops.exit() with all the details.
-	 */
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
 		WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
-		ret = 0;
 		goto err_disable;
 	}
 
@@ -5241,10 +5252,18 @@ err_disable_unlock_all:
 	scx_ops_bypass(false);
 err_disable:
 	mutex_unlock(&scx_ops_enable_mutex);
-	/* must be fully disabled before returning */
-	scx_ops_disable(SCX_EXIT_ERROR);
+	/*
+	 * Returning an error code here would not pass all the error information
+	 * to userspace. Record errno using scx_ops_error() for cases
+	 * scx_ops_error() wasn't already invoked and exit indicating success so
+	 * that the error is notified through ops.exit() with all the details.
+	 *
+	 * Flush scx_ops_disable_work to ensure that error is reported before
+	 * init completion.
+	 */
+	scx_ops_error("scx_ops_enable() failed (%d)", ret);
 	kthread_flush_work(&scx_ops_disable_work);
-	return ret;
+	return 0;
 }
 
@@ -5864,16 +5883,21 @@ __bpf_kfunc_start_defs();
 __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 				       u64 wake_flags, bool *is_idle)
 {
-	if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) {
-		*is_idle = false;
-		return prev_cpu;
+	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+		scx_ops_error("built-in idle tracking is disabled");
+		goto prev_cpu;
 	}
+
+	if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
+		goto prev_cpu;
+
 #ifdef CONFIG_SMP
 	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
-#else
+#endif
+
+prev_cpu:
 	*is_idle = false;
 	return prev_cpu;
-#endif
 }
 
 __bpf_kfunc_end_defs();
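
Note on the scx_task_iter rework above: the start/next_locked/unlock/relock/stop calling convention is easiest to see end to end. The sketch below is distilled from the scx_ops_enable() hunks; my_init_one_task() is a hypothetical stand-in for the caller's per-task work and is not part of this patch:

	struct scx_task_iter sti;
	struct task_struct *p;

	scx_task_iter_start(&sti);		/* takes scx_tasks_lock */
	while ((p = scx_task_iter_next_locked(&sti))) {
		/*
		 * @p's rq lock is held here. scx_task_iter_next() itself drops
		 * and re-takes the locks every SCX_OPS_TASK_ITER_BATCH
		 * iterations to avoid CSD/RCU stalls.
		 */
		if (!tryget_task_struct(p))
			continue;

		scx_task_iter_unlock(&sti);	/* drops rq lock and scx_tasks_lock */
		my_init_one_task(p);		/* may sleep while unlocked */
		put_task_struct(p);
		scx_task_iter_relock(&sti);	/* re-takes scx_tasks_lock only */
	}
	scx_task_iter_stop(&sti);		/* unlinks cursor, drops scx_tasks_lock */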
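
On the "ops.exit() is also called on ops.init() failure" rule and the scx_ops_error() calls added in scx_ops_enable(): a loader can now see why ops.init() failed through its exit callback instead of a bare errno. A minimal BPF-side sketch, assuming the UEI_DEFINE()/UEI_RECORD(), BPF_STRUCT_OPS*() and SCX_OPS_DEFINE() helpers shipped with the in-tree tools/sched_ext examples (helper names and availability are an assumption and may differ by kernel version):

	#include <scx/common.bpf.h>

	char _license[] SEC("license") = "GPL";

	UEI_DEFINE(uei);	/* exit info buffer read by the userspace loader */

	s32 BPF_STRUCT_OPS_SLEEPABLE(minimal_init)
	{
		/* if this fails, ops.exit() below still runs with the details */
		return scx_bpf_create_dsq(0 /* arbitrary DSQ id for the sketch */, -1);
	}

	void BPF_STRUCT_OPS(minimal_exit, struct scx_exit_info *ei)
	{
		UEI_RECORD(uei, ei);	/* captures e.g. the "ops.init() failed (%d)" message */
	}

	SCX_OPS_DEFINE(minimal_ops,
		       .init	= (void *)minimal_init,
		       .exit	= (void *)minimal_exit,
		       .name	= "minimal");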
