From cc9877fb76771b7cbce6c9ec239f13a1d7759876 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 2 Oct 2024 10:33:37 -1000
Subject: sched_ext: Improve error reporting during loading

When the BPF scheduler fails, ops.exit() allows rich error reporting
through scx_exit_info. Use the ops.exit() path consistently for all
failures which can be caused by the BPF scheduler:

- scx_ops_error() is called after ops.init() and ops.cgroup_init()
  failures to record error information.

- ops.init_task() failure now uses scx_ops_error() instead of pr_err().

- The err_disable path is updated to automatically trigger scx_ops_error()
  to cover cases where an error message hasn't already been generated, and
  to always return 0 indicating init success so that the error is reported
  through ops.exit().

Signed-off-by: Tejun Heo
Cc: David Vernet
Cc: Daniel Hodges
Cc: Changwoo Min
Cc: Andrea Righi
Cc: Dan Schatzberg
---
 kernel/sched/ext.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3cd7c50a51c5..76f04f3ace61 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -625,6 +625,10 @@ struct sched_ext_ops {
 	/**
 	 * exit - Clean up after the BPF scheduler
 	 * @info: Exit info
+	 *
+	 * ops.exit() is also called on ops.init() failure, which is a bit
+	 * unusual. This is to allow rich reporting through @info on how
+	 * ops.init() failed.
 	 */
 	void (*exit)(struct scx_exit_info *info);

@@ -4117,6 +4121,7 @@ static int scx_cgroup_init(void)
 					      css->cgroup, &args);
 		if (ret) {
 			css_put(css);
+			scx_ops_error("ops.cgroup_init() failed (%d)", ret);
 			return ret;
 		}
 		tg->scx_flags |= SCX_TG_INITED;
@@ -5041,6 +5046,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
 			cpus_read_unlock();
+			scx_ops_error("ops.init() failed (%d)", ret);
 			goto err_disable;
 		}
 	}
@@ -5150,8 +5156,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 			spin_lock_irq(&scx_tasks_lock);
 			scx_task_iter_exit(&sti);
 			spin_unlock_irq(&scx_tasks_lock);
-			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
-			       ret, p->comm, p->pid);
+			scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
+				      ret, p->comm, p->pid);
 			goto err_disable_unlock_all;
 		}

@@ -5199,14 +5205,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)

 	scx_ops_bypass(false);

-	/*
-	 * Returning an error code here would lose the recorded error
-	 * information. Exit indicating success so that the error is notified
-	 * through ops.exit() with all the details.
-	 */
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
 		WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
-		ret = 0;
 		goto err_disable;
 	}

@@ -5241,10 +5241,18 @@ err_disable_unlock_all:
 	scx_ops_bypass(false);
 err_disable:
 	mutex_unlock(&scx_ops_enable_mutex);
-	/* must be fully disabled before returning */
-	scx_ops_disable(SCX_EXIT_ERROR);
+	/*
+	 * Returning an error code here would not pass all the error information
+	 * to userspace. Record errno using scx_ops_error() for cases
+	 * scx_ops_error() wasn't already invoked and exit indicating success so
+	 * that the error is notified through ops.exit() with all the details.
+	 *
+	 * Flush scx_ops_disable_work to ensure that error is reported before
+	 * init completion.
+	 */
+	scx_ops_error("scx_ops_enable() failed (%d)", ret);
 	kthread_flush_work(&scx_ops_disable_work);
-	return ret;
+	return 0;
 }
-- cgit v1.2.3-70-g09d2

From ec010333ce7cf3270ae7193a6724794d5a179625 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 2 Oct 2024 10:34:38 -1000
Subject: sched_ext: scx_cgroup_exit() may be called without successful scx_cgroup_init()

568894edbe48 ("sched_ext: Add scx_cgroup_enabled to gate cgroup operations
and fix scx_tg_online()") assumed that scx_cgroup_exit() is only called
after scx_cgroup_init() finished successfully. This isn't true.
scx_cgroup_exit() can be called without scx_cgroup_init() being called at
all or after scx_cgroup_init() failed in the middle.

As init state is tracked per cgroup, scx_cgroup_exit() can be used safely
to clean up in all cases. Remove the incorrect WARN_ON_ONCE().

Signed-off-by: Tejun Heo
Fixes: 568894edbe48 ("sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online()")
---
 kernel/sched/ext.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 76f04f3ace61..1df2082d1352 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4052,7 +4052,6 @@ static void scx_cgroup_exit(void)

 	percpu_rwsem_assert_held(&scx_cgroup_rwsem);

-	WARN_ON_ONCE(!scx_cgroup_enabled);
 	scx_cgroup_enabled = false;

 	/*
-- cgit v1.2.3-70-g09d2

From b62933eee41e2909422c2c3d7fdb56217913faf9 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 27 Sep 2024 13:46:11 -1000
Subject: sched/core: Make select_task_rq() take the pointer to wake_flags instead of value

This will be used to allow select_task_rq() to indicate whether
->select_task_rq() was called by modifying *wake_flags.

This makes try_to_wake_up() call all functions that take wake_flags with
WF_TTWU set. Previously, only select_task_rq() was. Using the same flags
is more consistent, and, as the flag is only tested by ->select_task_rq()
implementations, it doesn't cause any behavior differences.

Signed-off-by: Tejun Heo
Acked-by: David Vernet
---
 kernel/sched/core.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 43e453ab7e20..e70b57a5693e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3518,12 +3518,12 @@ out:
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int *wake_flags)
 {
 	lockdep_assert_held(&p->pi_lock);

 	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
-		cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
+		cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
 	else
 		cpu = cpumask_any(p->cpus_ptr);

@@ -4120,6 +4120,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	guard(preempt)();
 	int cpu, success = 0;

+	wake_flags |= WF_TTWU;
+
 	if (p == current) {
 		/*
 		 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -4252,7 +4254,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		 */
 		smp_cond_load_acquire(&p->on_cpu, !VAL);

-		cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
+		cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
 		if (task_cpu(p) != cpu) {
 			if (p->in_iowait) {
 				delayacct_blkio_end(p);
@@ -4793,6 +4795,7 @@ void wake_up_new_task(struct task_struct *p)
 {
 	struct rq_flags rf;
 	struct rq *rq;
+	int wake_flags = WF_FORK;

 	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
 	WRITE_ONCE(p->__state, TASK_RUNNING);
@@ -4807,7 +4810,7 @@ void wake_up_new_task(struct task_struct *p)
 	 */
 	p->recent_used_cpu = task_cpu(p);
 	rseq_migrate(p);
-	__set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
+	__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
 #endif
 	rq = __task_rq_lock(p, &rf);
 	update_rq_clock(rq);
@@ -4815,7 +4818,7 @@ void wake_up_new_task(struct task_struct *p)
 	activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
 	trace_sched_wakeup_new(p);
-	wakeup_preempt(rq, p, WF_FORK);
+	wakeup_preempt(rq, p, wake_flags);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken) {
 		/*
-- cgit v1.2.3-70-g09d2

From f207dc2dcdcf0e1e7d260b392784855ce8d84147 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 27 Sep 2024 13:46:12 -1000
Subject: sched/core: Add ENQUEUE_RQ_SELECTED to indicate whether ->select_task_rq() was called

During ttwu, ->select_task_rq() can be skipped if only one CPU is allowed
or migration is disabled. sched_ext schedulers may perform operations such
as direct dispatch from the ->select_task_rq() path and it is useful for
them to know whether ->select_task_rq() was skipped in the ->enqueue_task()
path.

Currently, sched_ext schedulers are using ENQUEUE_WAKEUP for this purpose
and end up assuming incorrectly that ->select_task_rq() was called for
tasks that are bound to a single CPU or have migration disabled.

Make select_task_rq() indicate whether ->select_task_rq() was called by
setting WF_RQ_SELECTED in *wake_flags and make ttwu_do_activate() map that
to ENQUEUE_RQ_SELECTED for ->enqueue_task(). This will be used by sched_ext
to fix ->select_task_rq() skip detection.
Signed-off-by: Tejun Heo
Acked-by: David Vernet
---
 kernel/sched/core.c  | 8 ++++++--
 kernel/sched/sched.h | 3 +++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e70b57a5693e..aeb595514461 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3522,10 +3522,12 @@ int select_task_rq(struct task_struct *p, int cpu, int *wake_flags)
 {
 	lockdep_assert_held(&p->pi_lock);

-	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
+	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) {
 		cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
+		*wake_flags |= WF_RQ_SELECTED;
-	else
+	} else {
 		cpu = cpumask_any(p->cpus_ptr);
+	}

 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -3659,6 +3661,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 		rq->nr_uninterruptible--;

 #ifdef CONFIG_SMP
+	if (wake_flags & WF_RQ_SELECTED)
+		en_flags |= ENQUEUE_RQ_SELECTED;
 	if (wake_flags & WF_MIGRATED)
 		en_flags |= ENQUEUE_MIGRATED;
 	else
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1c3588a8f00..6085ef50febf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2292,6 +2292,7 @@ static inline int task_on_rq_migrating(struct task_struct *p)
 #define WF_SYNC			0x10 /* Waker goes to sleep after wakeup */
 #define WF_MIGRATED		0x20 /* Internal use, task got migrated */
 #define WF_CURRENT_CPU		0x40 /* Prefer to move the wakee to the current CPU. */
+#define WF_RQ_SELECTED		0x80 /* ->select_task_rq() was called */

 #ifdef CONFIG_SMP
 static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -2334,6 +2335,7 @@ extern const u32 sched_prio_to_wmult[40];
  * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
  * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
  * ENQUEUE_MIGRATED  - the task was migrated during wakeup
+ * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
  *
  */
@@ -2360,6 +2362,7 @@ extern const u32 sched_prio_to_wmult[40];
 #define ENQUEUE_INITIAL		0x80
 #define ENQUEUE_MIGRATING	0x100
 #define ENQUEUE_DELAYED		0x200
+#define ENQUEUE_RQ_SELECTED	0x400

 #define RETRY_TASK		((void *)-1UL)
-- cgit v1.2.3-70-g09d2

From 9b671793c7d95f020791415cbbcc82b9c007d19c Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 27 Sep 2024 13:46:13 -1000
Subject: sched_ext, scx_qmap: Add and use SCX_ENQ_CPU_SELECTED

scx_qmap and other schedulers in the SCX repo are using SCX_ENQ_WAKEUP to
tell whether ops.select_cpu() was called. This is incorrect as
ops.select_cpu() can be skipped in the wakeup path, which leads to, e.g.,
incorrectly skipping direct dispatch for tasks that are bound to a single
CPU.

sched core has been updated to specify ENQUEUE_RQ_SELECTED if
->select_task_rq() was called. Map it to SCX_ENQ_CPU_SELECTED and update
scx_qmap to test it instead of SCX_ENQ_WAKEUP.
Signed-off-by: Tejun Heo
Acked-by: David Vernet
Cc: Daniel Hodges
Cc: Changwoo Min
Cc: Andrea Righi
Cc: Dan Schatzberg
---
 kernel/sched/ext.c             | 1 +
 tools/sched_ext/scx_qmap.bpf.c | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1df2082d1352..410a4df8a121 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -696,6 +696,7 @@ enum scx_enq_flags {
 	/* expose select ENQUEUE_* flags as enums */
 	SCX_ENQ_WAKEUP		= ENQUEUE_WAKEUP,
 	SCX_ENQ_HEAD		= ENQUEUE_HEAD,
+	SCX_ENQ_CPU_SELECTED	= ENQUEUE_RQ_SELECTED,

 	/* high 32bits are SCX specific */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 67e2a7968cc9..5d1f880d1149 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -230,8 +230,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 		return;
 	}

-	/* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */
-	if (!(enq_flags & SCX_ENQ_WAKEUP) &&
+	/* if select_cpu() wasn't called, try direct dispatch */
+	if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
 	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
 		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
 		scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
-- cgit v1.2.3-70-g09d2
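As a quick illustration of what this series means for a BPF scheduler, below is a
minimal sketch (not part of the patches above) of a scheduler whose enqueue path
keys its direct-dispatch decision off the new SCX_ENQ_CPU_SELECTED flag and which
receives init-time failures through ops.exit(), modeled loosely on scx_qmap and
scx_simple. The names sketch_select_cpu, sketch_enqueue, sketch_exit and
sketch_ops are made up for illustration, and the sketch assumes the
tools/sched_ext headers (scx/common.bpf.h, including the UEI_* helpers) and the
scx_bpf_select_cpu_dfl()/scx_bpf_dispatch() kfuncs as they exist at this point in
the tree.

/* Illustrative sketch only -- not part of the series above. */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

UEI_DEFINE(uei);

s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool found = false;
	s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found);

	if (found)	/* idle CPU found, dispatch directly from here */
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	return cpu;
}

void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * SCX_ENQ_CPU_SELECTED is set only when ops.select_cpu() actually
	 * ran. Tasks pinned to one CPU or with migration disabled skip it,
	 * which is exactly the case the old SCX_ENQ_WAKEUP test got wrong.
	 */
	if (!(enq_flags & SCX_ENQ_CPU_SELECTED)) {
		/* select_cpu() was skipped; dispatch to the task's CPU */
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
				 SCX_SLICE_DFL, enq_flags);
		return;
	}

	/* otherwise queue on the shared global DSQ */
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(sketch_exit, struct scx_exit_info *ei)
{
	/*
	 * With the first patch above, loading failures (ops.init(),
	 * ops.cgroup_init(), ops.init_task()) are also delivered here with
	 * a populated scx_exit_info rather than only a kernel log line.
	 */
	UEI_RECORD(uei, ei);
}

SEC(".struct_ops.link")
struct sched_ext_ops sketch_ops = {
	.select_cpu	= (void *)sketch_select_cpu,
	.enqueue	= (void *)sketch_enqueue,
	.exit		= (void *)sketch_exit,
	.name		= "sketch",
};

The key detail mirrors the scx_qmap change above: the enqueue path tests
SCX_ENQ_CPU_SELECTED rather than SCX_ENQ_WAKEUP, so tasks that skipped
ops.select_cpu() (single-CPU affinity, migration disabled) are still handled
correctly, and the userspace loader can report rich error details recorded via
the UEI helpers when the scheduler fails to load.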