summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- kernel/sched/cpufreq_schedutil.c | 12
-rw-r--r-- kernel/sched/ext.c | 83
-rw-r--r-- kernel/sched/ext.h | 9
-rw-r--r-- kernel/sched/sched.h | 1
-rw-r--r-- tools/sched_ext/include/scx/common.bpf.h | 3
-rw-r--r-- tools/sched_ext/scx_qmap.bpf.c | 142
-rw-r--r-- tools/sched_ext/scx_qmap.c | 8
7 files changed, 252 insertions, 6 deletions
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 972b7dd65af2..e683e5d08daa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -197,8 +197,10 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
- unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
+ unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);
+ if (!scx_switched_all())
+ util += cpu_util_cfs_boost(sg_cpu->cpu);
util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
util = max(util, boost);
sg_cpu->bw_min = min;
@@ -330,6 +332,14 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
unsigned long idle_calls;
bool ret;
+ /*
+ * The heuristics in this function are for the fair class. For SCX, the
+ * performance target comes directly from the BPF scheduler. Let's just
+ * follow it.
+ */
+ if (scx_switched_all())
+ return false;
+
/* if capped by uclamp_max, always update to be in compliance */
if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
return false;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 390623a4a376..28f7a4266fde 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -16,6 +16,8 @@ enum scx_consts {
SCX_EXIT_BT_LEN = 64,
SCX_EXIT_MSG_LEN = 1024,
SCX_EXIT_DUMP_DFL_LEN = 32768,
+
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
};
enum scx_exit_kind {
@@ -3520,7 +3522,7 @@ DEFINE_SCHED_CLASS(ext) = {
.update_curr = update_curr_scx,
#ifdef CONFIG_UCLAMP_TASK
- .uclamp_enabled = 0,
+ .uclamp_enabled = 1,
#endif
};
@@ -4393,7 +4395,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
- int i, ret;
+ int i, cpu, ret;
mutex_lock(&scx_ops_enable_mutex);
@@ -4442,6 +4444,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
atomic_long_set(&scx_nr_rejected, 0);
+ for_each_possible_cpu(cpu)
+ cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+
/*
* Keep CPUs stable during enable so that the BPF scheduler can track
* online CPUs by watching ->on/offline_cpu() after ->init().
@@ -5836,6 +5841,77 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
}
/**
+ * scx_bpf_cpuperf_cap - Report the maximum relative capacity of a CPU
+ * @cpu: CPU of interest
+ *
+ * Returns the maximum capacity of @cpu expressed relative to the most
+ * performant CPU in the system, as a value in [1, %SCX_CPUPERF_ONE]. A @cpu
+ * rejected by ops_cpu_valid() yields %SCX_CPUPERF_ONE. See scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
+{
+	if (!ops_cpu_valid(cpu, NULL))
+		return SCX_CPUPERF_ONE;
+
+	return arch_scale_cpu_capacity(cpu);
+}
+
+/**
+ * scx_bpf_cpuperf_cur - Report the current relative performance of a CPU
+ * @cpu: CPU of interest
+ *
+ * Returns the current performance of @cpu relative to its own maximum, in
+ * [1, %SCX_CPUPERF_ONE]. %SCX_CPUPERF_ONE if ops_cpu_valid() rejects @cpu.
+ *
+ * To express the current level relative to the maximum performance available
+ * anywhere in the system, combine it with the capacity:
+ *
+ *	scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE
+ *
+ * which again falls in the range [1, %SCX_CPUPERF_ONE].
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
+{
+	if (!ops_cpu_valid(cpu, NULL))
+		return SCX_CPUPERF_ONE;
+
+	return arch_scale_freq_capacity(cpu);
+}
+
+/**
+ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
+ * @cpu: CPU of interest
+ * @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ *
+ * Set the target performance level of @cpu to @perf. @perf is in linear
+ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
+ * schedutil cpufreq governor chooses the target frequency. A @perf above
+ * %SCX_CPUPERF_ONE is reported through scx_ops_error() and not applied.
+ *
+ * The actual performance level chosen, CPU grouping, and the overhead and
+ * latency of the operations are dependent on the hardware and cpufreq driver in
+ * use. Consult hardware and cpufreq documentation for more information. The
+ * current performance level can be monitored using scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
+{
+	if (unlikely(perf > SCX_CPUPERF_ONE)) {
+		scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+		return;
+	}
+
+	if (ops_cpu_valid(cpu, NULL)) {
+		struct rq *rq = cpu_rq(cpu);
+
+		rq->scx.cpuperf_target = perf;
+		/* hand the updated target to cpufreq for this rq */
+		rcu_read_lock_sched_notrace();
+		cpufreq_update_util(rq, 0);
+		rcu_read_unlock_sched_notrace();
+	}
+}
+
+/**
* scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
*
* All valid CPU IDs in the system are smaller than the returned value.
@@ -6045,6 +6121,9 @@ BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index bf6f2cfa49d5..0a7b9a34b18f 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -46,6 +46,14 @@ int scx_check_setscheduler(struct task_struct *p, int policy);
bool task_should_scx(struct task_struct *p);
void init_sched_ext_class(void);
+/* cpuperf target recorded on @cpu's rq, or 0 while SCX is not enabled */
+static inline u32 scx_cpuperf_target(s32 cpu)
+{
+	if (!scx_enabled())
+		return 0;
+	return cpu_rq(cpu)->scx.cpuperf_target;
+}
+
static inline const struct sched_class *next_active_class(const struct sched_class *class)
{
class++;
@@ -85,6 +93,7 @@ static inline void scx_pre_fork(struct task_struct *p) {}
static inline int scx_fork(struct task_struct *p) { return 0; }
static inline void scx_post_fork(struct task_struct *p) {}
static inline void scx_cancel_fork(struct task_struct *p) {}
+static inline u32 scx_cpuperf_target(s32 cpu) { return 0; }
static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
static inline void scx_rq_activate(struct rq *rq) {}
static inline void scx_rq_deactivate(struct rq *rq) {}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3989bf8f2a1b..963a2fa180ad 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -743,6 +743,7 @@ struct scx_rq {
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
u32 flags;
+ u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */
bool cpu_released;
cpumask_var_t cpus_to_kick;
cpumask_var_t cpus_to_kick_if_idle;
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 3fa87084cf17..dbbda0e35c5d 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -42,6 +42,9 @@ void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak;
+u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
+u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
+void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index c75c70d6a8eb..b1d0b09c966e 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -69,6 +69,18 @@ struct {
};
/*
+ * If enabled, the CPU performance target is looked up from the queue index
+ * using the following table, in quarter steps of SCX_CPUPERF_ONE.
+ */
+static const u32 qidx_to_cpuperf_target[] = {
+ [0] = SCX_CPUPERF_ONE * 0 / 4,
+ [1] = SCX_CPUPERF_ONE * 1 / 4,
+ [2] = SCX_CPUPERF_ONE * 2 / 4,
+ [3] = SCX_CPUPERF_ONE * 3 / 4,
+ [4] = SCX_CPUPERF_ONE * 4 / 4,
+};
+
+/*
* Per-queue sequence numbers to implement core-sched ordering.
*
* Tail seq is assigned to each queued task and incremented. Head seq tracks the
@@ -95,6 +107,8 @@ struct {
struct cpu_ctx {
u64 dsp_idx; /* dispatch index */
u64 dsp_cnt; /* remaining count */
+ u32 avg_weight;
+ u32 cpuperf_target;
};
struct {
@@ -107,6 +121,8 @@ struct {
/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
u64 nr_core_sched_execed;
+u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
@@ -313,6 +329,29 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
}
+void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
+{
+ struct cpu_ctx *cpuc;
+ u32 zero = 0;
+ int idx;
+
+ if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
+ scx_bpf_error("failed to look up cpu_ctx");
+ return;
+ }
+
+ /*
+ * Use the running avg of weights to select the target cpuperf level.
+ * This is a demonstration of the cpuperf feature rather than a
+ * practical strategy to regulate CPU frequency.
+ *
+ * Each tick blends 1/4 of @p's weight with 3/4 of the previous average,
+ * maps the result to a queue index and requests the matching entry of
+ * qidx_to_cpuperf_target[] for the CPU that @p is on. The chosen target
+ * is also kept in the cpu_ctx so that it can be dumped and monitored.
+ */
+ cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
+ idx = weight_to_idx(cpuc->avg_weight);
+ cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
+
+ scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
+}
+
/*
* The distance from the head of the queue scaled by the weight of the queue.
* The lower the number, the older the task and the higher the priority.
@@ -422,8 +461,9 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle
if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
return;
- scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu",
- cpuc->dsp_idx, cpuc->dsp_cnt);
+ scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
+ cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
+ cpuc->cpuperf_target);
}
void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
@@ -492,11 +532,106 @@ void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
print_cpus();
}
+struct monitor_timer {
+ struct bpf_timer timer; /* timer that qmap_init() points at monitor_timerfn() */
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1); /* single slot, looked up with key 0 in qmap_init() */
+ __type(key, u32);
+ __type(value, struct monitor_timer);
+} monitor_timer SEC(".maps");
+
+/*
+ * Gather the min, avg and max performance levels of the online CPUs, along
+ * with the min, avg and max of their cpuperf targets, into the cpuperf_* and
+ * cpuperf_target_* globals to demonstrate the cpuperf interface. Invoked from
+ * monitor_timerfn(), which re-arms itself every ONE_SEC_IN_NS.
+ */
+static void monitor_cpuperf(void)
+{
+ u32 zero = 0, nr_cpu_ids;
+ u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
+ u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
+ const struct cpumask *online;
+ int i, nr_online_cpus = 0;
+
+ nr_cpu_ids = scx_bpf_nr_cpu_ids();
+ online = scx_bpf_get_online_cpumask();
+
+ bpf_for(i, 0, nr_cpu_ids) {
+ struct cpu_ctx *cpuc;
+ u32 cap, cur;
+
+ if (!bpf_cpumask_test_cpu(i, online))
+ continue;
+ nr_online_cpus++;
+
+ /* collect the capacity and current cpuperf */
+ cap = scx_bpf_cpuperf_cap(i);
+ cur = scx_bpf_cpuperf_cur(i);
+
+ cur_min = cur < cur_min ? cur : cur_min;
+ cur_max = cur > cur_max ? cur : cur_max;
+
+ /*
+ * $cur is relative to $cap. Scale it down accordingly so that
+ * it's in the same scale as other CPUs and $cur_sum/$cap_sum
+ * makes sense.
+ */
+ cur_sum += cur * cap / SCX_CPUPERF_ONE;
+ cap_sum += cap;
+
+ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
+ scx_bpf_error("failed to look up cpu_ctx");
+ goto out; /* skip publishing, but still put the cpumask */
+ }
+
+ /* collect target; $cur is reused to hold this CPU's cpuperf_target */
+ cur = cpuc->cpuperf_target;
+ target_sum += cur;
+ target_min = cur < target_min ? cur : target_min;
+ target_max = cur > target_max ? cur : target_max;
+ }
+
+ cpuperf_min = cur_min;
+ cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; /* capacity-weighted avg */
+ cpuperf_max = cur_max;
+
+ cpuperf_target_min = target_min;
+ cpuperf_target_avg = target_sum / nr_online_cpus;
+ cpuperf_target_max = target_max;
+out:
+ scx_bpf_put_cpumask(online);
+}
+
+static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+ monitor_cpuperf(); /* refresh the cpuperf_* statistics */
+
+ bpf_timer_start(timer, ONE_SEC_IN_NS, 0); /* re-arm for the next sample */
+ return 0;
+}
+
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
+ u32 key = 0;
+ struct bpf_timer *timer;
+ s32 ret;
+
print_cpus();
- return scx_bpf_create_dsq(SHARED_DSQ, -1);
+ ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+ if (ret)
+ return ret;
+
+ timer = bpf_map_lookup_elem(&monitor_timer, &key); /* sole entry of the map */
+ if (!timer)
+ return -ESRCH;
+
+ bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
+ bpf_timer_set_callback(timer, monitor_timerfn);
+
+ return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); /* arm the cpuperf monitor */
}
void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
@@ -509,6 +644,7 @@ SCX_OPS_DEFINE(qmap_ops,
.enqueue = (void *)qmap_enqueue,
.dequeue = (void *)qmap_dequeue,
.dispatch = (void *)qmap_dispatch,
+ .tick = (void *)qmap_tick,
.core_sched_before = (void *)qmap_core_sched_before,
.cpu_release = (void *)qmap_cpu_release,
.init_task = (void *)qmap_init_task,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index bc36ec4f88a7..4d41c0cb1dab 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -116,6 +116,14 @@ int main(int argc, char **argv)
nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
skel->bss->nr_core_sched_execed);
+ if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
+ printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
+ skel->bss->cpuperf_min,
+ skel->bss->cpuperf_avg,
+ skel->bss->cpuperf_max,
+ skel->bss->cpuperf_target_min,
+ skel->bss->cpuperf_target_avg,
+ skel->bss->cpuperf_target_max);
fflush(stdout);
sleep(1);
}