diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/bloom_filter.c | 2 | ||||
-rw-r--r-- | kernel/bpf/cgroup_iter.c | 2 | ||||
-rw-r--r-- | kernel/bpf/core.c | 6 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 2 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 2 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 203 | ||||
-rw-r--r-- | kernel/kcsan/selftest.c | 4 | ||||
-rw-r--r-- | kernel/locking/test-ww_mutex.c | 4 | ||||
-rw-r--r-- | kernel/sched/core.c | 1 | ||||
-rw-r--r-- | kernel/sched/psi.c | 280 | ||||
-rw-r--r-- | kernel/sched/stats.h | 6 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 2 |
12 files changed, 399 insertions, 115 deletions
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index b9ea539a5561..48ee750849f2 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -158,7 +158,7 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) attr->value_size / sizeof(u32); if (!(attr->map_flags & BPF_F_ZERO_SEED)) - bloom->hash_seed = get_random_int(); + bloom->hash_seed = get_random_u32(); return &bloom->map; } diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index 0d200a993489..9fcf09f2ef00 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -196,7 +196,7 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, return -EINVAL; if (fd) - cgrp = cgroup_get_from_fd(fd); + cgrp = cgroup_v1v2_get_from_fd(fd); else if (id) cgrp = cgroup_get_from_id(id); else /* walk the entire hierarchy by default. */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a0e762a2bf97..9c16338bcbe8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1032,7 +1032,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); - start = (get_random_int() % hole) & ~(alignment - 1); + start = prandom_u32_max(hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; @@ -1094,7 +1094,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); - start = (get_random_int() % hole) & ~(alignment - 1); + start = prandom_u32_max(hole) & ~(alignment - 1); *image_ptr = &ro_header->image[start]; *rw_image = &(*rw_header)->image[start]; @@ -1216,7 +1216,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, bool emit_zext) { struct bpf_insn *to = to_buff; - u32 imm_rnd = get_random_int(); + u32 imm_rnd = get_random_u32(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ed3f8a53603b..f39ee3e05589 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -527,7 +527,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else - htab->hashrnd = get_random_int(); + htab->hashrnd = get_random_u32(); htab_init_buckets(htab); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f6d2d511c06..014ee0953dbd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13350,7 +13350,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, aux[adj_idx].ptr_type == PTR_TO_CTX) continue; - imm_rnd = get_random_int(); + imm_rnd = get_random_u32(); rnd_hi32_patch[0] = insn; rnd_hi32_patch[1].imm = imm_rnd; rnd_hi32_patch[3].dst_reg = load_reg; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 764bdd5fd8d1..2319946715e0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1392,6 +1392,9 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +/* + * Returned cgroup is without refcount but it's valid as long as cset pins it. + */ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { @@ -1403,6 +1406,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, res_cgroup = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; + lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -1414,6 +1418,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } } + BUG_ON(!res_cgroup); return res_cgroup; } @@ -1436,23 +1441,36 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) rcu_read_unlock(); - BUG_ON(!res); return res; } +/* + * Look up cgroup associated with current task's cgroup namespace on the default + * hierarchy. + * + * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: + * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu + * pointers. + * - css_set_lock is not needed because we just read cset->dfl_cgrp. + * - As a bonus returned cgrp is pinned with the current because it cannot + * switch cgroup_ns asynchronously. + */ +static struct cgroup *current_cgns_cgroup_dfl(void) +{ + struct css_set *cset; + + cset = current->nsproxy->cgroup_ns->root_cset; + return __cset_cgroup_from_root(cset, &cgrp_dfl_root); +} + /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { - struct cgroup *res = NULL; - lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); - res = __cset_cgroup_from_root(cset, root); - - BUG_ON(!res); - return res; + return __cset_cgroup_from_root(cset, root); } /* @@ -3698,27 +3716,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_CPU); } -static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, enum psi_res res) +static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) { struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; @@ -3738,7 +3756,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return -EBUSY; } - psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res); if (IS_ERR(new)) { cgroup_put(cgrp); @@ -3755,21 +3773,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_IO); + return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); + return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); + return pressure_write(of, buf, nbytes, PSI_CPU); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_psi(cgrp); + + return psi_show(seq, psi, PSI_IRQ); +} + +static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return pressure_write(of, buf, nbytes, PSI_IRQ); +} +#endif + +static int cgroup_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_psi(cgrp); + + seq_printf(seq, "%d\n", psi->enabled); + + return 0; +} + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + ssize_t ret; + int enable; + struct cgroup *cgrp; + struct psi_group *psi; + + ret = kstrtoint(strstrip(buf), 0, &enable); + if (ret) + return ret; + + if (enable < 0 || enable > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + psi = cgroup_psi(cgrp); + if (psi->enabled != enable) { + int i; + + /* show or hide {cpu,memory,io,irq}.pressure files */ + for (i = 0; i < NR_PSI_RESOURCES; i++) + cgroup_file_show(&cgrp->psi_files[i], enable); + + psi->enabled = enable; + if (enable) + psi_cgroup_restart(psi); + } + + cgroup_kn_unlock(of->kn); + + return nbytes; } static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, @@ -3789,6 +3872,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) bool cgroup_psi_enabled(void) { + if (static_branch_likely(&psi_disabled)) + return false; + return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } @@ -5175,6 +5261,7 @@ static struct cftype cgroup_psi_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", + .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5182,6 +5269,7 @@ static struct cftype cgroup_psi_files[] = { }, { .name = "memory.pressure", + .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5189,11 +5277,27 @@ static struct cftype cgroup_psi_files[] = { }, { .name = "cpu.pressure", + .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + { + .name = "irq.pressure", + .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), + .seq_show = cgroup_irq_pressure_show, + .write = cgroup_irq_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, +#endif + { + .name = "cgroup.pressure", + .seq_show = cgroup_pressure_show, + .write = cgroup_pressure_write, + }, #endif /* CONFIG_PSI */ { } /* terminate */ }; @@ -6105,9 +6209,7 @@ struct cgroup *cgroup_get_from_id(u64 id) if (!cgrp) return ERR_PTR(-ENOENT); - spin_lock_irq(&css_set_lock); - root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); + root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { cgroup_put(cgrp); return ERR_PTR(-ENOENT); @@ -6208,16 +6310,42 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } -static struct cgroup *cgroup_get_from_file(struct file *f) +/** + * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer + * @f: file corresponding to cgroup_dir + * + * Find the cgroup from a file pointer associated with a cgroup directory. + * Returns a pointer to the cgroup on success. ERR_PTR is returned if the + * cgroup cannot be found. + */ +static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; - struct cgroup *cgrp; css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css); - cgrp = css->cgroup; + return css->cgroup; +} + +/** + * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports + * cgroup2. + * @f: file corresponding to cgroup2_dir + */ +static struct cgroup *cgroup_get_from_file(struct file *f) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + return cgrp; } @@ -6686,10 +6814,8 @@ struct cgroup *cgroup_get_from_path(const char *path) struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup *root_cgrp; - spin_lock_irq(&css_set_lock); - root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root); + root_cgrp = current_cgns_cgroup_dfl(); kn = kernfs_walk_and_get(root_cgrp->kn, path); - spin_unlock_irq(&css_set_lock); if (!kn) goto out; @@ -6714,15 +6840,15 @@ out: EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** - * cgroup_get_from_fd - get a cgroup pointer from a fd - * @fd: fd obtained by open(cgroup2_dir) + * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ -struct cgroup *cgroup_get_from_fd(int fd) +struct cgroup *cgroup_v1v2_get_from_fd(int fd) { struct cgroup *cgrp; struct file *f; @@ -6731,10 +6857,29 @@ struct cgroup *cgroup_get_from_fd(int fd) if (!f) return ERR_PTR(-EBADF); - cgrp = cgroup_get_from_file(f); + cgrp = cgroup_v1v2_get_from_file(f); fput(f); return cgrp; } + +/** + * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports + * cgroup2. + * @fd: fd obtained by open(cgroup2_dir) + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + return cgrp; +} EXPORT_SYMBOL_GPL(cgroup_get_from_fd); static u64 power_of_ten(int power) diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 75712959c84e..00cdf8fa5693 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -26,7 +26,7 @@ static bool __init test_requires(void) { /* random should be initialized for the below tests */ - return prandom_u32() + prandom_u32() != 0; + return get_random_u32() + get_random_u32() != 0; } /* @@ -46,7 +46,7 @@ static bool __init test_encode_decode(void) unsigned long addr; size_t verif_size; - prandom_bytes(&addr, sizeof(addr)); + get_random_bytes(&addr, sizeof(addr)); if (addr < PAGE_SIZE) addr = PAGE_SIZE; diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 353004155d65..43efb2a04160 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -399,7 +399,7 @@ static int *get_random_order(int count) order[n] = n; for (n = count - 1; n > 1; n--) { - r = get_random_int() % (n + 1); + r = prandom_u32_max(n + 1); if (r != n) { tmp = order[n]; order[n] = order[r]; @@ -538,7 +538,7 @@ static void stress_one_work(struct work_struct *work) { struct stress *stress = container_of(work, typeof(*stress), work); const int nlocks = stress->nlocks; - struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks); + struct ww_mutex *lock = stress->locks + prandom_u32_max(nlocks); int err; do { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e4ce124ec701..5800b0623ff3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -701,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; + psi_account_irqtime(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7f6030091aee..ee2ecc081422 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -181,6 +181,7 @@ static void group_init(struct psi_group *group) { int cpu; + group->enabled = true; for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); group->avg_last_update = sched_clock(); @@ -201,6 +202,7 @@ void __init psi_init(void) { if (!psi_enable) { static_branch_enable(&psi_disabled); + static_branch_disable(&psi_cgroups_enabled); return; } @@ -211,7 +213,7 @@ void __init psi_init(void) group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state) +static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu) { switch (state) { case PSI_IO_SOME: @@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state) return unlikely(tasks[NR_MEMSTALL] && tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); case PSI_CPU_SOME: - return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); + return unlikely(tasks[NR_RUNNING] > oncpu); case PSI_CPU_FULL: - return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]); + return unlikely(tasks[NR_RUNNING] && !oncpu); case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -688,35 +690,53 @@ static void psi_group_change(struct psi_group *group, int cpu, bool wake_clock) { struct psi_group_cpu *groupc; - u32 state_mask = 0; unsigned int t, m; enum psi_states s; + u32 state_mask; groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we assess the aggregate resource states this CPU's - * tasks have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - * - * Then we update the task counts according to the state + * First we update the task counts according to the state * change requested through the @clear and @set bits. + * + * Then if the cgroup PSI stats accounting enabled, we + * assess the aggregate resource states this CPU's tasks + * have been in since the last change, and account any + * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); - record_times(groupc, now); + /* + * Start with TSK_ONCPU, which doesn't have a corresponding + * task count - it's just a boolean flag directly encoded in + * the state mask. Clear, set, or carry the current state if + * no changes are requested. + */ + if (unlikely(clear & TSK_ONCPU)) { + state_mask = 0; + clear &= ~TSK_ONCPU; + } else if (unlikely(set & TSK_ONCPU)) { + state_mask = PSI_ONCPU; + set &= ~TSK_ONCPU; + } else { + state_mask = groupc->state_mask & PSI_ONCPU; + } + /* + * The rest of the state mask is calculated based on the task + * counts. Update those first, then construct the mask. + */ for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], groupc->tasks[4], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } } @@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; - /* Calculate state mask representing active states */ + if (!group->enabled) { + /* + * On the first group change after disabling PSI, conclude + * the current state and flush its time. This is unlikely + * to matter to the user, but aggregation (get_recent_times) + * may have already incorporated the live state into times_prev; + * avoid a delta sample underflow when PSI is later re-enabled. + */ + if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) + record_times(groupc, now); + + groupc->state_mask = state_mask; + + write_seqcount_end(&groupc->seq); + return; + } + for (s = 0; s < NR_PSI_STATES; s++) { - if (test_state(groupc->tasks, s)) + if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU)) state_mask |= (1 << s); } @@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu, * task in a cgroup is in_memstall, the corresponding groupc * on that cpu is in PSI_MEM_FULL state. */ - if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)) + if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall)) state_mask |= (1 << PSI_MEM_FULL); + record_times(groupc, now); + groupc->state_mask = state_mask; write_seqcount_end(&groupc->seq); @@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +static inline struct psi_group *task_psi_group(struct task_struct *task) { - if (*iter == &psi_system) - return NULL; - #ifdef CONFIG_CGROUPS - if (static_branch_likely(&psi_cgroups_enabled)) { - struct cgroup *cgroup = NULL; - - if (!*iter) - cgroup = task->cgroups->dfl_cgrp; - else - cgroup = cgroup_parent(*iter); - - if (cgroup && cgroup_parent(cgroup)) { - *iter = cgroup; - return cgroup_psi(cgroup); - } - } + if (static_branch_likely(&psi_cgroups_enabled)) + return cgroup_psi(task_dfl_cgroup(task)); #endif - *iter = &psi_system; return &psi_system; } @@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - bool wake_clock = true; - void *iter = NULL; u64 now; if (!task->pid) @@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set) psi_flags_change(task, clear, set); now = cpu_clock(cpu); - /* - * Periodic aggregation shuts off if there is a period of no - * task changes, so we wake it back up if necessary. However, - * don't do this if the task change is the aggregation worker - * itself going to sleep, or we'll ping-pong forever. - */ - if (unlikely((clear & TSK_RUNNING) && - (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_avgs_work)) - wake_clock = false; - while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set, now, wake_clock); + group = task_psi_group(task); + do { + psi_group_change(group, cpu, clear, set, now, true); + } while ((group = group->parent)); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - void *iter; u64 now = cpu_clock(cpu); if (next->pid) { - bool identical_state; - psi_flags_change(next, 0, TSK_ONCPU); /* - * When switching between tasks that have an identical - * runtime state, the cgroup that contains both tasks - * we reach the first common ancestor. Iterate @next's - * ancestors only until we encounter @prev's ONCPU. + * Set TSK_ONCPU on @next's cgroups. If @next shares any + * ancestors with @prev, those will already have @prev's + * TSK_ONCPU bit set, and we can stop the iteration there. */ - identical_state = prev->psi_flags == next->psi_flags; - iter = NULL; - while ((group = iterate_groups(next, &iter))) { - if (identical_state && - per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + group = task_psi_group(next); + do { + if (per_cpu_ptr(group->pcpu, cpu)->state_mask & + PSI_ONCPU) { common = group; break; } psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); - } + } while ((group = group->parent)); } if (prev->pid) { int clear = TSK_ONCPU, set = 0; + bool wake_clock = true; /* * When we're going to sleep, psi_dequeue() lets us @@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, clear |= TSK_MEMSTALL_RUNNING; if (prev->in_iowait) set |= TSK_IOWAIT; + + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((prev->flags & PF_WQ_WORKER) && + wq_worker_last_func(prev) == psi_avgs_work)) + wake_clock = false; } psi_flags_change(prev, clear, set); - iter = NULL; - while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, clear, set, now, true); + group = task_psi_group(prev); + do { + if (group == common) + break; + psi_group_change(group, cpu, clear, set, now, wake_clock); + } while ((group = group->parent)); /* - * TSK_ONCPU is handled up to the common ancestor. If we're tasked - * with dequeuing too, finish that for the rest of the hierarchy. + * TSK_ONCPU is handled up to the common ancestor. If there are + * any other differences between the two tasks (e.g. prev goes + * to sleep, or only one task is memstall), finish propagating + * those differences all the way up to the root. */ - if (sleep) { + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = iterate_groups(prev, &iter)) - psi_group_change(group, cpu, clear, set, now, true); + for (; group; group = group->parent) + psi_group_change(group, cpu, clear, set, now, wake_clock); } } } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct task_struct *task, u32 delta) +{ + int cpu = task_cpu(task); + struct psi_group *group; + struct psi_group_cpu *groupc; + u64 now; + + if (!task->pid) + return; + + now = cpu_clock(cpu); + + group = task_psi_group(task); + do { + if (!group->enabled) + continue; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + write_seqcount_begin(&groupc->seq); + + record_times(groupc, now); + groupc->times[PSI_IRQ_FULL] += delta; + + write_seqcount_end(&groupc->seq); + + if (group->poll_states & (1 << PSI_IRQ_FULL)) + psi_schedule_poll_work(group, 1); + } while ((group = group->parent)); +} +#endif + /** * psi_memstall_enter - mark the beginning of a memory stall section * @flags: flags to handle nested sections @@ -952,7 +1009,7 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return 0; cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); @@ -965,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup) return -ENOMEM; } group_init(cgroup->psi); + cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup)); return 0; } void psi_cgroup_free(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return; cancel_delayed_work_sync(&cgroup->psi->avgs_work); @@ -998,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq; - if (static_branch_likely(&psi_disabled)) { + if (!static_branch_likely(&psi_cgroups_enabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -1046,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) task_rq_unlock(rq, task, &rf); } + +void psi_cgroup_restart(struct psi_group *group) +{ + int cpu; + + /* + * After we disable psi_group->enabled, we don't actually + * stop percpu tasks accounting in each psi_group_cpu, + * instead only stop test_state() loop, record_times() + * and averaging worker, see psi_group_change() for details. + * + * When disable cgroup PSI, this function has nothing to sync + * since cgroup pressure files are hidden and percpu psi_group_cpu + * would see !psi_group->enabled and only do task accounting. + * + * When re-enable cgroup PSI, this function use psi_group_change() + * to get correct state mask from test_state() loop on tasks[], + * and restart groupc->state_start from now, use .clear = .set = 0 + * here since no task status really changed. + */ + if (!group->enabled) + return; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + u64 now; + + rq_lock_irq(rq, &rf); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + rq_unlock_irq(rq, &rf); + } +} #endif /* CONFIG_CGROUPS */ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { + bool only_full = false; int full; u64 now; @@ -1064,7 +1157,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); - for (full = 0; full < 2; full++) { +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + only_full = res == PSI_IRQ; +#endif + + for (full = 0; full < 2 - only_full; full++) { unsigned long avg[3] = { 0, }; u64 total = 0; int w; @@ -1078,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", - full ? "full" : "some", + full || only_full ? "full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), @@ -1106,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, else return ERR_PTR(-EINVAL); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + return ERR_PTR(-EINVAL); +#endif + if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); @@ -1390,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, }; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int psi_irq_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IRQ); +} + +static int psi_irq_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_irq_show); +} + +static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IRQ); +} + +static const struct proc_ops psi_irq_proc_ops = { + .proc_open = psi_irq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_irq_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, +}; +#endif + static int __init psi_proc_init(void) { if (psi_enable) { @@ -1397,6 +1526,9 @@ static int __init psi_proc_init(void) proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif } return 0; } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index baa839c1ba96..84a188913cc9 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se) } #ifdef CONFIG_PSI +void psi_task_change(struct task_struct *task, int clear, int set); +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep); +void psi_account_irqtime(struct task_struct *task, u32 delta); + /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -201,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} +static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cee5da1e54c4..8058bec87ace 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void) * CPUs that are currently online. */ for (i = 1; i < n; i++) { - cpu = prandom_u32() % nr_cpu_ids; + cpu = prandom_u32_max(nr_cpu_ids); cpu = cpumask_next(cpu - 1, cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_online_mask); |