diff options
Diffstat (limited to 'kernel')
55 files changed, 2394 insertions, 1384 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b0cb7631e48b..21d2fa815e78 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -143,6 +143,8 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = { { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" }, { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" }, { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" }, + { AUDIT_NFT_OP_SETELEM_RESET, "nft_reset_setelem" }, + { AUDIT_NFT_OP_RULE_RESET, "nft_reset_rule" }, { AUDIT_NFT_OP_INVALID, "nft_invalid" }, }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b5149cfce7d4..146824cc9689 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -553,7 +553,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; - struct bpf_local_storage_elem *selem = NULL; + struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; @@ -607,11 +607,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } - if (gfp_flags == GFP_KERNEL) { - selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); - if (!selem) - return ERR_PTR(-ENOMEM); - } + /* A lookup has just been done before and concluded a new selem is + * needed. The chance of an unnecessary alloc is unlikely. + */ + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + if (!alloc_selem) + return ERR_PTR(-ENOMEM); raw_spin_lock_irqsave(&local_storage->lock, flags); @@ -623,13 +624,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, * simple. 
*/ err = -EAGAIN; - goto unlock_err; + goto unlock; } old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) - goto unlock_err; + goto unlock; if (old_sdata && (map_flags & BPF_F_LOCK)) { copy_map_value_locked(&smap->map, old_sdata->data, value, @@ -638,23 +639,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } - if (gfp_flags != GFP_KERNEL) { - /* local_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during - * bpf_selem_unlink_storage_nolock(). - */ - selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags); - if (!selem) { - err = -ENOMEM; - goto unlock_err; - } - } - + alloc_selem = NULL; /* First, link the new selem to the map */ bpf_selem_link_map(smap, selem); @@ -665,20 +650,16 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - false, false); + true, false); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); - return SDATA(selem); - -unlock_err: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - if (selem) { + if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); - bpf_selem_free(selem, smap, true); + bpf_selem_free(alloc_selem, smap, true); } - return ERR_PTR(err); + return err ? ERR_PTR(err) : SDATA(selem); } static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) @@ -779,7 +760,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. 
*/ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, true); + local_storage, selem, true, true); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 249657c466dd..1095bbe29859 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -553,7 +553,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) return -ENOENT; } -static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) +s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) { struct btf *btf; s32 ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ebeb0695305a..eb01c31ed591 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5502,9 +5502,9 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) } run_ctx.bpf_cookie = 0; - run_ctx.saved_run_ctx = NULL; if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ + __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); bpf_prog_put(prog); return -EBUSY; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 78acf28d4873..53ff50cac61e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -926,13 +926,12 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, migrate_disable(); might_fault(); + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); return 0; } - - run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - return bpf_prog_start_time(); } diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 83044312bc41..c487ffef6652 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -431,7 +431,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) if (l->list[mid] == pid) { index = mid; break; - } else if (l->list[mid] <= pid) + } else if (l->list[mid] 
< pid) index = mid + 1; else end = mid; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5fa95f86cb4d..1fb7f562289d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -493,28 +493,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, } /** - * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest - * - * Find and get @cgrp's css associated with @ss. If the css doesn't exist - * or is offline, %NULL is returned. - */ -static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - - rcu_read_lock(); - css = cgroup_css(cgrp, ss); - if (css && !css_tryget_online(css)) - css = NULL; - rcu_read_unlock(); - - return css; -} - -/** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) @@ -679,7 +657,7 @@ EXPORT_SYMBOL_GPL(of_css); * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * - * Should be called under cgroup_[tree_]mutex. + * Should be called under cgroup_mutex. 
*/ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ @@ -929,7 +907,7 @@ static void css_set_move_task(struct task_struct *task, #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); -static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) +static unsigned long css_set_hash(struct cgroup_subsys_state **css) { unsigned long key = 0UL; struct cgroup_subsys *ss; @@ -1070,7 +1048,7 @@ static bool compare_css_sets(struct css_set *cset, */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, - struct cgroup_subsys_state *template[]) + struct cgroup_subsys_state **template) { struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; @@ -1736,7 +1714,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css) struct cftype *cfts, *failed_cfts; int ret; - if ((css->flags & CSS_VISIBLE) || !cgrp->kn) + if (css->flags & CSS_VISIBLE) return 0; if (!css->ss) { @@ -2499,7 +2477,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /* * This function may be called both before and - * after cgroup_taskset_migrate(). The two cases + * after cgroup_migrate_execute(). The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set. */ @@ -3654,9 +3632,32 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } -static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, - struct cgroup *cgrp, int ssid) +#ifdef CONFIG_CGROUP_SCHED +/** + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get @cgrp's css associated with @ss. If the css doesn't exist + * or is offline, %NULL is returned. 
+ */ +static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css = cgroup_css(cgrp, ss); + if (css && !css_tryget_online(css)) + css = NULL; + rcu_read_unlock(); + + return css; +} + +static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { + struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; @@ -3673,20 +3674,8 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, return ret; } -static int cpu_stat_show(struct seq_file *seq, void *v) -{ - struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; - int ret = 0; - - cgroup_base_stat_cputime_show(seq); -#ifdef CONFIG_CGROUP_SCHED - ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); -#endif - return ret; -} - -static int __maybe_unused cgroup_local_stat_show(struct seq_file *seq, - struct cgroup *cgrp, int ssid) +static int cgroup_local_stat_show(struct seq_file *seq, + struct cgroup *cgrp, int ssid) { struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; @@ -3703,6 +3692,18 @@ static int __maybe_unused cgroup_local_stat_show(struct seq_file *seq, css_put(css); return ret; } +#endif + +static int cpu_stat_show(struct seq_file *seq, void *v) +{ + int ret = 0; + + cgroup_base_stat_cputime_show(seq); +#ifdef CONFIG_CGROUP_SCHED + ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); +#endif + return ret; +} static int cpu_local_stat_show(struct seq_file *seq, void *v) { @@ -4350,14 +4351,13 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) return ret; } -static int cgroup_rm_cftypes_locked(struct cftype *cfts) +static void cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); - return 0; } /** @@ -4373,8 +4373,6 @@ static int 
cgroup_rm_cftypes_locked(struct cftype *cfts) */ int cgroup_rm_cftypes(struct cftype *cfts) { - int ret; - if (!cfts || cfts[0].name[0] == '\0') return 0; @@ -4382,9 +4380,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) return -ENOENT; cgroup_lock(); - ret = cgroup_rm_cftypes_locked(cfts); + cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); - return ret; + return 0; } /** @@ -5337,7 +5335,7 @@ static struct cftype cgroup_psi_files[] = { * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in - * css_free_work_fn(). + * css_free_rwork_fn(). * * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional @@ -5581,8 +5579,7 @@ err_free_css: /* * The returned cgroup is fully initialized including its control mask, but - * it isn't associated with its kernfs_node and doesn't have the control - * mask applied. + * it doesn't have the control mask applied. */ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, umode_t mode) @@ -5908,7 +5905,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling - * cgroup_lock_live_group(). The latter makes the csets ignored by + * cgroup_kn_lock_live(). The latter makes the csets ignored by * the migration path. 
*/ cgrp->self.flags &= ~CSS_ONLINE; @@ -5930,7 +5927,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); - for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { + for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; /* @@ -6123,8 +6120,8 @@ int __init cgroup_init(void) continue; if (cgroup1_ssid_disabled(ssid)) - printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", - ss->name); + pr_info("Disabling %s control group subsystem in v1 mounts\n", + ss->name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58e6f18f01c1..58ec88efa4f8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1230,7 +1230,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) /* * Percpu kthreads in top_cpuset are ignored */ - if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task)) + if (kthread_is_per_cpu(task)) continue; cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus); } else { @@ -1255,7 +1255,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) { - if (parent->nr_subparts_cpus) { + if (parent->nr_subparts_cpus && is_partition_valid(cs)) { cpumask_or(new_cpus, parent->effective_cpus, parent->subparts_cpus); cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); @@ -1277,6 +1277,52 @@ enum subparts_cmd { static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); +static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, + struct tmpmasks *tmp); + +/* + * Update partition exclusive flag + * + * Return: 0 if successful, an error code otherwise + */ +static int update_partition_exclusive(struct cpuset *cs, int new_prs) +{ + bool exclusive 
= (new_prs > 0); + + if (exclusive && !is_cpu_exclusive(cs)) { + if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) + return PERR_NOTEXCL; + } else if (!exclusive && is_cpu_exclusive(cs)) { + /* Turning off CS_CPU_EXCLUSIVE will not return error */ + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + } + return 0; +} + +/* + * Update partition load balance flag and/or rebuild sched domain + * + * Changing load balance flag will automatically call + * rebuild_sched_domains_locked(). + */ +static void update_partition_sd_lb(struct cpuset *cs, int old_prs) +{ + int new_prs = cs->partition_root_state; + bool new_lb = (new_prs != PRS_ISOLATED); + bool rebuild_domains = (new_prs > 0) || (old_prs > 0); + + if (new_lb != !!is_sched_load_balance(cs)) { + rebuild_domains = true; + if (new_lb) + set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + else + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + } + + if (rebuild_domains) + rebuild_sched_domains_locked(); +} + /** * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state @@ -1336,8 +1382,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, return is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART; } - if ((newmask && cpumask_empty(newmask)) || - (!newmask && cpumask_empty(cs->cpus_allowed))) + if (!newmask && cpumask_empty(cs->cpus_allowed)) return PERR_CPUSEMPTY; /* @@ -1404,10 +1449,15 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, adding = cpumask_andnot(tmp->addmask, tmp->addmask, parent->subparts_cpus); /* + * Empty cpumask is not allowed + */ + if (cpumask_empty(newmask)) { + part_error = PERR_CPUSEMPTY; + /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. 
*/ - if (adding && + } else if (adding && cpumask_subset(parent->effective_cpus, tmp->addmask) && !cpumask_intersects(tmp->delmask, cpu_active_mask) && partition_is_populated(parent, cs)) { @@ -1480,14 +1530,13 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, /* * Transitioning between invalid to valid or vice versa may require - * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE. + * changing CS_CPU_EXCLUSIVE. */ if (old_prs != new_prs) { - if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) && - (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0)) - return PERR_NOTEXCL; - if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs)) - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + int err = update_partition_exclusive(cs, new_prs); + + if (err) + return err; } /* @@ -1520,24 +1569,34 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, spin_unlock_irq(&callback_lock); - if (adding || deleting) + if (adding || deleting) { update_tasks_cpumask(parent, tmp->addmask); + if (parent->child_ecpus_count) + update_sibling_cpumasks(parent, cs, tmp); + } /* - * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. - * rebuild_sched_domains_locked() may be called. + * For partcmd_update without newmask, it is being called from + * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. + * Update the load balance flag and scheduling domain if + * cpus_read_trylock() is successful. 
*/ - if (old_prs != new_prs) { - if (old_prs == PRS_ISOLATED) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); - else if (new_prs == PRS_ISOLATED) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { + update_partition_sd_lb(cs, old_prs); + cpus_read_unlock(); } + notify_partition_change(cs, old_prs); return 0; } /* + * update_cpumasks_hier() flags + */ +#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ +#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ + +/* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup @@ -1551,7 +1610,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, - bool force) + int flags) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -1588,11 +1647,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, } /* - * Skip the whole subtree if the cpumask remains the same - * and has no partition root state and force flag not set. + * Skip the whole subtree if + * 1) the cpumask remains the same, + * 2) has no partition root state, + * 3) HIER_CHECKALL flag not set, and + * 4) for v2 load balance state same as its parent. 
*/ - if (!cp->partition_root_state && !force && - cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { + if (!cp->partition_root_state && !(flags & HIER_CHECKALL) && + cpumask_equal(tmp->new_cpus, cp->effective_cpus) && + (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; } @@ -1676,6 +1740,20 @@ update_parent_subparts: update_tasks_cpumask(cp, tmp->new_cpus); /* + * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE + * from parent if current cpuset isn't a valid partition root + * and their load balance states differ. + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_partition_valid(cp) && + (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { + if (is_sched_load_balance(parent)) + set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); + else + clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); + } + + /* * On legacy hierarchy, if the effective cpumask of any non- * empty cpuset is changed, we need to rebuild sched domains. * On default hierarchy, the cpuset needs to be a partition @@ -1692,7 +1770,7 @@ update_parent_subparts: } rcu_read_unlock(); - if (need_rebuild_sched_domains) + if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) rebuild_sched_domains_locked(); } @@ -1716,7 +1794,9 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * to use the right effective_cpus value. * * The update_cpumasks_hier() function may sleep. So we have to - * release the RCU read lock before calling it. + * release the RCU read lock before calling it. HIER_NO_SD_REBUILD + * flag is used to suppress rebuild of sched domains as the callers + * will take care of that. 
*/ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -1728,7 +1808,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; rcu_read_unlock(); - update_cpumasks_hier(sibling, tmp, false); + update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD); rcu_read_lock(); css_put(&sibling->css); } @@ -1747,6 +1827,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, int retval; struct tmpmasks tmp; bool invalidate = false; + int old_prs = cs->partition_root_state; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) @@ -1774,18 +1855,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; -#ifdef CONFIG_CPUMASK_OFFSTACK - /* - * Use the cpumasks in trialcs for tmpmasks when they are pointers - * to allocated cpumasks. - * - * Note that update_parent_subparts_cpumask() uses only addmask & - * delmask, but not new_cpus. 
- */ - tmp.addmask = trialcs->subparts_cpus; - tmp.delmask = trialcs->effective_cpus; - tmp.new_cpus = NULL; -#endif + if (alloc_cpumasks(NULL, &tmp)) + return -ENOMEM; retval = validate_change(cs, trialcs); @@ -1814,7 +1885,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, retval = 0; } if (retval < 0) - return retval; + goto out_free; if (cs->partition_root_state) { if (invalidate) @@ -1849,13 +1920,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } spin_unlock_irq(&callback_lock); -#ifdef CONFIG_CPUMASK_OFFSTACK - /* Now trialcs->cpus_allowed is available */ - tmp.new_cpus = trialcs->cpus_allowed; -#endif - /* effective_cpus will be updated here */ - update_cpumasks_hier(cs, &tmp, false); + update_cpumasks_hier(cs, &tmp, 0); if (cs->partition_root_state) { struct cpuset *parent = parent_cs(cs); @@ -1866,7 +1932,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ if (parent->child_ecpus_count) update_sibling_cpumasks(parent, cs, &tmp); + + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */ + update_partition_sd_lb(cs, old_prs); } +out_free: + free_cpumasks(NULL, &tmp); return 0; } @@ -2242,7 +2313,6 @@ out: static int update_prstate(struct cpuset *cs, int new_prs) { int err = PERR_NONE, old_prs = cs->partition_root_state; - bool sched_domain_rebuilt = false; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; @@ -2261,45 +2331,26 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; + err = update_partition_exclusive(cs, new_prs); + if (err) + goto out; + if (!old_prs) { /* - * Turning on partition root requires setting the - * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed - * cannot be empty. + * cpus_allowed cannot be empty. 
*/ if (cpumask_empty(cs->cpus_allowed)) { err = PERR_CPUSEMPTY; goto out; } - err = update_flag(CS_CPU_EXCLUSIVE, cs, 1); - if (err) { - err = PERR_NOTEXCL; - goto out; - } - err = update_parent_subparts_cpumask(cs, partcmd_enable, NULL, &tmpmask); - if (err) { - update_flag(CS_CPU_EXCLUSIVE, cs, 0); - goto out; - } - - if (new_prs == PRS_ISOLATED) { - /* - * Disable the load balance flag should not return an - * error unless the system is running out of memory. - */ - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - sched_domain_rebuilt = true; - } } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. */ - update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED)); - sched_domain_rebuilt = true; - goto out; /* Sched domain is rebuilt in update_flag() */ + ; } else { /* * Switching back to member is always allowed even if it @@ -2318,40 +2369,31 @@ static int update_prstate(struct cpuset *cs, int new_prs) compute_effective_cpumask(cs->effective_cpus, cs, parent); spin_unlock_irq(&callback_lock); } - - /* Turning off CS_CPU_EXCLUSIVE will not return error */ - update_flag(CS_CPU_EXCLUSIVE, cs, 0); - - if (!is_sched_load_balance(cs)) { - /* Make sure load balance is on */ - update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); - sched_domain_rebuilt = true; - } } - - update_tasks_cpumask(parent, tmpmask.new_cpus); - - if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmpmask); - - if (!sched_domain_rebuilt) - rebuild_sched_domains_locked(); out: /* - * Make partition invalid if an error happen + * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error + * happens. */ - if (err) + if (err) { new_prs = -new_prs; + update_partition_exclusive(cs, new_prs); + } + spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; WRITE_ONCE(cs->prs_err, err); spin_unlock_irq(&callback_lock); + /* * Update child cpusets, if present. * Force update if switching back to member. 
*/ if (!list_empty(&cs->css.children)) - update_cpumasks_hier(cs, &tmpmask, !new_prs); + update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); + + /* Update sched domains and load balance flag */ + update_partition_sd_lb(cs, old_prs); notify_partition_change(cs, old_prs); free_cpumasks(NULL, &tmpmask); @@ -2487,6 +2529,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct cpuset *cs, *oldcs; struct task_struct *task; + bool cpus_updated, mems_updated; int ret; /* used later by cpuset_attach() */ @@ -2501,13 +2544,25 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) if (ret) goto out_unlock; + cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); + mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task); if (ret) goto out_unlock; - ret = security_task_setscheduler(task); - if (ret) - goto out_unlock; + + /* + * Skip rights over task check in v2 when nothing changes, + * migration permission derives from hierarchy ownership in + * cgroup_procs_write_permission()). 
+ */ + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (cpus_updated || mems_updated)) { + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; + } if (dl_task(task)) { cs->nr_migrate_dl_tasks++; @@ -3222,6 +3277,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->use_parent_ecpus = true; parent->child_ecpus_count++; } + + /* + * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) @@ -3521,17 +3584,16 @@ hotplug_update_tasks_legacy(struct cpuset *cs, is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); - mutex_unlock(&cpuset_mutex); - /* * Move tasks to the nearest ancestor with execution resources, * This is full cgroup operation which will also call back into * cpuset. Should be done outside any lock. */ - if (is_empty) + if (is_empty) { + mutex_unlock(&cpuset_mutex); remove_tasks_in_empty_cpuset(cs); - - mutex_lock(&cpuset_mutex); + mutex_lock(&cpuset_mutex); + } } static void @@ -3691,6 +3753,7 @@ unlock: /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * @work: unused * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. 
The top_cpuset is always @@ -4073,6 +4136,7 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask) /** * cpuset_spread_node() - On which node to begin search for a page + * @rotor: round robin rotor * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index ae2f4dd47508..79a3717a5803 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -14,7 +14,7 @@ #include <linux/misc_cgroup.h> #define MAX_STR "max" -#define MAX_NUM ULONG_MAX +#define MAX_NUM U64_MAX /* Miscellaneous res name, keep it in sync with enum misc_res_type */ static const char *const misc_res_name[] = { @@ -37,7 +37,7 @@ static struct misc_cg root_cg; * more than the actual capacity. We are using Limits resource distribution * model of cgroup for miscellaneous controller. */ -static unsigned long misc_res_capacity[MISC_CG_RES_TYPES]; +static u64 misc_res_capacity[MISC_CG_RES_TYPES]; /** * parent_misc() - Get the parent of the passed misc cgroup. @@ -74,10 +74,10 @@ static inline bool valid_type(enum misc_res_type type) * Context: Any context. * Return: Current total usage of the resource. */ -unsigned long misc_cg_res_total_usage(enum misc_res_type type) +u64 misc_cg_res_total_usage(enum misc_res_type type) { if (valid_type(type)) - return atomic_long_read(&root_cg.res[type].usage); + return atomic64_read(&root_cg.res[type].usage); return 0; } @@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); * * %0 - Successfully registered the capacity. * * %-EINVAL - If @type is invalid. */ -int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity) +int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { if (!valid_type(type)) return -EINVAL; @@ -114,9 +114,9 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity); * Context: Any context. 
*/ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) + u64 amount) { - WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage), + WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage), "misc cgroup resource %s became less than 0", misc_res_name[type]); } @@ -137,13 +137,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, * * -EBUSY - If max limit will be crossed or total usage will be more than the * capacity. */ -int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) +int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount) { struct misc_cg *i, *j; int ret; struct misc_res *res; - int new_usage; + u64 new_usage; if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type]))) return -EINVAL; @@ -154,7 +153,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, for (i = cg; i; i = parent_misc(i)) { res = &i->res[type]; - new_usage = atomic_long_add_return(amount, &res->usage); + new_usage = atomic64_add_return(amount, &res->usage); if (new_usage > READ_ONCE(res->max) || new_usage > READ_ONCE(misc_res_capacity[type])) { ret = -EBUSY; @@ -165,7 +164,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, err_charge: for (j = i; j; j = parent_misc(j)) { - atomic_long_inc(&j->res[type].events); + atomic64_inc(&j->res[type].events); cgroup_file_notify(&j->events_file); } @@ -184,8 +183,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge); * * Context: Any context. 
*/ -void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) +void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount) { struct misc_cg *i; @@ -209,7 +207,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) { int i; struct misc_cg *cg = css_misc(seq_css(sf)); - unsigned long max; + u64 max; for (i = 0; i < MISC_CG_RES_TYPES; i++) { if (READ_ONCE(misc_res_capacity[i])) { @@ -217,7 +215,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) if (max == MAX_NUM) seq_printf(sf, "%s max\n", misc_res_name[i]); else - seq_printf(sf, "%s %lu\n", misc_res_name[i], + seq_printf(sf, "%s %llu\n", misc_res_name[i], max); } } @@ -241,13 +239,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) * Return: * * >= 0 - Number of bytes processed in the input. * * -EINVAL - If buf is not valid. - * * -ERANGE - If number is bigger than the unsigned long capacity. + * * -ERANGE - If number is bigger than the u64 capacity. */ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct misc_cg *cg; - unsigned long max; + u64 max; int ret = 0, i; enum misc_res_type type = MISC_CG_RES_TYPES; char *token; @@ -271,7 +269,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, if (!strcmp(MAX_STR, buf)) { max = MAX_NUM; } else { - ret = kstrtoul(buf, 0, &max); + ret = kstrtou64(buf, 0, &max); if (ret) return ret; } @@ -297,13 +295,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, static int misc_cg_current_show(struct seq_file *sf, void *v) { int i; - unsigned long usage; + u64 usage; struct misc_cg *cg = css_misc(seq_css(sf)); for (i = 0; i < MISC_CG_RES_TYPES; i++) { - usage = atomic_long_read(&cg->res[i].usage); + usage = atomic64_read(&cg->res[i].usage); if (READ_ONCE(misc_res_capacity[i]) || usage) - seq_printf(sf, "%s %lu\n", misc_res_name[i], usage); + seq_printf(sf, "%s %llu\n", misc_res_name[i], usage); 
} return 0; @@ -322,12 +320,12 @@ static int misc_cg_current_show(struct seq_file *sf, void *v) static int misc_cg_capacity_show(struct seq_file *sf, void *v) { int i; - unsigned long cap; + u64 cap; for (i = 0; i < MISC_CG_RES_TYPES; i++) { cap = READ_ONCE(misc_res_capacity[i]); if (cap) - seq_printf(sf, "%s %lu\n", misc_res_name[i], cap); + seq_printf(sf, "%s %llu\n", misc_res_name[i], cap); } return 0; @@ -336,12 +334,13 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v) static int misc_events_show(struct seq_file *sf, void *v) { struct misc_cg *cg = css_misc(seq_css(sf)); - unsigned long events, i; + u64 events; + int i; for (i = 0; i < MISC_CG_RES_TYPES; i++) { - events = atomic_long_read(&cg->res[i].events); + events = atomic64_read(&cg->res[i].events); if (READ_ONCE(misc_res_capacity[i]) || events) - seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events); + seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events); } return 0; } @@ -397,7 +396,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css) for (i = 0; i < MISC_CG_RES_TYPES; i++) { WRITE_ONCE(cg->res[i].max, MAX_NUM); - atomic_long_set(&cg->res[i].usage, 0); + atomic64_set(&cg->res[i].usage, 0); } return &cg->css; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 0d5c29879a50..144a464e45c6 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -149,9 +149,3 @@ const struct proc_ns_operations cgroupns_operations = { .install = cgroupns_install, .owner = cgroupns_owner, }; - -static __init int cgroup_namespaces_init(void) -{ - return 0; -} -subsys_initcall(cgroup_namespaces_init); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 2542c21b6b6d..d80d7a608141 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -344,6 +344,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); + struct 
cgroup_rstat_cpu *prstatc; struct cgroup_base_stat delta; unsigned seq; @@ -357,17 +358,24 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) delta = rstatc->bstat; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); - /* propagate percpu delta to global */ + /* propagate per-cpu delta to cgroup and per-cpu global statistics */ cgroup_base_stat_sub(&delta, &rstatc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); + cgroup_base_stat_add(&rstatc->subtree_bstat, &delta); - /* propagate global delta to parent (unless that's root) */ + /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); + + delta = rstatc->subtree_bstat; + prstatc = cgroup_rstat_cpu(parent, cpu); + cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat); + cgroup_base_stat_add(&prstatc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta); } } diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index e8db8d938661..4722b998a324 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -1,3 +1,5 @@ +# Help: Debugging for CI systems and finding regressions +# # The config is based on running daily CI for enterprise Linux distros to # seek regressions on linux-next builds on different bare-metal and virtual # platforms. 
It can be used for example, diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config index 208481d91090..d0877063d925 100644 --- a/kernel/configs/kvm_guest.config +++ b/kernel/configs/kvm_guest.config @@ -1,3 +1,4 @@ +# Help: Bootable as a KVM guest CONFIG_NET=y CONFIG_NET_CORE=y CONFIG_NETDEVICES=y diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config index 81ff07863576..ebfdc3d8aa9a 100644 --- a/kernel/configs/nopm.config +++ b/kernel/configs/nopm.config @@ -1,3 +1,5 @@ +# Help: Disable Power Management + CONFIG_PM=n CONFIG_SUSPEND=n CONFIG_HIBERNATION=n diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config index 38a7c5362c9c..2c6e001a7284 100644 --- a/kernel/configs/rust.config +++ b/kernel/configs/rust.config @@ -1 +1,2 @@ +# Help: Enable Rust CONFIG_RUST=y diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config index 6fac5b405334..35f48671b8d5 100644 --- a/kernel/configs/x86_debug.config +++ b/kernel/configs/x86_debug.config @@ -1,3 +1,4 @@ +# Help: Debugging options for tip tree testing CONFIG_X86_DEBUG_FPU=y CONFIG_LOCK_STAT=y CONFIG_DEBUG_VM=y diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config index 436f806aa1ed..6878b9a49be8 100644 --- a/kernel/configs/xen.config +++ b/kernel/configs/xen.config @@ -1,3 +1,5 @@ +# Help: Bootable as a Xen guest +# # global stuff - these enable us to allow some # of the not so generic stuff below for xen CONFIG_PARAVIRT=y diff --git a/kernel/cpu.c b/kernel/cpu.c index f6811c857102..6de7c6bb74ee 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1487,8 +1487,22 @@ out: return ret; } +struct cpu_down_work { + unsigned int cpu; + enum cpuhp_state target; +}; + +static long __cpu_down_maps_locked(void *arg) +{ + struct cpu_down_work *work = arg; + + return _cpu_down(work->cpu, 0, work->target); +} + static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) { + struct cpu_down_work work = { .cpu = cpu, .target = target, }; + 
/* * If the platform does not support hotplug, report it explicitly to * differentiate it from a transient offlining failure. @@ -1497,7 +1511,15 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) return -EOPNOTSUPP; if (cpu_hotplug_disabled) return -EBUSY; - return _cpu_down(cpu, 0, target); + + /* + * Ensure that the control task does not run on the to be offlined + * CPU to prevent a deadlock against cfs_b->period_timer. + */ + cpu = cpumask_any_but(cpu_online_mask, cpu); + if (cpu >= nr_cpu_ids) + return -EBUSY; + return work_on_cpu(cpu, __cpu_down_maps_locked, &work); } static int cpu_down(unsigned int cpu, enum cpuhp_state target) diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index d5e9ccde3ab8..621037a0aa87 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -968,7 +968,7 @@ static int __init opt_kgdb_con(char *str) early_param("kgdbcon", opt_kgdb_con); #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_dbg(int key) +static void sysrq_handle_dbg(u8 key) { if (!dbg_io_ops) { pr_crit("ERROR: No KGDB I/O module available\n"); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 813cb6cf72d6..9443bc63c5a2 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -590,6 +590,8 @@ static void kdb_msg_write(const char *msg, int msg_len) continue; if (c == dbg_io_ops->cons) continue; + if (!c->write) + continue; /* * Set oops_in_progress to encourage the console drivers to * disregard their internal spin locks: in the current calling diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 4c1e9a3c0ab6..f488997b0717 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -160,7 +160,7 @@ if DMA_CMA config DMA_NUMA_CMA bool "Enable separate DMA Contiguous Memory Area for NUMA Node" - default NUMA + depends on NUMA help Enable this option to get numa CMA areas so that NUMA devices can get local memory by DMA coherent APIs. 
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 88c595e49e34..f005c66f378c 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -473,11 +473,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) return -EBUSY; } - if (memblock_is_region_reserved(rmem->base, rmem->size)) { - pr_info("Reserved memory: overlap with other memblock reserved region\n"); - return -EBUSY; - } - if (!of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index f190651bcadd..06366acd27b0 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -637,15 +637,19 @@ static struct dma_debug_entry *__dma_entry_alloc(void) return entry; } -static void __dma_entry_alloc_check_leak(void) +/* + * This should be called outside of free_entries_lock scope to avoid potential + * deadlocks with serial consoles that use DMA. + */ +static void __dma_entry_alloc_check_leak(u32 nr_entries) { - u32 tmp = nr_total_entries % nr_prealloc_entries; + u32 tmp = nr_entries % nr_prealloc_entries; /* Shout each time we tick over some multiple of the initial pool */ if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) { pr_info("dma_debug_entry pool grown to %u (%u00%%)\n", - nr_total_entries, - (nr_total_entries / nr_prealloc_entries)); + nr_entries, + (nr_entries / nr_prealloc_entries)); } } @@ -656,8 +660,10 @@ static void __dma_entry_alloc_check_leak(void) */ static struct dma_debug_entry *dma_entry_alloc(void) { + bool alloc_check_leak = false; struct dma_debug_entry *entry; unsigned long flags; + u32 nr_entries; spin_lock_irqsave(&free_entries_lock, flags); if (num_free_entries == 0) { @@ -667,13 +673,17 @@ static struct dma_debug_entry *dma_entry_alloc(void) pr_err("debugging out of memory - disabling\n"); return NULL; } - __dma_entry_alloc_check_leak(); + alloc_check_leak = true; + nr_entries = nr_total_entries; } entry = __dma_entry_alloc(); 
spin_unlock_irqrestore(&free_entries_lock, flags); + if (alloc_check_leak) + __dma_entry_alloc_check_leak(nr_entries); + #ifdef CONFIG_STACKTRACE entry->stack_len = stack_trace_save(entry->stack_entries, ARRAY_SIZE(entry->stack_entries), diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 1acec2e22827..b481c48a31a6 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -135,9 +135,9 @@ encrypt_mapping: remove_mapping: #ifdef CONFIG_DMA_DIRECT_REMAP dma_common_free_remap(addr, pool_size); -#endif -free_page: __maybe_unused +free_page: __free_pages(page, order); +#endif out: return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index a9c18d480dc5..3b6d20dfb9a8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm) */ void __mmdrop(struct mm_struct *mm) { - int i; - BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); @@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm) put_user_ns(mm->user_ns); mm_pasid_drop(mm); mm_destroy_cid(mm); + percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); - for (i = 0; i < NR_MM_COUNTERS; i++) - percpu_counter_destroy(&mm->rss_stat[i]); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1260,8 +1257,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { - int i; - mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); @@ -1309,17 +1304,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_cid(mm)) goto fail_cid; - for (i = 0; i < NR_MM_COUNTERS; i++) - if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) - goto fail_pcpu; + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; fail_pcpu: - while (i > 0) 
- percpu_counter_destroy(&mm->rss_stat[--i]); mm_destroy_cid(mm); fail_cid: destroy_context(mm); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ca385b61d546..0c6185aefaef 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2232,8 +2232,7 @@ int register_kretprobe(struct kretprobe *rp) return -ENOMEM; for (i = 0; i < rp->maxactive; i++) { - inst = kzalloc(sizeof(struct kretprobe_instance) + - rp->data_size, GFP_KERNEL); + inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL); if (inst == NULL) { rethook_free(rp->rh); rp->rh = NULL; @@ -2256,8 +2255,7 @@ int register_kretprobe(struct kretprobe *rp) rp->rph->rp = rp; for (i = 0; i < rp->maxactive; i++) { - inst = kzalloc(sizeof(struct kretprobe_instance) + - rp->data_size, GFP_KERNEL); + inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL); if (inst == NULL) { refcount_set(&rp->rph->ref, i); free_rp_inst(rp); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 562aa0e450ed..1f306f158696 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -23,7 +23,7 @@ static void do_poweroff(struct work_struct *dummy) static DECLARE_WORK(poweroff_work, do_poweroff); -static void handle_poweroff(int key) +static void handle_poweroff(u8 key) { /* run sysrq poweroff on boot cpu */ schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 2a17704136f1..7d4979d5c3ce 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -103,3 +103,5 @@ struct printk_message { u64 seq; unsigned long dropped; }; + +bool other_cpu_in_panic(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 357a4d18f638..7e0b4dd02398 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -88,7 +88,7 @@ EXPORT_SYMBOL(oops_in_progress); static DEFINE_MUTEX(console_mutex); /* - * console_sem protects updates to console->seq and console_suspended, + * console_sem protects 
updates to console->seq * and also provides serialization for console printing. */ static DEFINE_SEMAPHORE(console_sem, 1); @@ -361,7 +361,7 @@ static bool panic_in_progress(void) * paths in the console code where we end up in places I want * locked without the console semaphore held). */ -static int console_locked, console_suspended; +static int console_locked; /* * Array of consoles built from command line options (console=) @@ -2308,7 +2308,11 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - wake_up_klogd(); + if (in_sched) + defer_console_output(); + else + wake_up_klogd(); + return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -2547,22 +2551,46 @@ MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to hig */ void suspend_console(void) { + struct console *con; + if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); pr_flush(1000, true); - console_lock(); - console_suspended = 1; - up_console_sem(); + + console_list_lock(); + for_each_console(con) + console_srcu_write_flags(con, con->flags | CON_SUSPENDED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All printing + * contexts must be able to see that they are suspended so that it + * is guaranteed that all printing has stopped when this function + * completes. + */ + synchronize_srcu(&console_srcu); } void resume_console(void) { + struct console *con; + if (!console_suspend_enabled) return; - down_console_sem(); - console_suspended = 0; - console_unlock(); + + console_list_lock(); + for_each_console(con) + console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All printing + * contexts must be able to see they are no longer suspended so + * that they are guaranteed to wake up and resume printing. 
+ */ + synchronize_srcu(&console_srcu); + pr_flush(1000, true); } @@ -2585,6 +2613,26 @@ static int console_cpu_notify(unsigned int cpu) return 0; } +/* + * Return true if a panic is in progress on a remote CPU. + * + * On true, the local CPU should immediately release any printing resources + * that may be needed by the panic CPU. + */ +bool other_cpu_in_panic(void) +{ + if (!panic_in_progress()) + return false; + + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return atomic_read(&panic_cpu) != raw_smp_processor_id(); +} + /** * console_lock - block the console subsystem from printing * @@ -2597,9 +2645,11 @@ void console_lock(void) { might_sleep(); + /* On panic, the console_lock must be left to the panic cpu. */ + while (other_cpu_in_panic()) + msleep(1000); + down_console_sem(); - if (console_suspended) - return; console_locked = 1; console_may_schedule = 1; } @@ -2615,12 +2665,11 @@ EXPORT_SYMBOL(console_lock); */ int console_trylock(void) { - if (down_trylock_console_sem()) + /* On panic, the console_lock must be left to the panic cpu. */ + if (other_cpu_in_panic()) return 0; - if (console_suspended) { - up_console_sem(); + if (down_trylock_console_sem()) return 0; - } console_locked = 1; console_may_schedule = 0; return 1; @@ -2634,25 +2683,6 @@ int is_console_locked(void) EXPORT_SYMBOL(is_console_locked); /* - * Return true when this CPU should unlock console_sem without pushing all - * messages to the console. This reduces the chance that the console is - * locked when the panic CPU tries to use it. - */ -static bool abandon_console_lock_in_panic(void) -{ - if (!panic_in_progress()) - return false; - - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. 
If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return atomic_read(&panic_cpu) != raw_smp_processor_id(); -} - -/* * Check if the given console is currently capable and allowed to print * records. * @@ -2665,6 +2695,9 @@ static inline bool console_is_usable(struct console *con) if (!(flags & CON_ENABLED)) return false; + if ((flags & CON_SUSPENDED)) + return false; + if (!con->write) return false; @@ -2948,7 +2981,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ - if (abandon_console_lock_in_panic()) + if (other_cpu_in_panic()) goto abandon; if (do_cond_resched) @@ -2983,11 +3016,6 @@ void console_unlock(void) bool flushed; u64 next_seq; - if (console_suspended) { - up_console_sem(); - return; - } - /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @@ -3045,10 +3073,28 @@ EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { + bool found_unblank = false; struct console *c; int cookie; /* + * First check if there are any consoles implementing the unblank() + * callback. If not, there is no reason to continue and take the + * console lock, which in particular can be dangerous if + * @oops_in_progress is set. + */ + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { + if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) { + found_unblank = true; + break; + } + } + console_srcu_read_unlock(cookie); + if (!found_unblank) + return; + + /* * Stop console printing because the unblank() callback may * assume the console is not within its write() callback. * @@ -3056,6 +3102,16 @@ void console_unblank(void) * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { + /* Semaphores are not NMI-safe. 
*/ + if (in_nmi()) + return; + + /* + * Attempting to trylock the console lock can deadlock + * if another CPU was stopped while modifying the + * semaphore. "Hope and pray" that this is not the + * current situation. + */ if (down_trylock_console_sem() != 0) return; } else @@ -3085,14 +3141,24 @@ void console_unblank(void) */ void console_flush_on_panic(enum con_flush_mode mode) { + bool handover; + u64 next_seq; + /* - * If someone else is holding the console lock, trylock will fail - * and may_schedule may be set. Ignore and proceed to unlock so - * that messages are flushed out. As this can be called from any - * context and we don't want to get preempted while flushing, - * ensure may_schedule is cleared. + * Ignore the console lock and flush out the messages. Attempting a + * trylock would not be useful because: + * + * - if it is contended, it must be ignored anyway + * - console_lock() and console_trylock() block and fail + * respectively in panic for non-panic CPUs + * - semaphores are not NMI-safe + */ + + /* + * If another context is holding the console lock, + * @console_may_schedule might be set. Clear it so that + * this context does not call cond_resched() while flushing. */ - console_trylock(); console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) { @@ -3105,15 +3171,15 @@ void console_flush_on_panic(enum con_flush_mode mode) cookie = console_srcu_read_lock(); for_each_console_srcu(c) { /* - * If the above console_trylock() failed, this is an - * unsynchronized assignment. But in that case, the + * This is an unsynchronized assignment, but the * kernel is in "hope and pray" mode anyway. 
*/ c->seq = seq; } console_srcu_read_unlock(cookie); } - console_unlock(); + + console_flush_all(false, &next_seq, &handover); } /* @@ -3679,8 +3745,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre /* * Hold the console_lock to guarantee safe access to - * console->seq and to prevent changes to @console_suspended - * until all consoles have been processed. + * console->seq. */ console_lock(); @@ -3688,6 +3753,11 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre for_each_console_srcu(c) { if (con && con != c) continue; + /* + * If consoles are not usable, it cannot be expected + * that they make forward progress, so only increment + * @diff for usable consoles. + */ if (!console_is_usable(c)) continue; printk_seq = c->seq; @@ -3696,18 +3766,12 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre } console_srcu_read_unlock(cookie); - /* - * If consoles are suspended, it cannot be expected that they - * make forward progress, so timeout immediately. @diff is - * still used to return a valid flush status. - */ - if (console_suspended) - remaining = 0; - else if (diff != last_diff && reset_on_progress) + if (diff != last_diff && reset_on_progress) remaining = timeout_ms; console_unlock(); + /* Note: @diff is 0 if there are no usable consoles. */ if (diff == 0 || remaining == 0) break; @@ -3741,7 +3805,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * printer has been seen to make some forward progress. * * Context: Process context. May sleep while acquiring console lock. - * Return: true if all enabled printers are caught up. + * Return: true if all usable printers are caught up. 
*/ static bool pr_flush(int timeout_ms, bool reset_on_progress) { @@ -3798,11 +3862,33 @@ static void __wake_up_klogd(int val) preempt_enable(); } +/** + * wake_up_klogd - Wake kernel logging daemon + * + * Use this function when new records have been added to the ringbuffer + * and the console printing of those records has already occurred or is + * known to be handled by some other context. This function will only + * wake the logging daemon. + * + * Context: Any context. + */ void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } +/** + * defer_console_output - Wake kernel logging daemon and trigger + * console printing in a deferred context + * + * Use this function when new records have been added to the ringbuffer, + * this context is responsible for console printing those records, but + * the current context is not allowed to perform the console printing. + * Trigger an irq_work context to perform the console printing. This + * function also wakes the logging daemon. + * + * Context: Any context. + */ void defer_console_output(void) { /* @@ -3819,12 +3905,7 @@ void printk_trigger_flush(void) int vprintk_deferred(const char *fmt, va_list args) { - int r; - - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); - defer_console_output(); - - return r; + return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); } int _printk_deferred(const char *fmt, ...) 
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 2dc4d5a1f1ff..fde338606ce8 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -1735,7 +1735,7 @@ static bool copy_data(struct prb_data_ring *data_ring, if (!buf || !buf_size) return true; - data_size = min_t(u16, buf_size, len); + data_size = min_t(unsigned int, buf_size, len); memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ return true; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index ef0f9a2044da..6d10927a07d8 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -38,13 +38,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. */ - if (this_cpu_read(printk_context) || in_nmi()) { - int len; - - len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); - defer_console_output(); - return len; - } + if (this_cpu_read(printk_context) || in_nmi()) + return vprintk_deferred(fmt, args); /* No obstacles. */ return vprintk_default(fmt, args); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b10b8349bb2a..6f06dc12904a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -1035,7 +1035,7 @@ static bool sysrq_rcu; module_param(sysrq_rcu, bool, 0444); /* Dump grace-period-request information due to commandeered sysrq. */ -static void sysrq_show_rcu(int key) +static void sysrq_show_rcu(u8 key) { show_rcu_gp_kthreads(); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 911d0063763c..8dbff6e7ad4f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -699,7 +699,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) * * XXX could add max_slice to the augmented data to track this. 
*/ -void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { s64 lag, limit; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4df14db4da49..87015e9deacc 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1045,7 +1045,7 @@ static bool report_idle_softirq(void) return false; /* On RT, softirqs handling may be waiting on some lock */ - if (!local_bh_blocked()) + if (local_bh_blocked()) return false; pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 64b61f67a403..057cd975d014 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o +obj-$(CONFIG_PROBE_EVENTS_BTF_ARGS) += trace_btf.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 05c0024815bf..8de8bec5f366 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6779,8 +6779,7 @@ void ftrace_release_mod(struct module *mod) last_pg = &ftrace_pages_start; for (pg = ftrace_pages_start; pg; pg = *last_pg) { rec = &pg->records[0]; - if (within_module_core(rec->ip, mod) || - within_module_init(rec->ip, mod)) { + if (within_module(rec->ip, mod)) { /* * As core pages are first, the first * page should never be a module page. @@ -6852,8 +6851,7 @@ void ftrace_module_enable(struct module *mod) * not part of this module, then skip this pg, * which the "break" will do. 
*/ - if (!within_module_core(rec->ip, mod) && - !within_module_init(rec->ip, mod)) + if (!within_module(rec->ip, mod)) break; /* Weak functions should still be ignored */ @@ -7142,9 +7140,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) struct dyn_ftrace key; struct ftrace_mod_map *mod_map = NULL; struct ftrace_init_func *func, *func_next; - struct list_head clear_hash; - - INIT_LIST_HEAD(&clear_hash); + LIST_HEAD(clear_hash); key.ip = start; key.flags = end; /* overload flags, as it is unsigned long */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 52dea5dd5362..78502d4c7214 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -692,10 +692,7 @@ static void rb_time_set(rb_time_t *t, u64 val) static inline bool rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) { - unsigned long ret; - - ret = local_cmpxchg(l, expect, set); - return ret == expect; + return local_try_cmpxchg(l, &expect, set); } static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) @@ -752,9 +749,7 @@ static void rb_time_set(rb_time_t *t, u64 val) static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) { - u64 val; - val = local64_cmpxchg(&t->time, expect, set); - return val == expect; + return local64_try_cmpxchg(&t->time, &expect, set); } #endif @@ -1494,14 +1489,11 @@ static bool rb_head_page_replace(struct buffer_page *old, { unsigned long *ptr = (unsigned long *)&old->list.prev->next; unsigned long val; - unsigned long ret; val = *ptr & ~RB_FLAG_MASK; val |= RB_PAGE_HEAD; - ret = cmpxchg(ptr, val, (unsigned long)&new->list); - - return ret == val; + return try_cmpxchg(ptr, &val, (unsigned long)&new->list); } /* @@ -3003,7 +2995,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, { unsigned long new_index, old_index; struct buffer_page *bpage; - unsigned long index; unsigned long addr; u64 write_stamp; u64 delta; @@ -3060,8 +3051,9 @@ rb_try_to_discard(struct 
ring_buffer_per_cpu *cpu_buffer, */ old_index += write_mask; new_index += write_mask; - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) { + + /* caution: old_index gets updated on cmpxchg failure */ + if (local_try_cmpxchg(&bpage->write, &old_index, new_index)) { /* update counters */ local_sub(event_length, &cpu_buffer->entries_bytes); return true; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8e64aaad5361..2b4ded753367 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3119,7 +3119,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; - void *ptr; /* * Add one, for this function and the call to save_stack_trace() @@ -3157,32 +3156,16 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, nr_entries = stack_trace_save(fstack->calls, size, skip); } - size = nr_entries * sizeof(unsigned long); event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, - (sizeof(*entry) - sizeof(entry->caller)) + size, + struct_size(entry, caller, nr_entries), trace_ctx); if (!event) goto out; - ptr = ring_buffer_event_data(event); - entry = ptr; - - /* - * For backward compatibility reasons, the entry->caller is an - * array of 8 slots to store the stack. This is also exported - * to user space. The amount allocated on the ring buffer actually - * holds enough for the stack specified by nr_entries. This will - * go into the location of entry->caller. Due to string fortifiers - * checking the size of the destination of memcpy() it triggers - * when it detects that size is greater than 8. To hide this from - * the fortifiers, we use "ptr" and pointer arithmetic to assign caller. 
- * The below is really just: - * memcpy(&entry->caller, fstack->calls, size); - */ - ptr += offsetof(typeof(*entry), caller); - memcpy(ptr, fstack->calls, size); + entry = ring_buffer_event_data(event); entry->size = nr_entries; + memcpy(&entry->caller, fstack->calls, + flex_array_size(entry, caller, nr_entries)); if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); @@ -4206,18 +4189,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) loff_t l = 0; int cpu; - /* - * copy the tracer to avoid using a global lock all around. - * iter->trace is a copy of current_trace, the pointer to the - * name may be used instead of a strcmp(), as iter->trace->name - * will point to the same string as current_trace->name. - */ mutex_lock(&trace_types_lock); - if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) { + if (unlikely(tr->current_trace != iter->trace)) { /* Close iter->trace before switching to the new current tracer */ if (iter->trace->close) iter->trace->close(iter); - *iter->trace = *tr->current_trace; + iter->trace = tr->current_trace; /* Reopen the new current tracer */ if (iter->trace->open) iter->trace->open(iter); @@ -4829,6 +4806,25 @@ static const struct seq_operations tracer_seq_ops = { .show = s_show, }; +/* + * Note, as iter itself can be allocated and freed in different + * ways, this function is only used to free its content, and not + * the iterator itself. The only requirement to all the allocations + * is that it must zero all fields (kzalloc), as freeing works with + * either allocated content or NULL.
+ */ +static void free_trace_iter_content(struct trace_iterator *iter) +{ + /* The fmt is either NULL, allocated or points to static_fmt_buf */ + if (iter->fmt != static_fmt_buf) + kfree(iter->fmt); + + kfree(iter->temp); + kfree(iter->buffer_iter); + mutex_destroy(&iter->mutex); + free_cpumask_var(iter->started); +} + static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { @@ -4870,16 +4866,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->fmt = NULL; iter->fmt_size = 0; - /* - * We make a copy of the current tracer to avoid concurrent - * changes on it while we are reading. - */ mutex_lock(&trace_types_lock); - iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) - goto fail; - - *iter->trace = *tr->current_trace; + iter->trace = tr->current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; @@ -4944,9 +4932,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) fail: mutex_unlock(&trace_types_lock); - kfree(iter->trace); - kfree(iter->temp); - kfree(iter->buffer_iter); + free_trace_iter_content(iter); release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); @@ -5025,12 +5011,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); - mutex_destroy(&iter->mutex); - free_cpumask_var(iter->started); - kfree(iter->fmt); - kfree(iter->temp); - kfree(iter->trace); - kfree(iter->buffer_iter); + free_trace_iter_content(iter); seq_release_private(inode, file); return 0; @@ -5730,7 +5711,8 @@ static const char readme_msg[] = "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API #ifdef CONFIG_PROBE_EVENTS_BTF_ARGS - "\t $stack<index>, $stack, $retval, $comm, $arg<N>, <argname>\n" + "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" + "\t <argname>[->field[->field|.field...]],\n" #else "\t $stack<index>, 
$stack, $retval, $comm, $arg<N>,\n" #endif @@ -6318,6 +6300,15 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val) per_cpu_ptr(buf->data, cpu)->entries = val; } +static void update_buffer_entries(struct array_buffer *buf, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); + } else { + per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu); + } +} + #ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, @@ -6396,18 +6387,12 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, return ret; } - if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&tr->max_buffer, size); - else - per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; + update_buffer_entries(&tr->max_buffer, cpu); out: #endif /* CONFIG_TRACER_MAX_TRACE */ - if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&tr->array_buffer, size); - else - per_cpu_ptr(tr->array_buffer.data, cpu)->entries = size; + update_buffer_entries(&tr->array_buffer, cpu); return ret; } @@ -6825,10 +6810,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) close_pipe_on_cpu(tr, iter->cpu_file); mutex_unlock(&trace_types_lock); - free_cpumask_var(iter->started); - kfree(iter->fmt); - kfree(iter->temp); - mutex_destroy(&iter->mutex); + free_trace_iter_content(iter); kfree(iter); trace_array_put(tr); @@ -7618,6 +7600,11 @@ out: return ret; } +static void tracing_swap_cpu_buffer(void *tr) +{ + update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); +} + static ssize_t tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -7676,13 +7663,15 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) break; - local_irq_disable(); /* Now, we're going 
to swap */ - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { + local_irq_disable(); update_max_tr(tr, current, smp_processor_id(), NULL); - else - update_max_tr_single(tr, current, iter->cpu_file); - local_irq_enable(); + local_irq_enable(); + } else { + smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, + (void *)tr, 1); + } break; default: if (tr->allocated_snapshot) { @@ -9486,7 +9475,7 @@ static struct trace_array *trace_array_create(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; - if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) goto out_free_tr; tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; @@ -10431,7 +10420,7 @@ __init static int tracer_alloc_buffers(void) if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; - if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) goto out_free_savedcmd; /* TODO: make the number of buffers hot pluggable with CPUS */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 73eaec158473..5669dd1f90d9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -77,6 +77,16 @@ enum trace_type { #undef __array #define __array(type, item, size) type item[size]; +/* + * For backward compatibility, older user space expects to see the + * kernel_stack event with a fixed size caller field. But today the fix + * size is ignored by the kernel, and the real structure is dynamic. 
+ * Expose to user space: "unsigned long caller[8];" but the real structure + * will be "unsigned long caller[] __counted_by(size)" + */ +#undef __stack_array +#define __stack_array(type, item, size, field) type item[] __counted_by(field); + #undef __array_desc #define __array_desc(type, container, item, size) @@ -596,7 +606,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void tracing_reset_online_cpus(struct array_buffer *buf); -void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); @@ -697,7 +706,6 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); int trace_pid_show(struct seq_file *m, void *v); -void trace_free_pid_list(struct trace_pid_list *pid_list); int trace_pid_write(struct trace_pid_list *filtered_pids, struct trace_pid_list **new_pid_list, const char __user *ubuf, size_t cnt); @@ -1334,7 +1342,7 @@ struct trace_subsystem_dir { struct list_head list; struct event_subsystem *subsystem; struct trace_array *tr; - struct dentry *entry; + struct eventfs_file *ef; int ref_count; int nr_events; }; diff --git a/kernel/trace/trace_btf.c b/kernel/trace/trace_btf.c new file mode 100644 index 000000000000..ca224d53bfdc --- /dev/null +++ b/kernel/trace/trace_btf.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/btf.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include "trace_btf.h" + +/* + * Find a function proto type by name, and return the btf_type with its btf + * in *@btf_p. Return NULL if not found. + * Note that caller has to call btf_put(*@btf_p) after using the btf_type. 
+ */ +const struct btf_type *btf_find_func_proto(const char *func_name, struct btf **btf_p) +{ + const struct btf_type *t; + s32 id; + + id = bpf_find_btf_id(func_name, BTF_KIND_FUNC, btf_p); + if (id < 0) + return NULL; + + /* Get BTF_KIND_FUNC type */ + t = btf_type_by_id(*btf_p, id); + if (!t || !btf_type_is_func(t)) + goto err; + + /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */ + t = btf_type_by_id(*btf_p, t->type); + if (!t || !btf_type_is_func_proto(t)) + goto err; + + return t; +err: + btf_put(*btf_p); + return NULL; +} + +/* + * Get function parameter with the number of parameters. + * This can return NULL if the function has no parameters. + * It can return -EINVAL if the @func_proto is not a function proto type. + */ +const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s32 *nr) +{ + if (!btf_type_is_func_proto(func_proto)) + return ERR_PTR(-EINVAL); + + *nr = btf_type_vlen(func_proto); + if (*nr > 0) + return (const struct btf_param *)(func_proto + 1); + else + return NULL; +} + +#define BTF_ANON_STACK_MAX 16 + +struct btf_anon_stack { + u32 tid; + u32 offset; +}; + +/* + * Find a member of data structure/union by name and return it. + * Return NULL if not found, or -EINVAL if parameter is invalid. + * If the member is an member of anonymous union/structure, the offset + * of that anonymous union/structure is stored into @anon_offset. Caller + * can calculate the correct offset from the root data structure by + * adding anon_offset to the member's offset. 
+ */ +const struct btf_member *btf_find_struct_member(struct btf *btf, + const struct btf_type *type, + const char *member_name, + u32 *anon_offset) +{ + struct btf_anon_stack *anon_stack; + const struct btf_member *member; + u32 tid, cur_offset = 0; + const char *name; + int i, top = 0; + + anon_stack = kcalloc(BTF_ANON_STACK_MAX, sizeof(*anon_stack), GFP_KERNEL); + if (!anon_stack) + return ERR_PTR(-ENOMEM); + +retry: + if (!btf_type_is_struct(type)) { + member = ERR_PTR(-EINVAL); + goto out; + } + + for_each_member(i, type, member) { + if (!member->name_off) { + /* Anonymous union/struct: push it for later use */ + type = btf_type_skip_modifiers(btf, member->type, &tid); + if (type && top < BTF_ANON_STACK_MAX) { + anon_stack[top].tid = tid; + anon_stack[top++].offset = + cur_offset + member->offset; + } + } else { + name = btf_name_by_offset(btf, member->name_off); + if (name && !strcmp(member_name, name)) { + if (anon_offset) + *anon_offset = cur_offset; + goto out; + } + } + } + if (top > 0) { + /* Pop from the anonymous stack and retry */ + tid = anon_stack[--top].tid; + cur_offset = anon_stack[top].offset; + type = btf_type_by_id(btf, tid); + goto retry; + } + member = NULL; + +out: + kfree(anon_stack); + return member; +} + diff --git a/kernel/trace/trace_btf.h b/kernel/trace/trace_btf.h new file mode 100644 index 000000000000..4bc44bc261e6 --- /dev/null +++ b/kernel/trace/trace_btf.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/btf.h> + +const struct btf_type *btf_find_func_proto(const char *func_name, + struct btf **btf_p); +const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, + s32 *nr); +const struct btf_member *btf_find_struct_member(struct btf *btf, + const struct btf_type *type, + const char *member_name, + u32 *anon_offset); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 340b2fa98218..c47422b20908 100644 --- a/kernel/trace/trace_entries.h +++ 
b/kernel/trace/trace_entries.h @@ -190,7 +190,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry, F_STRUCT( __field( int, size ) - __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + __stack_array( unsigned long, caller, FTRACE_STACK_ENTRIES, size) ), F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index a0a704ba27db..72714cbf475c 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -41,6 +41,10 @@ struct eprobe_data { struct trace_eprobe *ep; }; + +#define for_each_trace_eprobe_tp(ep, _tp) \ + list_for_each_entry(ep, trace_probe_probe_list(_tp), tp.list) + static int __trace_eprobe_create(int argc, const char *argv[]); static void trace_event_probe_cleanup(struct trace_eprobe *ep) @@ -640,7 +644,7 @@ static int disable_eprobe(struct trace_eprobe *ep, static int enable_trace_eprobe(struct trace_event_call *call, struct trace_event_file *file) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_eprobe *ep; bool enabled; int ret = 0; @@ -662,8 +666,7 @@ static int enable_trace_eprobe(struct trace_event_call *call, if (enabled) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - ep = container_of(pos, struct trace_eprobe, tp); + for_each_trace_eprobe_tp(ep, tp) { ret = enable_eprobe(ep, file); if (ret) break; @@ -680,8 +683,7 @@ static int enable_trace_eprobe(struct trace_event_call *call, */ WARN_ON_ONCE(ret != -ENOMEM); - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - ep = container_of(pos, struct trace_eprobe, tp); + for_each_trace_eprobe_tp(ep, tp) { disable_eprobe(ep, file->tr); if (!--cnt) break; @@ -699,7 +701,7 @@ static int enable_trace_eprobe(struct trace_event_call *call, static int disable_trace_eprobe(struct trace_event_call *call, struct trace_event_file *file) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_eprobe *ep; tp = trace_probe_primary_from_call(call); @@ -716,10 +718,8 @@ 
static int disable_trace_eprobe(struct trace_event_call *call, trace_probe_clear_flag(tp, TP_FLAG_PROFILE); if (!trace_probe_is_enabled(tp)) { - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - ep = container_of(pos, struct trace_eprobe, tp); + for_each_trace_eprobe_tp(ep, tp) disable_eprobe(ep, file->tr); - } } out: @@ -807,13 +807,11 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ int ret; ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); - if (ret) - return ret; - /* Handle symbols "@" */ if (!ret) ret = traceprobe_update_arg(&ep->tp.args[i]); + traceprobe_finish_parse(&ctx); return ret; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 578f1f7d49a6..ed367d713be0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -984,7 +984,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) return; if (!--dir->nr_events) { - tracefs_remove(dir->entry); + eventfs_remove(dir->ef); list_del(&dir->list); __put_system_dir(dir); } @@ -1005,7 +1005,7 @@ static void remove_event_file_dir(struct trace_event_file *file) tracefs_remove(dir); } - + eventfs_remove(file->ef); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); @@ -2291,13 +2291,13 @@ create_new_subsystem(const char *name) return NULL; } -static struct dentry * +static struct eventfs_file * event_subsystem_dir(struct trace_array *tr, const char *name, struct trace_event_file *file, struct dentry *parent) { struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; - struct dentry *entry; + int res; /* First see if we did not already create this dir */ list_for_each_entry(dir, &tr->systems, list) { @@ -2305,7 +2305,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, if (strcmp(system->name, name) == 0) { dir->nr_events++; file->system = dir; - return dir->entry; + return dir->ef; } } @@ -2329,8 +2329,8 @@ event_subsystem_dir(struct 
trace_array *tr, const char *name, } else __get_system(system); - dir->entry = tracefs_create_dir(name, parent); - if (!dir->entry) { + dir->ef = eventfs_add_subsystem_dir(name, parent); + if (IS_ERR(dir->ef)) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; @@ -2345,22 +2345,22 @@ event_subsystem_dir(struct trace_array *tr, const char *name, /* the ftrace system is special, do not create enable or filter files */ if (strcmp(name, "ftrace") != 0) { - entry = tracefs_create_file("filter", TRACE_MODE_WRITE, - dir->entry, dir, + res = eventfs_add_file("filter", TRACE_MODE_WRITE, + dir->ef, dir, &ftrace_subsystem_filter_fops); - if (!entry) { + if (res) { kfree(system->filter); system->filter = NULL; pr_warn("Could not create tracefs '%s/filter' entry\n", name); } - trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir, + eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir, &ftrace_system_enable_fops); } list_add(&dir->list, &tr->systems); - return dir->entry; + return dir->ef; out_free: kfree(dir); @@ -2413,36 +2413,37 @@ static int event_create_dir(struct dentry *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; + struct eventfs_file *ef_subsystem = NULL; struct trace_array *tr = file->tr; - struct dentry *d_events; const char *name; int ret; /* * If the trace point header did not define TRACE_SYSTEM - * then the system would be called "TRACE_SYSTEM". + * then the system would be called "TRACE_SYSTEM". This should + * never happen. 
*/ - if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { - d_events = event_subsystem_dir(tr, call->class->system, file, parent); - if (!d_events) - return -ENOMEM; - } else - d_events = parent; + if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0)) + return -ENODEV; + + ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent); + if (!ef_subsystem) + return -ENOMEM; name = trace_event_name(call); - file->dir = tracefs_create_dir(name, d_events); - if (!file->dir) { + file->ef = eventfs_add_dir(name, ef_subsystem); + if (IS_ERR(file->ef)) { pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file, + eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file, &ftrace_enable_fops); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", TRACE_MODE_READ, file->dir, + eventfs_add_file("id", TRACE_MODE_READ, file->ef, (void *)(long)call->event.type, &ftrace_event_id_fops); #endif @@ -2458,27 +2459,27 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) * triggers or filters. 
*/ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { - trace_create_file("filter", TRACE_MODE_WRITE, file->dir, + eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef, file, &ftrace_event_filter_fops); - trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, file, &event_trigger_fops); } #ifdef CONFIG_HIST_TRIGGERS - trace_create_file("hist", TRACE_MODE_READ, file->dir, file, + eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file, &event_hist_fops); #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG - trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file, + eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file, &event_hist_debug_fops); #endif - trace_create_file("format", TRACE_MODE_READ, file->dir, call, + eventfs_add_file("format", TRACE_MODE_READ, file->ef, call, &ftrace_event_format_fops); #ifdef CONFIG_TRACE_EVENT_INJECT if (call->event.type && call->class->reg) - trace_create_file("inject", 0200, file->dir, file, + eventfs_add_file("inject", 0200, file->ef, file, &event_inject_fops); #endif @@ -3631,21 +3632,22 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { struct dentry *d_events; struct dentry *entry; + int error = 0; entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) return -ENOMEM; - d_events = tracefs_create_dir("events", parent); - if (!d_events) { + d_events = eventfs_create_events_dir("events", parent); + if (IS_ERR(d_events)) { pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } - entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events, + error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events, tr, &ftrace_tr_enable_fops); - if (!entry) + if (error) return -ENOMEM; /* There are not as crucial, just warn if they are not created */ @@ -3658,11 +3660,11 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) 
&ftrace_set_event_notrace_pid_fops); /* ring buffer internal formats */ - trace_create_file("header_page", TRACE_MODE_READ, d_events, + eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); - trace_create_file("header_event", TRACE_MODE_READ, d_events, + eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); @@ -3750,7 +3752,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - tracefs_remove(tr->event_dir); + eventfs_remove_events_dir(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1dad64267878..33264e510d16 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -46,15 +46,19 @@ static const char * ops[] = { OPS }; enum filter_pred_fn { FILTER_PRED_FN_NOP, FILTER_PRED_FN_64, + FILTER_PRED_FN_64_CPUMASK, FILTER_PRED_FN_S64, FILTER_PRED_FN_U64, FILTER_PRED_FN_32, + FILTER_PRED_FN_32_CPUMASK, FILTER_PRED_FN_S32, FILTER_PRED_FN_U32, FILTER_PRED_FN_16, + FILTER_PRED_FN_16_CPUMASK, FILTER_PRED_FN_S16, FILTER_PRED_FN_U16, FILTER_PRED_FN_8, + FILTER_PRED_FN_8_CPUMASK, FILTER_PRED_FN_S8, FILTER_PRED_FN_U8, FILTER_PRED_FN_COMM, @@ -64,21 +68,25 @@ enum filter_pred_fn { FILTER_PRED_FN_PCHAR_USER, FILTER_PRED_FN_PCHAR, FILTER_PRED_FN_CPU, + FILTER_PRED_FN_CPU_CPUMASK, + FILTER_PRED_FN_CPUMASK, + FILTER_PRED_FN_CPUMASK_CPU, FILTER_PRED_FN_FUNCTION, FILTER_PRED_FN_, FILTER_PRED_TEST_VISITED, }; struct filter_pred { - enum filter_pred_fn fn_num; - u64 val; - u64 val2; - struct regex regex; + struct regex *regex; + struct cpumask *mask; unsigned short *ops; struct ftrace_event_field *field; - int offset; + u64 val; + u64 val2; + enum filter_pred_fn fn_num; + int offset; int not; - int op; + int op; }; /* @@ -94,6 +102,8 @@ struct 
filter_pred { C(TOO_MANY_OPEN, "Too many '('"), \ C(TOO_MANY_CLOSE, "Too few '('"), \ C(MISSING_QUOTE, "Missing matching quote"), \ + C(MISSING_BRACE_OPEN, "Missing '{'"), \ + C(MISSING_BRACE_CLOSE, "Missing '}'"), \ C(OPERAND_TOO_LONG, "Operand too long"), \ C(EXPECT_STRING, "Expecting string field"), \ C(EXPECT_DIGIT, "Expecting numeric field"), \ @@ -103,6 +113,7 @@ struct filter_pred { C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \ C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \ C(INVALID_FILTER, "Meaningless filter expression"), \ + C(INVALID_CPULIST, "Invalid cpulist"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ C(NO_FUNCTION, "Function not found"), \ @@ -186,6 +197,15 @@ enum { PROCESS_OR = 4, }; +static void free_predicate(struct filter_pred *pred) +{ + if (pred) { + kfree(pred->regex); + kfree(pred->mask); + kfree(pred); + } +} + /* * Without going into a formal proof, this explains the method that is used in * parsing the logical expressions. @@ -623,12 +643,64 @@ out_free: kfree(inverts); if (prog_stack) { for (i = 0; prog_stack[i].pred; i++) - kfree(prog_stack[i].pred); + free_predicate(prog_stack[i].pred); kfree(prog_stack); } return ERR_PTR(ret); } +static inline int +do_filter_cpumask(int op, const struct cpumask *mask, const struct cpumask *cmp) +{ + switch (op) { + case OP_EQ: + return cpumask_equal(mask, cmp); + case OP_NE: + return !cpumask_equal(mask, cmp); + case OP_BAND: + return cpumask_intersects(mask, cmp); + default: + return 0; + } +} + +/* Optimisation of do_filter_cpumask() for scalar fields */ +static inline int +do_filter_scalar_cpumask(int op, unsigned int cpu, const struct cpumask *mask) +{ + /* + * Per the weight-of-one cpumask optimisations, the mask passed in this + * function has a weight >= 2, so it is never equal to a single scalar. 
+ */ + switch (op) { + case OP_EQ: + return false; + case OP_NE: + return true; + case OP_BAND: + return cpumask_test_cpu(cpu, mask); + default: + return 0; + } +} + +static inline int +do_filter_cpumask_scalar(int op, const struct cpumask *mask, unsigned int cpu) +{ + switch (op) { + case OP_EQ: + return cpumask_test_cpu(cpu, mask) && + cpumask_nth(1, mask) >= nr_cpu_ids; + case OP_NE: + return !cpumask_test_cpu(cpu, mask) || + cpumask_nth(1, mask) < nr_cpu_ids; + case OP_BAND: + return cpumask_test_cpu(cpu, mask); + default: + return 0; + } +} + enum pred_cmp_types { PRED_CMP_TYPE_NOP, PRED_CMP_TYPE_LT, @@ -672,6 +744,18 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ } \ } +#define DEFINE_CPUMASK_COMPARISON_PRED(size) \ +static int filter_pred_##size##_cpumask(struct filter_pred *pred, void *event) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + unsigned int cpu = *addr; \ + \ + if (cpu >= nr_cpu_ids) \ + return 0; \ + \ + return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); \ +} + #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ @@ -693,6 +777,11 @@ DEFINE_COMPARISON_PRED(u16); DEFINE_COMPARISON_PRED(s8); DEFINE_COMPARISON_PRED(u8); +DEFINE_CPUMASK_COMPARISON_PRED(64); +DEFINE_CPUMASK_COMPARISON_PRED(32); +DEFINE_CPUMASK_COMPARISON_PRED(16); +DEFINE_CPUMASK_COMPARISON_PRED(8); + DEFINE_EQUALITY_PRED(64); DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); @@ -750,7 +839,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event) char *addr = (char *)(event + pred->offset); int cmp, match; - cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); + cmp = pred->regex->match(addr, pred->regex, pred->regex->field_len); match = cmp ^ pred->not; @@ -763,7 +852,7 @@ static __always_inline int filter_pchar(struct filter_pred *pred, char *str) int len; len = strlen(str) + 1; /* including tailing '\0' */ - cmp = pred->regex.match(str, 
&pred->regex, len); + cmp = pred->regex->match(str, pred->regex, len); match = cmp ^ pred->not; @@ -813,7 +902,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) char *addr = (char *)(event + str_loc); int cmp, match; - cmp = pred->regex.match(addr, &pred->regex, str_len); + cmp = pred->regex->match(addr, pred->regex, str_len); match = cmp ^ pred->not; @@ -836,7 +925,7 @@ static int filter_pred_strrelloc(struct filter_pred *pred, void *event) char *addr = (char *)(&item[1]) + str_loc; int cmp, match; - cmp = pred->regex.match(addr, &pred->regex, str_len); + cmp = pred->regex->match(addr, pred->regex, str_len); match = cmp ^ pred->not; @@ -869,12 +958,42 @@ static int filter_pred_cpu(struct filter_pred *pred, void *event) } } +/* Filter predicate for current CPU vs user-provided cpumask */ +static int filter_pred_cpu_cpumask(struct filter_pred *pred, void *event) +{ + int cpu = raw_smp_processor_id(); + + return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); +} + +/* Filter predicate for cpumask field vs user-provided cpumask */ +static int filter_pred_cpumask(struct filter_pred *pred, void *event) +{ + u32 item = *(u32 *)(event + pred->offset); + int loc = item & 0xffff; + const struct cpumask *mask = (event + loc); + const struct cpumask *cmp = pred->mask; + + return do_filter_cpumask(pred->op, mask, cmp); +} + +/* Filter predicate for cpumask field vs user-provided scalar */ +static int filter_pred_cpumask_cpu(struct filter_pred *pred, void *event) +{ + u32 item = *(u32 *)(event + pred->offset); + int loc = item & 0xffff; + const struct cpumask *mask = (event + loc); + unsigned int cpu = pred->val; + + return do_filter_cpumask_scalar(pred->op, mask, cpu); +} + /* Filter predicate for COMM. 
*/ static int filter_pred_comm(struct filter_pred *pred, void *event) { int cmp; - cmp = pred->regex.match(current->comm, &pred->regex, + cmp = pred->regex->match(current->comm, pred->regex, TASK_COMM_LEN); return cmp ^ pred->not; } @@ -1004,7 +1123,7 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) static void filter_build_regex(struct filter_pred *pred) { - struct regex *r = &pred->regex; + struct regex *r = pred->regex; char *search; enum regex_type type = MATCH_FULL; @@ -1169,7 +1288,7 @@ static void free_prog(struct event_filter *filter) return; for (i = 0; prog[i].pred; i++) - kfree(prog[i].pred); + free_predicate(prog[i].pred); kfree(prog); } @@ -1236,8 +1355,12 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, int filter_assign_type(const char *type) { - if (strstr(type, "__data_loc") && strstr(type, "char")) - return FILTER_DYN_STRING; + if (strstr(type, "__data_loc")) { + if (strstr(type, "char")) + return FILTER_DYN_STRING; + if (strstr(type, "cpumask_t")) + return FILTER_CPUMASK; + } if (strstr(type, "__rel_loc") && strstr(type, "char")) return FILTER_RDYN_STRING; @@ -1313,24 +1436,32 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event) switch (pred->fn_num) { case FILTER_PRED_FN_64: return filter_pred_64(pred, event); + case FILTER_PRED_FN_64_CPUMASK: + return filter_pred_64_cpumask(pred, event); case FILTER_PRED_FN_S64: return filter_pred_s64(pred, event); case FILTER_PRED_FN_U64: return filter_pred_u64(pred, event); case FILTER_PRED_FN_32: return filter_pred_32(pred, event); + case FILTER_PRED_FN_32_CPUMASK: + return filter_pred_32_cpumask(pred, event); case FILTER_PRED_FN_S32: return filter_pred_s32(pred, event); case FILTER_PRED_FN_U32: return filter_pred_u32(pred, event); case FILTER_PRED_FN_16: return filter_pred_16(pred, event); + case FILTER_PRED_FN_16_CPUMASK: + return filter_pred_16_cpumask(pred, event); case FILTER_PRED_FN_S16: return filter_pred_s16(pred, 
event); case FILTER_PRED_FN_U16: return filter_pred_u16(pred, event); case FILTER_PRED_FN_8: return filter_pred_8(pred, event); + case FILTER_PRED_FN_8_CPUMASK: + return filter_pred_8_cpumask(pred, event); case FILTER_PRED_FN_S8: return filter_pred_s8(pred, event); case FILTER_PRED_FN_U8: @@ -1349,6 +1480,12 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event) return filter_pred_pchar(pred, event); case FILTER_PRED_FN_CPU: return filter_pred_cpu(pred, event); + case FILTER_PRED_FN_CPU_CPUMASK: + return filter_pred_cpu_cpumask(pred, event); + case FILTER_PRED_FN_CPUMASK: + return filter_pred_cpumask(pred, event); + case FILTER_PRED_FN_CPUMASK_CPU: + return filter_pred_cpumask_cpu(pred, event); case FILTER_PRED_FN_FUNCTION: return filter_pred_function(pred, event); case FILTER_PRED_TEST_VISITED: @@ -1553,9 +1690,130 @@ static int parse_pred(const char *str, void *data, goto err_free; } - pred->regex.len = len; - strncpy(pred->regex.pattern, str + s, len); - pred->regex.pattern[len] = 0; + pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL); + if (!pred->regex) + goto err_mem; + pred->regex->len = len; + strncpy(pred->regex->pattern, str + s, len); + pred->regex->pattern[len] = 0; + + } else if (!strncmp(str + i, "CPUS", 4)) { + unsigned int maskstart; + bool single; + char *tmp; + + switch (field->filter_type) { + case FILTER_CPUMASK: + case FILTER_CPU: + case FILTER_OTHER: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + switch (op) { + case OP_EQ: + case OP_NE: + case OP_BAND: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + /* Skip CPUS */ + i += 4; + if (str[i++] != '{') { + parse_error(pe, FILT_ERR_MISSING_BRACE_OPEN, pos + i); + goto err_free; + } + maskstart = i; + + /* Walk the cpulist until closing } */ + for (; str[i] && str[i] != '}'; i++) + ; + + if (str[i] != '}') { + parse_error(pe, FILT_ERR_MISSING_BRACE_CLOSE, pos + i); + goto 
err_free; + } + + if (maskstart == i) { + parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i); + goto err_free; + } + + /* Copy the cpulist between { and } */ + tmp = kmalloc((i - maskstart) + 1, GFP_KERNEL); + if (!tmp) + goto err_mem; + + strscpy(tmp, str + maskstart, (i - maskstart) + 1); + pred->mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!pred->mask) { + kfree(tmp); + goto err_mem; + } + + /* Now parse it */ + if (cpulist_parse(tmp, pred->mask)) { + kfree(tmp); + parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i); + goto err_free; + } + kfree(tmp); + + /* Move along */ + i++; + + /* + * Optimisation: if the user-provided mask has a weight of one + * then we can treat it as a scalar input. + */ + single = cpumask_weight(pred->mask) == 1; + if (single) { + pred->val = cpumask_first(pred->mask); + kfree(pred->mask); + pred->mask = NULL; + } + + if (field->filter_type == FILTER_CPUMASK) { + pred->fn_num = single ? + FILTER_PRED_FN_CPUMASK_CPU : + FILTER_PRED_FN_CPUMASK; + } else if (field->filter_type == FILTER_CPU) { + if (single) { + if (pred->op == OP_BAND) + pred->op = OP_EQ; + + pred->fn_num = FILTER_PRED_FN_CPU; + } else { + pred->fn_num = FILTER_PRED_FN_CPU_CPUMASK; + } + } else if (single) { + if (pred->op == OP_BAND) + pred->op = OP_EQ; + + pred->fn_num = select_comparison_fn(pred->op, field->size, false); + if (pred->op == OP_NE) + pred->not = 1; + } else { + switch (field->size) { + case 8: + pred->fn_num = FILTER_PRED_FN_64_CPUMASK; + break; + case 4: + pred->fn_num = FILTER_PRED_FN_32_CPUMASK; + break; + case 2: + pred->fn_num = FILTER_PRED_FN_16_CPUMASK; + break; + case 1: + pred->fn_num = FILTER_PRED_FN_8_CPUMASK; + break; + } + } /* This is either a string, or an integer */ } else if (str[i] == '\'' || str[i] == '"') { @@ -1597,9 +1855,12 @@ static int parse_pred(const char *str, void *data, goto err_free; } - pred->regex.len = len; - strncpy(pred->regex.pattern, str + s, len); - pred->regex.pattern[len] = 0; + pred->regex = 
kzalloc(sizeof(*pred->regex), GFP_KERNEL); + if (!pred->regex) + goto err_mem; + pred->regex->len = len; + strncpy(pred->regex->pattern, str + s, len); + pred->regex->pattern[len] = 0; filter_build_regex(pred); @@ -1608,7 +1869,7 @@ static int parse_pred(const char *str, void *data, } else if (field->filter_type == FILTER_STATIC_STRING) { pred->fn_num = FILTER_PRED_FN_STRING; - pred->regex.field_len = field->size; + pred->regex->field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { pred->fn_num = FILTER_PRED_FN_STRLOC; @@ -1691,10 +1952,10 @@ static int parse_pred(const char *str, void *data, return i; err_free: - kfree(pred); + free_predicate(pred); return -EINVAL; err_mem: - kfree(pred); + free_predicate(pred); return -ENOMEM; } @@ -2287,8 +2548,8 @@ static int ftrace_function_set_filter_pred(struct filter_pred *pred, return ret; return __ftrace_function_set_filter(pred->op == OP_EQ, - pred->regex.pattern, - pred->regex.len, + pred->regex->pattern, + pred->regex->len, data); } diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 33cb6af31f39..6f046650e527 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1328,14 +1328,14 @@ static int user_field_set_string(struct ftrace_event_field *field, static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) { - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; struct list_head *head = &user->fields; int pos = 0, depth = 0; const char *str_func; pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - list_for_each_entry_safe_reverse(field, next, head, link) { + list_for_each_entry_reverse(field, head, link) { if (depth != 0) pos += snprintf(buf + pos, LEN_OR_ZERO, " "); @@ -1347,7 +1347,7 @@ static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - list_for_each_entry_safe_reverse(field, next, head, link) { + 
list_for_each_entry_reverse(field, head, link) { if (user_field_is_dyn_string(field->type, &str_func)) pos += snprintf(buf + pos, LEN_OR_ZERO, ", %s(%s)", str_func, field->name); @@ -1732,7 +1732,7 @@ static int user_event_create(const char *raw_command) static int user_event_show(struct seq_file *m, struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; struct list_head *head; int depth = 0; @@ -1740,7 +1740,7 @@ static int user_event_show(struct seq_file *m, struct dyn_event *ev) head = trace_get_fields(&user->call); - list_for_each_entry_safe_reverse(field, next, head, link) { + list_for_each_entry_reverse(field, head, link) { if (depth == 0) seq_puts(m, " "); else @@ -1816,13 +1816,14 @@ out: static bool user_fields_match(struct user_event *user, int argc, const char **argv) { - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; struct list_head *head = &user->fields; int i = 0; - list_for_each_entry_safe_reverse(field, next, head, link) + list_for_each_entry_reverse(field, head, link) { if (!user_field_match(field, argc, argv, &i)) return false; + } if (i != argc) return false; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 58f3946081e2..1698fc22afa0 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -51,6 +51,9 @@ static int ftrace_event_register(struct trace_event_call *call, #undef __array #define __array(type, item, size) type item[size]; +#undef __stack_array +#define __stack_array(type, item, size, field) __array(type, item, size) + #undef __array_desc #define __array_desc(type, container, item, size) type item[size]; @@ -114,6 +117,9 @@ static void __always_unused ____ftrace_check_##name(void) \ is_signed_type(_type), .filter_type = FILTER_OTHER, \ .len = _len }, +#undef __stack_array +#define __stack_array(_type, _item, _len, _field) __array(_type, 
_item, _len) + #undef __array_desc #define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len) @@ -149,6 +155,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \ #undef __array #define __array(type, item, len) +#undef __stack_array +#define __stack_array(type, item, len, field) + #undef __array_desc #define __array_desc(type, container, item, len) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index dfe2e546acdc..8bfe23af9c73 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -898,6 +898,46 @@ static struct tracepoint *find_tracepoint(const char *tp_name) return data.tpoint; } +static int parse_symbol_and_return(int argc, const char *argv[], + char **symbol, bool *is_return, + bool is_tracepoint) +{ + char *tmp = strchr(argv[1], '%'); + int i; + + if (tmp) { + int len = tmp - argv[1]; + + if (!is_tracepoint && !strcmp(tmp, "%return")) { + *is_return = true; + } else { + trace_probe_log_err(len, BAD_ADDR_SUFFIX); + return -EINVAL; + } + *symbol = kmemdup_nul(argv[1], len, GFP_KERNEL); + } else + *symbol = kstrdup(argv[1], GFP_KERNEL); + if (!*symbol) + return -ENOMEM; + + if (*is_return) + return 0; + + /* If there is $retval, this should be a return fprobe. */ + for (i = 2; i < argc; i++) { + tmp = strstr(argv[i], "$retval"); + if (tmp && !isalnum(tmp[7]) && tmp[7] != '_') { + *is_return = true; + /* + * NOTE: Don't check is_tracepoint here, because it will + * be checked when the argument is parsed. 
+ */ + break; + } + } + return 0; +} + static int __trace_fprobe_create(int argc, const char *argv[]) { /* @@ -927,7 +967,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) struct trace_fprobe *tf = NULL; int i, len, new_argc = 0, ret = 0; bool is_return = false; - char *symbol = NULL, *tmp = NULL; + char *symbol = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; const char **new_argv = NULL; int maxactive = 0; @@ -983,20 +1023,10 @@ static int __trace_fprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); /* a symbol(or tracepoint) must be specified */ - symbol = kstrdup(argv[1], GFP_KERNEL); - if (!symbol) - return -ENOMEM; + ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint); + if (ret < 0) + goto parse_error; - tmp = strchr(symbol, '%'); - if (tmp) { - if (!is_tracepoint && !strcmp(tmp, "%return")) { - *tmp = '\0'; - is_return = true; - } else { - trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); - goto parse_error; - } - } if (!is_return && maxactive) { trace_probe_log_set_index(0); trace_probe_log_err(1, BAD_MAXACT_TYPE); @@ -1096,6 +1126,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) } out: + traceprobe_finish_parse(&ctx); trace_probe_log_clear(); kfree(new_argv); kfree(symbol); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 2f37a6e68aa9..b791524a6536 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -635,7 +635,7 @@ static int s_mode_show(struct seq_file *s, void *v) else seq_printf(s, "%s", thread_mode_str[mode]); - if (mode != MODE_MAX) + if (mode < MODE_MAX - 1) /* if mode is any but last */ seq_puts(s, " "); return 0; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 17c21c0b2dd1..3d7a180a8427 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -907,6 +907,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) } out: + 
traceprobe_finish_parse(&ctx); trace_probe_log_clear(); kfree(new_argv); kfree(symbol); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index c68a72707852..4dc74d73fc1d 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "trace_probe: " fmt #include <linux/bpf.h> +#include "trace_btf.h" #include "trace_probe.h" @@ -304,31 +305,90 @@ static int parse_trace_event_arg(char *arg, struct fetch_insn *code, #ifdef CONFIG_PROBE_EVENTS_BTF_ARGS -static struct btf *traceprobe_get_btf(void) +static u32 btf_type_int(const struct btf_type *t) { - struct btf *btf = bpf_get_btf_vmlinux(); + return *(u32 *)(t + 1); +} - if (IS_ERR_OR_NULL(btf)) - return NULL; +static bool btf_type_is_char_ptr(struct btf *btf, const struct btf_type *type) +{ + const struct btf_type *real_type; + u32 intdata; + s32 tid; + + real_type = btf_type_skip_modifiers(btf, type->type, &tid); + if (!real_type) + return false; + + if (BTF_INFO_KIND(real_type->info) != BTF_KIND_INT) + return false; - return btf; + intdata = btf_type_int(real_type); + return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) + && BTF_INT_BITS(intdata) == 8; } -static u32 btf_type_int(const struct btf_type *t) +static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type) { - return *(u32 *)(t + 1); + const struct btf_type *real_type; + const struct btf_array *array; + u32 intdata; + s32 tid; + + if (BTF_INFO_KIND(type->info) != BTF_KIND_ARRAY) + return false; + + array = (const struct btf_array *)(type + 1); + + real_type = btf_type_skip_modifiers(btf, array->type, &tid); + + intdata = btf_type_int(real_type); + return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) + && BTF_INT_BITS(intdata) == 8; } -static const char *type_from_btf_id(struct btf *btf, s32 id) +static int check_prepare_btf_string_fetch(char *typename, + struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = ctx->btf; + + if (!btf || 
!ctx->last_type) + return 0; + + /* char [] does not need any change. */ + if (btf_type_is_char_array(btf, ctx->last_type)) + return 0; + + /* char * requires dereference the pointer. */ + if (btf_type_is_char_ptr(btf, ctx->last_type)) { + struct fetch_insn *code = *pcode + 1; + + if (code->op == FETCH_OP_END) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -E2BIG; + } + if (typename[0] == 'u') + code->op = FETCH_OP_UDEREF; + else + code->op = FETCH_OP_DEREF; + code->offset = 0; + *pcode = code; + return 0; + } + /* Other types are not available for string */ + trace_probe_log_err(ctx->offset, BAD_TYPE4STR); + return -EINVAL; +} + +static const char *fetch_type_from_btf_type(struct btf *btf, + const struct btf_type *type, + struct traceprobe_parse_context *ctx) { - const struct btf_type *t; u32 intdata; - s32 tid; /* TODO: const char * could be converted as a string */ - t = btf_type_skip_modifiers(btf, id, &tid); - - switch (BTF_INFO_KIND(t->info)) { + switch (BTF_INFO_KIND(type->info)) { case BTF_KIND_ENUM: /* enum is "int", so convert to "s32" */ return "s32"; @@ -341,7 +401,7 @@ static const char *type_from_btf_id(struct btf *btf, s32 id) else return "x32"; case BTF_KIND_INT: - intdata = btf_type_int(t); + intdata = btf_type_int(type); if (BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) { switch (BTF_INT_BITS(intdata)) { case 8: @@ -364,6 +424,10 @@ static const char *type_from_btf_id(struct btf *btf, s32 id) case 64: return "u64"; } + /* bitfield, size is encoded in the type */ + ctx->last_bitsize = BTF_INT_BITS(intdata); + ctx->last_bitoffs += BTF_INT_OFFSET(intdata); + return "u64"; } } /* TODO: support other types */ @@ -371,88 +435,223 @@ static const char *type_from_btf_id(struct btf *btf, s32 id) return NULL; } -static const struct btf_type *find_btf_func_proto(const char *funcname) +static int query_btf_context(struct traceprobe_parse_context *ctx) { - struct btf *btf = traceprobe_get_btf(); - const struct btf_type *t; - s32 id; + const struct 
btf_param *param; + const struct btf_type *type; + struct btf *btf; + s32 nr; - if (!btf || !funcname) - return ERR_PTR(-EINVAL); + if (ctx->btf) + return 0; + + if (!ctx->funcname) + return -EINVAL; + + type = btf_find_func_proto(ctx->funcname, &btf); + if (!type) + return -ENOENT; - id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC); - if (id <= 0) - return ERR_PTR(-ENOENT); + ctx->btf = btf; + ctx->proto = type; + + /* ctx->params is optional, since func(void) will not have params. */ + nr = 0; + param = btf_get_func_param(type, &nr); + if (!IS_ERR_OR_NULL(param)) { + /* Hide the first 'data' argument of tracepoint */ + if (ctx->flags & TPARG_FL_TPOINT) { + nr--; + param++; + } + } - /* Get BTF_KIND_FUNC type */ - t = btf_type_by_id(btf, id); - if (!t || !btf_type_is_func(t)) - return ERR_PTR(-ENOENT); + if (nr > 0) { + ctx->nr_params = nr; + ctx->params = param; + } else { + ctx->nr_params = 0; + ctx->params = NULL; + } - /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */ - t = btf_type_by_id(btf, t->type); - if (!t || !btf_type_is_func_proto(t)) - return ERR_PTR(-ENOENT); + return 0; +} - return t; +static void clear_btf_context(struct traceprobe_parse_context *ctx) +{ + if (ctx->btf) { + btf_put(ctx->btf); + ctx->btf = NULL; + ctx->proto = NULL; + ctx->params = NULL; + ctx->nr_params = 0; + } } -static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, - bool tracepoint) +/* Return 1 if the field separater is arrow operator ('->') */ +static int split_next_field(char *varname, char **next_field, + struct traceprobe_parse_context *ctx) { - const struct btf_param *param; - const struct btf_type *t; + char *field; + int ret = 0; + + field = strpbrk(varname, ".-"); + if (field) { + if (field[0] == '-' && field[1] == '>') { + field[0] = '\0'; + field += 2; + ret = 1; + } else if (field[0] == '.') { + field[0] = '\0'; + field += 1; + } else { + trace_probe_log_err(ctx->offset + field - varname, BAD_HYPHEN); + return -EINVAL; + } + 
*next_field = field; + } - if (!funcname || !nr) - return ERR_PTR(-EINVAL); + return ret; +} - t = find_btf_func_proto(funcname); - if (IS_ERR(t)) - return (const struct btf_param *)t; +/* + * Parse the field of data structure. The @type must be a pointer type + * pointing the target data structure type. + */ +static int parse_btf_field(char *fieldname, const struct btf_type *type, + struct fetch_insn **pcode, struct fetch_insn *end, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *code = *pcode; + const struct btf_member *field; + u32 bitoffs, anon_offs; + char *next; + int is_ptr; + s32 tid; - *nr = btf_type_vlen(t); - param = (const struct btf_param *)(t + 1); + do { + /* Outer loop for solving arrow operator ('->') */ + if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) { + trace_probe_log_err(ctx->offset, NO_PTR_STRCT); + return -EINVAL; + } + /* Convert a struct pointer type to a struct type */ + type = btf_type_skip_modifiers(ctx->btf, type->type, &tid); + if (!type) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return -EINVAL; + } - /* Hide the first 'data' argument of tracepoint */ - if (tracepoint) { - (*nr)--; - param++; - } + bitoffs = 0; + do { + /* Inner loop for solving dot operator ('.') */ + next = NULL; + is_ptr = split_next_field(fieldname, &next, ctx); + if (is_ptr < 0) + return is_ptr; + + anon_offs = 0; + field = btf_find_struct_member(ctx->btf, type, fieldname, + &anon_offs); + if (!field) { + trace_probe_log_err(ctx->offset, NO_BTF_FIELD); + return -ENOENT; + } + /* Add anonymous structure/union offset */ + bitoffs += anon_offs; + + /* Accumulate the bit-offsets of the dot-connected fields */ + if (btf_type_kflag(type)) { + bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset); + ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset); + } else { + bitoffs += field->offset; + ctx->last_bitsize = 0; + } - if (*nr > 0) - return param; - else - return NULL; + type = btf_type_skip_modifiers(ctx->btf, field->type, &tid); + if 
(!type) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return -EINVAL; + } + + ctx->offset += next - fieldname; + fieldname = next; + } while (!is_ptr && fieldname); + + if (++code == end) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -EINVAL; + } + code->op = FETCH_OP_DEREF; /* TODO: user deref support */ + code->offset = bitoffs / 8; + *pcode = code; + + ctx->last_bitoffs = bitoffs % 8; + ctx->last_type = type; + } while (fieldname); + + return 0; } -static int parse_btf_arg(const char *varname, struct fetch_insn *code, +static int parse_btf_arg(char *varname, + struct fetch_insn **pcode, struct fetch_insn *end, struct traceprobe_parse_context *ctx) { - struct btf *btf = traceprobe_get_btf(); + struct fetch_insn *code = *pcode; const struct btf_param *params; - int i; + const struct btf_type *type; + char *field = NULL; + int i, is_ptr, ret; + u32 tid; + + if (WARN_ON_ONCE(!ctx->funcname)) + return -EINVAL; - if (!btf) { - trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + is_ptr = split_next_field(varname, &field, ctx); + if (is_ptr < 0) + return is_ptr; + if (!is_ptr && field) { + /* dot-connected field on an argument is not supported. 
*/ + trace_probe_log_err(ctx->offset + field - varname, + NOSUP_DAT_ARG); return -EOPNOTSUPP; } - if (WARN_ON_ONCE(!ctx->funcname)) - return -EINVAL; + if (ctx->flags & TPARG_FL_RETURN) { + if (strcmp(varname, "$retval") != 0) { + trace_probe_log_err(ctx->offset, NO_BTFARG); + return -ENOENT; + } + code->op = FETCH_OP_RETVAL; + /* Check whether the function return type is not void */ + if (query_btf_context(ctx) == 0) { + if (ctx->proto->type == 0) { + trace_probe_log_err(ctx->offset, NO_RETVAL); + return -ENOENT; + } + tid = ctx->proto->type; + goto found; + } + if (field) { + trace_probe_log_err(ctx->offset + field - varname, + NO_BTF_ENTRY); + return -ENOENT; + } + return 0; + } - if (!ctx->params) { - params = find_btf_func_param(ctx->funcname, &ctx->nr_params, - ctx->flags & TPARG_FL_TPOINT); - if (IS_ERR_OR_NULL(params)) { + if (!ctx->btf) { + ret = query_btf_context(ctx); + if (ret < 0 || ctx->nr_params == 0) { trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); return PTR_ERR(params); } - ctx->params = params; - } else - params = ctx->params; + } + params = ctx->params; for (i = 0; i < ctx->nr_params; i++) { - const char *name = btf_name_by_offset(btf, params[i].name_off); + const char *name = btf_name_by_offset(ctx->btf, params[i].name_off); if (name && !strcmp(name, varname)) { code->op = FETCH_OP_ARG; @@ -460,91 +659,114 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code, code->param = i + 1; else code->param = i; - return 0; + tid = params[i].type; + goto found; } } trace_probe_log_err(ctx->offset, NO_BTFARG); return -ENOENT; -} - -static const struct fetch_type *parse_btf_arg_type(int arg_idx, - struct traceprobe_parse_context *ctx) -{ - struct btf *btf = traceprobe_get_btf(); - const char *typestr = NULL; - if (btf && ctx->params) { - if (ctx->flags & TPARG_FL_TPOINT) - arg_idx--; - typestr = type_from_btf_id(btf, ctx->params[arg_idx].type); +found: + type = btf_type_skip_modifiers(ctx->btf, tid, &tid); + if (!type) { + 
trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return -EINVAL; } - - return find_fetch_type(typestr, ctx->flags); + /* Initialize the last type information */ + ctx->last_type = type; + ctx->last_bitoffs = 0; + ctx->last_bitsize = 0; + if (field) { + ctx->offset += field - varname; + return parse_btf_field(field, type, pcode, end, ctx); + } + return 0; } -static const struct fetch_type *parse_btf_retval_type( +static const struct fetch_type *find_fetch_type_from_btf_type( struct traceprobe_parse_context *ctx) { - struct btf *btf = traceprobe_get_btf(); + struct btf *btf = ctx->btf; const char *typestr = NULL; - const struct btf_type *t; - if (btf && ctx->funcname) { - t = find_btf_func_proto(ctx->funcname); - if (!IS_ERR(t)) - typestr = type_from_btf_id(btf, t->type); - } + if (btf && ctx->last_type) + typestr = fetch_type_from_btf_type(btf, ctx->last_type, ctx); return find_fetch_type(typestr, ctx->flags); } -static bool is_btf_retval_void(const char *funcname) +static int parse_btf_bitfield(struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) { - const struct btf_type *t; + struct fetch_insn *code = *pcode; - t = find_btf_func_proto(funcname); - if (IS_ERR(t)) - return false; + if ((ctx->last_bitsize % 8 == 0) && ctx->last_bitoffs == 0) + return 0; + + code++; + if (code->op != FETCH_OP_NOP) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -EINVAL; + } + *pcode = code; - return t->type == 0; + code->op = FETCH_OP_MOD_BF; + code->lshift = 64 - (ctx->last_bitsize + ctx->last_bitoffs); + code->rshift = 64 - ctx->last_bitsize; + code->basesize = 64 / 8; + return 0; } + #else -static struct btf *traceprobe_get_btf(void) +static void clear_btf_context(struct traceprobe_parse_context *ctx) { - return NULL; + ctx->btf = NULL; } -static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, - bool tracepoint) +static int query_btf_context(struct traceprobe_parse_context *ctx) { - return ERR_PTR(-EOPNOTSUPP); + return 
-EOPNOTSUPP; } -static int parse_btf_arg(const char *varname, struct fetch_insn *code, +static int parse_btf_arg(char *varname, + struct fetch_insn **pcode, struct fetch_insn *end, struct traceprobe_parse_context *ctx) { trace_probe_log_err(ctx->offset, NOSUP_BTFARG); return -EOPNOTSUPP; } -#define parse_btf_arg_type(idx, ctx) \ - find_fetch_type(NULL, ctx->flags) +static int parse_btf_bitfield(struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) +{ + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; +} -#define parse_btf_retval_type(ctx) \ +#define find_fetch_type_from_btf_type(ctx) \ find_fetch_type(NULL, ctx->flags) -#define is_btf_retval_void(funcname) (false) +static int check_prepare_btf_string_fetch(char *typename, + struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) +{ + return 0; +} #endif #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) -static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, +/* Parse $vars. 
@orig_arg points '$', which syncs to @ctx->offset */ +static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, + struct fetch_insn **pcode, + struct fetch_insn *end, struct traceprobe_parse_context *ctx) { - unsigned long param; + struct fetch_insn *code = *pcode; int err = TP_ERR_BAD_VAR; + char *arg = orig_arg + 1; + unsigned long param; int ret = 0; int len; @@ -563,18 +785,17 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, goto inval; } - if (strcmp(arg, "retval") == 0) { - if (ctx->flags & TPARG_FL_RETURN) { - if ((ctx->flags & TPARG_FL_KERNEL) && - is_btf_retval_void(ctx->funcname)) { - err = TP_ERR_NO_RETVAL; - goto inval; - } + if (str_has_prefix(arg, "retval")) { + if (!(ctx->flags & TPARG_FL_RETURN)) { + err = TP_ERR_RETVAL_ON_PROBE; + goto inval; + } + if (!(ctx->flags & TPARG_FL_KERNEL) || + !IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) { code->op = FETCH_OP_RETVAL; return 0; } - err = TP_ERR_RETVAL_ON_PROBE; - goto inval; + return parse_btf_arg(orig_arg, pcode, end, ctx); } len = str_has_prefix(arg, "stack"); @@ -676,7 +897,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, type, code, ctx); + ret = parse_probe_vars(arg, type, pcode, end, ctx); break; case '%': /* named register */ @@ -795,6 +1016,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = deref; code->offset = offset; + /* Reset the last type if used */ + ctx->last_type = NULL; } break; case '\\': /* Immediate value */ @@ -818,7 +1041,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, trace_probe_log_err(ctx->offset, NOSUP_BTFARG); return -EINVAL; } - ret = parse_btf_arg(arg, code, ctx); + ret = parse_btf_arg(arg, pcode, end, ctx); break; } } @@ -964,17 +1187,22 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, goto out; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; + ctx->last_type = NULL; ret = parse_probe_arg(arg, 
parg->type, &code, &code[FETCH_INSN_MAX - 1], ctx); if (ret) goto fail; /* Update storing type if BTF is available */ - if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && !t) { - if (code->op == FETCH_OP_ARG) - parg->type = parse_btf_arg_type(code->param, ctx); - else if (code->op == FETCH_OP_RETVAL) - parg->type = parse_btf_retval_type(ctx); + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && + ctx->last_type) { + if (!t) { + parg->type = find_fetch_type_from_btf_type(ctx); + } else if (strstr(t, "string")) { + ret = check_prepare_btf_string_fetch(t, &code, ctx); + if (ret) + goto fail; + } } ret = -EINVAL; @@ -1048,6 +1276,11 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD); goto fail; } + } else if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && + ctx->last_type) { + ret = parse_btf_bitfield(&code, ctx); + if (ret) + goto fail; } ret = -EINVAL; /* Loop(Array) operation */ @@ -1231,7 +1464,6 @@ static int sprint_nth_btf_arg(int idx, const char *type, char *buf, int bufsize, struct traceprobe_parse_context *ctx) { - struct btf *btf = traceprobe_get_btf(); const char *name; int ret; @@ -1239,7 +1471,7 @@ static int sprint_nth_btf_arg(int idx, const char *type, trace_probe_log_err(0, NO_BTFARG); return -ENOENT; } - name = btf_name_by_offset(btf, ctx->params[idx].name_off); + name = btf_name_by_offset(ctx->btf, ctx->params[idx].name_off); if (!name) { trace_probe_log_err(0, NO_BTF_ENTRY); return -ENOENT; @@ -1260,7 +1492,6 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], const struct btf_param *params = NULL; int i, j, n, used, ret, args_idx = -1; const char **new_argv = NULL; - int nr_params; ret = argv_has_var_arg(argc, argv, &args_idx, ctx); if (ret < 0) @@ -1271,9 +1502,8 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], return NULL; } - params = find_btf_func_param(ctx->funcname, &nr_params, - ctx->flags & TPARG_FL_TPOINT); - if 
(IS_ERR_OR_NULL(params)) { + ret = query_btf_context(ctx); + if (ret < 0 || ctx->nr_params == 0) { if (args_idx != -1) { /* $arg* requires BTF info */ trace_probe_log_err(0, NOSUP_BTFARG); @@ -1282,8 +1512,6 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], *new_argc = argc; return NULL; } - ctx->params = params; - ctx->nr_params = nr_params; if (args_idx >= 0) *new_argc = argc + ctx->nr_params - 1; @@ -1298,7 +1526,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], for (i = 0, j = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); if (i == args_idx) { - for (n = 0; n < nr_params; n++) { + for (n = 0; n < ctx->nr_params; n++) { ret = sprint_nth_btf_arg(n, "", buf + used, bufsize - used, ctx); if (ret < 0) @@ -1337,6 +1565,11 @@ error: return ERR_PTR(ret); } +void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) +{ + clear_btf_context(ctx); +} + int traceprobe_update_arg(struct probe_arg *arg) { struct fetch_insn *code = arg->code; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 7dde806be91e..02b432ae7513 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -383,9 +383,15 @@ static inline bool tparg_is_function_entry(unsigned int flags) struct traceprobe_parse_context { struct trace_event_call *event; - const struct btf_param *params; - s32 nr_params; - const char *funcname; + /* BTF related parameters */ + const char *funcname; /* Function name in BTF */ + const struct btf_type *proto; /* Prototype of the function */ + const struct btf_param *params; /* Parameter of the function */ + s32 nr_params; /* The number of the parameters */ + struct btf *btf; /* The BTF to be used */ + const struct btf_type *last_type; /* Saved type */ + u32 last_bitoffs; /* Saved bitoffs */ + u32 last_bitsize; /* Saved bitsize */ unsigned int flags; int offset; }; @@ -400,6 +406,12 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], extern int 
traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); +/* + * If either traceprobe_parse_probe_arg() or traceprobe_expand_meta_args() is called, + * this MUST be called for clean up the context and return a resource. + */ +void traceprobe_finish_parse(struct traceprobe_parse_context *ctx); + extern int traceprobe_split_symbol_offset(char *symbol, long *offset); int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf, int offset); @@ -495,7 +507,14 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\ C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \ C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \ - C(ARGS_2LONG, "$arg* failed because the argument list is too long"), + C(ARGS_2LONG, "$arg* failed because the argument list is too long"), \ + C(ARGIDX_2BIG, "$argN index is too big"), \ + C(NO_PTR_STRCT, "This is not a pointer to union/structure."), \ + C(NOSUP_DAT_ARG, "Non pointer structure/union argument is not supported."),\ + C(BAD_HYPHEN, "Failed to parse single hyphen. 
Forgot '>'?"), \ + C(NO_BTF_FIELD, "This field is not found."), \ + C(BAD_BTF_TID, "Failed to get BTF type info."),\ + C(BAD_TYPE4STR, "This type does not fit for string."), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 576b3bcb8ebd..99c051de412a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -688,6 +688,7 @@ static int __trace_uprobe_create(int argc, const char **argv) trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx); + traceprobe_finish_parse(&ctx); if (ret) goto error; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 800b4208dba9..c85825e17df8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -122,11 +122,6 @@ enum { * * L: pool->lock protected. Access with pool->lock held. * - * X: During normal operation, modification requires pool->lock and should - * be done only from local cpu. Either disabling preemption on local - * cpu or grabbing pool->lock is enough for read access. If - * POOL_DISASSOCIATED is set, it's identical to L. - * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. 
@@ -160,7 +155,7 @@ struct worker_pool { int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ - unsigned int flags; /* X: flags */ + unsigned int flags; /* L: flags */ unsigned long watchdog_ts; /* L: watchdog timestamp */ bool cpu_stall; /* WD: stalled cpu bound pool */ @@ -216,6 +211,7 @@ enum pool_workqueue_stats { PWQ_STAT_CPU_TIME, /* total CPU time consumed */ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ + PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */ PWQ_STAT_MAYDAY, /* maydays to rescuer */ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ @@ -262,12 +258,12 @@ struct pool_workqueue { u64 stats[PWQ_NR_STATS]; /* - * Release of unbound pwq is punted to system_wq. See put_pwq() - * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also RCU protected so that the first pwq can be - * determined without grabbing wq->mutex. + * Release of unbound pwq is punted to a kthread_worker. See put_pwq() + * and pwq_release_workfn() for details. pool_workqueue itself is also + * RCU protected so that the first pwq can be determined without + * grabbing wq->mutex. 
*/ - struct work_struct unbound_release_work; + struct kthread_work release_work; struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_FLAG_BITS); @@ -326,14 +322,33 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ - struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ + struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ }; static struct kmem_cache *pwq_cache; -static cpumask_var_t *wq_numa_possible_cpumask; - /* possible CPUs of each node */ +/* + * Each pod type describes how CPUs should be grouped for unbound workqueues. + * See the comment above workqueue_attrs->affn_scope. + */ +struct wq_pod_type { + int nr_pods; /* number of pods */ + cpumask_var_t *pod_cpus; /* pod -> cpus */ + int *pod_node; /* pod -> node */ + int *cpu_pod; /* cpu -> pod */ +}; + +static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; +static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; + +static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { + [WQ_AFFN_DFL] = "default", + [WQ_AFFN_CPU] = "cpu", + [WQ_AFFN_SMT] = "smt", + [WQ_AFFN_CACHE] = "cache", + [WQ_AFFN_NUMA] = "numa", + [WQ_AFFN_SYSTEM] = "system", +}; /* * Per-cpu work items which run for longer than the following threshold are @@ -345,19 +360,14 @@ static cpumask_var_t *wq_numa_possible_cpumask; static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); -static bool wq_disable_numa; -module_param_named(disable_numa, wq_disable_numa, bool, 0444); - /* see the comment above the definition of WQ_POWER_EFFICIENT */ static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_online; /* can kworkers be created yet? 
*/ -static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ - -/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ -static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; +/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ +static struct workqueue_attrs *wq_update_pod_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ @@ -371,6 +381,9 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; +/* for further constrain wq_unbound_cpumask by cmdline parameter*/ +static struct cpumask wq_cmdline_cpumask __initdata; + /* CPU where unbound work was last round robin scheduled from this CPU */ static DEFINE_PER_CPU(int, wq_rr_cpu_last); @@ -400,6 +413,13 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; +/* + * I: kthread_worker to release pwq's. pwq release needs to be bounced to a + * process context while holding a pool lock. Bounce to a dedicated kthread + * worker to avoid A-A deadlocks. + */ +static struct kthread_worker *pwq_release_worker; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -606,35 +626,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -/** - * unbound_pwq_by_node - return the unbound pool_workqueue for the given node - * @wq: the target workqueue - * @node: the node ID - * - * This must be called with any of wq_pool_mutex, wq->mutex or RCU - * read locked. 
- * If the pwq needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pwq stays online. - * - * Return: The unbound pool_workqueue for @node. - */ -static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, - int node) -{ - assert_rcu_or_wq_mutex_or_pool_mutex(wq); - - /* - * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a - * delayed item is pending. The plan is to keep CPU -> NODE - * mapping valid and stable across CPU on/offlines. Once that - * happens, this workaround can be removed. - */ - if (unlikely(node == NUMA_NO_NODE)) - return wq->dfl_pwq; - - return rcu_dereference_raw(wq->numa_pwq_tbl[node]); -} - static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -825,11 +816,6 @@ static bool work_is_canceling(struct work_struct *work) * they're being called with pool->lock held. */ -static bool __need_more_worker(struct worker_pool *pool) -{ - return !pool->nr_running; -} - /* * Need to wake up a worker? Called from anything but currently * running workers. @@ -840,7 +826,7 @@ static bool __need_more_worker(struct worker_pool *pool) */ static bool need_more_worker(struct worker_pool *pool) { - return !list_empty(&pool->worklist) && __need_more_worker(pool); + return !list_empty(&pool->worklist) && !pool->nr_running; } /* Can I start working? Called from busy but !running workers. */ @@ -871,51 +857,18 @@ static bool too_many_workers(struct worker_pool *pool) return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } -/* - * Wake up functions. - */ - -/* Return the first idle worker. Called with pool->lock held. 
*/ -static struct worker *first_idle_worker(struct worker_pool *pool) -{ - if (unlikely(list_empty(&pool->idle_list))) - return NULL; - - return list_first_entry(&pool->idle_list, struct worker, entry); -} - -/** - * wake_up_worker - wake up an idle worker - * @pool: worker pool to wake worker from - * - * Wake up the first idle worker of @pool. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - */ -static void wake_up_worker(struct worker_pool *pool) -{ - struct worker *worker = first_idle_worker(pool); - - if (likely(worker)) - wake_up_process(worker->task); -} - /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set * * Set @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && @@ -932,16 +885,13 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) * @flags: flags to clear * * Clear @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); worker->flags &= ~flags; @@ -955,6 +905,244 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) pool->nr_running++; } +/* Return the first idle worker. Called with pool->lock held. 
*/ +static struct worker *first_idle_worker(struct worker_pool *pool) +{ + if (unlikely(list_empty(&pool->idle_list))) + return NULL; + + return list_first_entry(&pool->idle_list, struct worker, entry); +} + +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * raw_spin_lock_irq(pool->lock). + */ +static void worker_enter_idle(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || + WARN_ON_ONCE(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev))) + return; + + /* can't use worker_set_flags(), also called from create_worker() */ + worker->flags |= WORKER_IDLE; + pool->nr_idle++; + worker->last_active = jiffies; + + /* idle_list is LIFO */ + list_add(&worker->entry, &pool->idle_list); + + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); + + /* Sanity check nr_running. */ + WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); +} + +/** + * worker_leave_idle - leave idle state + * @worker: worker which is leaving idle state + * + * @worker is leaving idle state. Update stats. + * + * LOCKING: + * raw_spin_lock_irq(pool->lock). + */ +static void worker_leave_idle(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) + return; + worker_clr_flags(worker, WORKER_IDLE); + pool->nr_idle--; + list_del_init(&worker->entry); +} + +/** + * find_worker_executing_work - find worker which is executing a work + * @pool: pool of interest + * @work: work to find worker for + * + * Find a worker which is executing @work on @pool by searching + * @pool->busy_hash which is keyed by the address of @work. 
For a worker + * to match, its current execution should match the address of @work and + * its work function. This is to avoid unwanted dependency between + * unrelated work executions through a work item being recycled while still + * being executed. + * + * This is a bit tricky. A work item may be freed once its execution + * starts and nothing prevents the freed area from being recycled for + * another work item. If the same work item address ends up being reused + * before the original execution finishes, workqueue will identify the + * recycled work item as currently executing and make it wait until the + * current execution finishes, introducing an unwanted dependency. + * + * This function checks the work item address and work function to avoid + * false positives. Note that this isn't complete as one may construct a + * work function which can introduce dependency onto itself through a + * recycled work item. Well, if somebody wants to shoot oneself in the + * foot that badly, there's only so much we can do, and if such deadlock + * actually occurs, it should be easy to locate the culprit work function. + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock). + * + * Return: + * Pointer to worker which is executing @work if found, %NULL + * otherwise. + */ +static struct worker *find_worker_executing_work(struct worker_pool *pool, + struct work_struct *work) +{ + struct worker *worker; + + hash_for_each_possible(pool->busy_hash, worker, hentry, + (unsigned long)work) + if (worker->current_work == work && + worker->current_func == work->func) + return worker; + + return NULL; +} + +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out parameter for nested worklist walking + * + * Schedule linked works starting from @work to @head. 
Work series to be + * scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on + * @nextp. + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) + break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +/** + * assign_work - assign a work item and its linked work items to a worker + * @work: work to assign + * @worker: worker to assign to + * @nextp: out parameter for nested worklist walking + * + * Assign @work and its linked work items to @worker. If @work is already being + * executed by another worker in the same pool, it'll be punted there. + * + * If @nextp is not NULL, it's updated to point to the next work of the last + * scheduled work. This allows assign_work() to be nested inside + * list_for_each_entry_safe(). + * + * Returns %true if @work was successfully assigned to @worker. %false if @work + * was punted to another worker already executing it. + */ +static bool assign_work(struct work_struct *work, struct worker *worker, + struct work_struct **nextp) +{ + struct worker_pool *pool = worker->pool; + struct worker *collision; + + lockdep_assert_held(&pool->lock); + + /* + * A single work shouldn't be executed concurrently by multiple workers. + * __queue_work() ensures that @work doesn't jump to a different pool + * while still running in the previous pool. 
Here, we should ensure that + * @work is not executed concurrently by multiple workers from the same + * pool. Check whether anyone is already processing the work. If so, + * defer the work to the currently executing one. + */ + collision = find_worker_executing_work(pool, work); + if (unlikely(collision)) { + move_linked_works(work, &collision->scheduled, nextp); + return false; + } + + move_linked_works(work, &worker->scheduled, nextp); + return true; +} + +/** + * kick_pool - wake up an idle worker if necessary + * @pool: pool to kick + * + * @pool may have pending work items. Wake up worker if necessary. Returns + * whether a worker was woken up. + */ +static bool kick_pool(struct worker_pool *pool) +{ + struct worker *worker = first_idle_worker(pool); + struct task_struct *p; + + lockdep_assert_held(&pool->lock); + + if (!need_more_worker(pool) || !worker) + return false; + + p = worker->task; + +#ifdef CONFIG_SMP + /* + * Idle @worker is about to execute @work and waking up provides an + * opportunity to migrate @worker at a lower cost by setting the task's + * wake_cpu field. Let's see if we want to move @worker to improve + * execution locality. + * + * We're waking the worker that went idle the latest and there's some + * chance that @worker is marked idle but hasn't gone off CPU yet. If + * so, setting the wake_cpu won't do anything. As this is a best-effort + * optimization and the race window is narrow, let's leave as-is for + * now. If this becomes pronounced, we can skip over workers which are + * still on cpu when picking an idle worker. + * + * If @pool has non-strict affinity, @worker might have ended up outside + * its affinity scope. Repatriate. 
+ */ + if (!pool->attrs->affn_strict && + !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) { + struct work_struct *work = list_first_entry(&pool->worklist, + struct work_struct, entry); + p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask); + get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++; + } +#endif + wake_up_process(p); + return true; +} + #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT /* @@ -1120,10 +1308,9 @@ void wq_worker_sleeping(struct task_struct *task) } pool->nr_running--; - if (need_more_worker(pool)) { + if (kick_pool(pool)) worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; - wake_up_worker(pool); - } + raw_spin_unlock_irq(&pool->lock); } @@ -1171,10 +1358,8 @@ void wq_worker_tick(struct task_struct *task) wq_cpu_intensive_report(worker->current_func); pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; - if (need_more_worker(pool)) { + if (kick_pool(pool)) pwq->stats[PWQ_STAT_CM_WAKEUP]++; - wake_up_worker(pool); - } raw_spin_unlock(&pool->lock); } @@ -1211,94 +1396,6 @@ work_func_t wq_worker_last_func(struct task_struct *task) } /** - * find_worker_executing_work - find worker which is executing a work - * @pool: pool of interest - * @work: work to find worker for - * - * Find a worker which is executing @work on @pool by searching - * @pool->busy_hash which is keyed by the address of @work. For a worker - * to match, its current execution should match the address of @work and - * its work function. This is to avoid unwanted dependency between - * unrelated work executions through a work item being recycled while still - * being executed. - * - * This is a bit tricky. A work item may be freed once its execution - * starts and nothing prevents the freed area from being recycled for - * another work item. 
If the same work item address ends up being reused - * before the original execution finishes, workqueue will identify the - * recycled work item as currently executing and make it wait until the - * current execution finishes, introducing an unwanted dependency. - * - * This function checks the work item address and work function to avoid - * false positives. Note that this isn't complete as one may construct a - * work function which can introduce dependency onto itself through a - * recycled work item. Well, if somebody wants to shoot oneself in the - * foot that badly, there's only so much we can do, and if such deadlock - * actually occurs, it should be easy to locate the culprit work function. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - * - * Return: - * Pointer to worker which is executing @work if found, %NULL - * otherwise. - */ -static struct worker *find_worker_executing_work(struct worker_pool *pool, - struct work_struct *work) -{ - struct worker *worker; - - hash_for_each_possible(pool->busy_hash, worker, hentry, - (unsigned long)work) - if (worker->current_work == work && - worker->current_func == work->func) - return worker; - - return NULL; -} - -/** - * move_linked_works - move linked works to a list - * @work: start of series of works to be scheduled - * @head: target list to append @work to - * @nextp: out parameter for nested worklist walking - * - * Schedule linked works starting from @work to @head. Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work. This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). 
- */ -static void move_linked_works(struct work_struct *work, struct list_head *head, - struct work_struct **nextp) -{ - struct work_struct *n; - - /* - * Linked worklist will always end before the end of the list, - * use NULL for list head. - */ - list_for_each_entry_safe_from(work, n, NULL, entry) { - list_move_tail(&work->entry, head); - if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) - break; - } - - /* - * If we're already inside safe list traversal and have moved - * multiple works to the scheduled queue, the next position - * needs to be updated. - */ - if (nextp) - *nextp = n; -} - -/** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get * @@ -1324,17 +1421,11 @@ static void put_pwq(struct pool_workqueue *pwq) lockdep_assert_held(&pwq->pool->lock); if (likely(--pwq->refcnt)) return; - if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) - return; /* - * @pwq can't be released under pool->lock, bounce to - * pwq_unbound_release_workfn(). This never recurses on the same - * pool->lock as this path is taken only for unbound workqueues and - * the release work item is scheduled on a per-cpu workqueue. To - * avoid lockdep warning, unbound pool->locks are given lockdep - * subclass of 1 in get_unbound_pool(). + * @pwq can't be released under pool->lock, bounce to a dedicated + * kthread_worker to avoid A-A deadlocks. 
*/ - schedule_work(&pwq->unbound_release_work); + kthread_queue_work(pwq_release_worker, &pwq->release_work); } /** @@ -1550,7 +1641,7 @@ fail: static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct worker_pool *pool = pwq->pool; + debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack_noalloc(work); @@ -1559,9 +1650,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); get_pwq(pwq); - - if (__need_more_worker(pool)) - wake_up_worker(pool); } /* @@ -1615,8 +1703,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; - struct worker_pool *last_pool; - struct list_head *worklist; + struct worker_pool *last_pool, *pool; unsigned int work_flags; unsigned int req_cpu = cpu; @@ -1640,23 +1727,23 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ - if (wq->flags & WQ_UNBOUND) { - if (req_cpu == WORK_CPU_UNBOUND) + if (req_cpu == WORK_CPU_UNBOUND) { + if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); - pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - } else { - if (req_cpu == WORK_CPU_UNBOUND) + else cpu = raw_smp_processor_id(); - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); } + pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu)); + pool = pwq->pool; + /* * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. 
*/ last_pool = get_work_pool(work); - if (last_pool && last_pool != pwq->pool) { + if (last_pool && last_pool != pool) { struct worker *worker; raw_spin_lock(&last_pool->lock); @@ -1665,26 +1752,27 @@ retry: if (worker && worker->current_pwq->wq == wq) { pwq = worker->current_pwq; + pool = pwq->pool; + WARN_ON_ONCE(pool != last_pool); } else { /* meh... not running there, queue here */ raw_spin_unlock(&last_pool->lock); - raw_spin_lock(&pwq->pool->lock); + raw_spin_lock(&pool->lock); } } else { - raw_spin_lock(&pwq->pool->lock); + raw_spin_lock(&pool->lock); } /* - * pwq is determined and locked. For unbound pools, we could have - * raced with pwq release and it could already be dead. If its - * refcnt is zero, repeat pwq selection. Note that pwqs never die - * without another pwq replacing it in the numa_pwq_tbl or while - * work items are executing on it, so the retrying is guaranteed to - * make forward-progress. + * pwq is determined and locked. For unbound pools, we could have raced + * with pwq release and it could already be dead. If its refcnt is zero, + * repeat pwq selection. Note that unbound pwqs never die without + * another pwq replacing it in cpu_pwq or while work items are executing + * on it, so the retrying is guaranteed to make forward-progress. 
*/ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { - raw_spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pool->lock); cpu_relax(); goto retry; } @@ -1703,21 +1791,20 @@ retry: work_flags = work_color_to_flags(pwq->work_color); if (likely(pwq->nr_active < pwq->max_active)) { + if (list_empty(&pool->worklist)) + pool->watchdog_ts = jiffies; + trace_workqueue_activate_work(work); pwq->nr_active++; - worklist = &pwq->pool->worklist; - if (list_empty(worklist)) - pwq->pool->watchdog_ts = jiffies; + insert_work(pwq, work, &pool->worklist, work_flags); + kick_pool(pool); } else { work_flags |= WORK_STRUCT_INACTIVE; - worklist = &pwq->inactive_works; + insert_work(pwq, work, &pwq->inactive_works, work_flags); } - debug_work_activate(work); - insert_work(pwq, work, worklist, work_flags); - out: - raw_spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pool->lock); rcu_read_unlock(); } @@ -1754,7 +1841,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, EXPORT_SYMBOL(queue_work_on); /** - * workqueue_select_cpu_near - Select a CPU based on NUMA node + * select_numa_node_cpu - Select a CPU based on NUMA node * @node: NUMA node ID that we want to select a CPU from * * This function will attempt to find a "random" cpu available on a given @@ -1762,14 +1849,10 @@ EXPORT_SYMBOL(queue_work_on); * WORK_CPU_UNBOUND indicating that we should just schedule to any * available CPU if we need to schedule this work. 
*/ -static int workqueue_select_cpu_near(int node) +static int select_numa_node_cpu(int node) { int cpu; - /* No point in doing this if NUMA isn't enabled for workqueues */ - if (!wq_numa_enabled) - return WORK_CPU_UNBOUND; - /* Delay binding to CPU if node is not valid or online */ if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) return WORK_CPU_UNBOUND; @@ -1826,7 +1909,7 @@ bool queue_work_node(int node, struct workqueue_struct *wq, local_irq_save(flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - int cpu = workqueue_select_cpu_near(node); + int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); ret = true; @@ -1981,60 +2064,6 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) } EXPORT_SYMBOL(queue_rcu_work); -/** - * worker_enter_idle - enter idle state - * @worker: worker which is entering idle state - * - * @worker is entering idle state. Update stats and idle timer if - * necessary. - * - * LOCKING: - * raw_spin_lock_irq(pool->lock). - */ -static void worker_enter_idle(struct worker *worker) -{ - struct worker_pool *pool = worker->pool; - - if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || - WARN_ON_ONCE(!list_empty(&worker->entry) && - (worker->hentry.next || worker->hentry.pprev))) - return; - - /* can't use worker_set_flags(), also called from create_worker() */ - worker->flags |= WORKER_IDLE; - pool->nr_idle++; - worker->last_active = jiffies; - - /* idle_list is LIFO */ - list_add(&worker->entry, &pool->idle_list); - - if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) - mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); - - /* Sanity check nr_running. */ - WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); -} - -/** - * worker_leave_idle - leave idle state - * @worker: worker which is leaving idle state - * - * @worker is leaving idle state. Update stats. - * - * LOCKING: - * raw_spin_lock_irq(pool->lock). 
- */ -static void worker_leave_idle(struct worker *worker) -{ - struct worker_pool *pool = worker->pool; - - if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) - return; - worker_clr_flags(worker, WORKER_IDLE); - pool->nr_idle--; - list_del_init(&worker->entry); -} - static struct worker *alloc_worker(int node) { struct worker *worker; @@ -2050,6 +2079,14 @@ static struct worker *alloc_worker(int node) return worker; } +static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) +{ + if (pool->cpu < 0 && pool->attrs->affn_strict) + return pool->attrs->__pod_cpumask; + else + return pool->attrs->cpumask; +} + /** * worker_attach_to_pool() - attach a worker to a pool * @worker: worker to be attached @@ -2075,7 +2112,7 @@ static void worker_attach_to_pool(struct worker *worker, kthread_set_per_cpu(worker->task, pool->cpu); if (worker->rescue_wq) - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); list_add_tail(&worker->node, &pool->workers); worker->pool = pool; @@ -2167,16 +2204,25 @@ static struct worker *create_worker(struct worker_pool *pool) } set_user_nice(worker->task, pool->attrs->nice); - kthread_bind_mask(worker->task, pool->attrs->cpumask); + kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); /* start the newly created worker */ raw_spin_lock_irq(&pool->lock); + worker->pool->nr_workers++; worker_enter_idle(worker); + kick_pool(pool); + + /* + * @worker is waiting on a completion in kthread() and will trigger hung + * check if not woken up soon. As kick_pool() might not have waken it + * up, wake it up explicitly once more. 
+ */ wake_up_process(worker->task); + raw_spin_unlock_irq(&pool->lock); return worker; @@ -2304,9 +2350,8 @@ static void idle_worker_timeout(struct timer_list *t) static void idle_cull_fn(struct work_struct *work) { struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); - struct list_head cull_list; + LIST_HEAD(cull_list); - INIT_LIST_HEAD(&cull_list); /* * Grabbing wq_pool_attach_mutex here ensures an already-running worker * cannot proceed beyong worker_detach_from_pool() in its self-destruct @@ -2495,7 +2540,6 @@ __acquires(&pool->lock) struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; unsigned long work_data; - struct worker *collision; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from @@ -2512,18 +2556,6 @@ __acquires(&pool->lock) WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); - /* - * A single work shouldn't be executed concurrently by - * multiple workers on a single cpu. Check whether anyone is - * already processing the work. If so, defer the work to the - * currently executing one. - */ - collision = find_worker_executing_work(pool, work); - if (unlikely(collision)) { - move_linked_works(work, &collision->scheduled, NULL); - return; - } - /* claim and dequeue */ debug_work_deactivate(work); hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); @@ -2552,14 +2584,12 @@ __acquires(&pool->lock) worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* - * Wake up another worker if necessary. The condition is always - * false for normal per-cpu workers since nr_running would always - * be >= 1 at this point. This is used to chain execution of the - * pending work items for WORKER_NOT_RUNNING workers such as the - * UNBOUND and CPU_INTENSIVE ones. + * Kick @pool if necessary. It's always noop for per-cpu worker pools + * since nr_running would always be >= 1 at this point. 
This is used to + * chain execution of the pending work items for WORKER_NOT_RUNNING + * workers such as the UNBOUND and CPU_INTENSIVE ones. */ - if (need_more_worker(pool)) - wake_up_worker(pool); + kick_pool(pool); /* * Record the last pool and clear PENDING which should be the last @@ -2569,6 +2599,7 @@ __acquires(&pool->lock) */ set_work_pool_and_clear_pending(work, pool->id); + pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); lock_map_acquire(&pwq->wq->lockdep_map); @@ -2595,7 +2626,6 @@ __acquires(&pool->lock) * workqueues), so hiding them isn't a problem. */ lockdep_invariant_state(true); - pwq->stats[PWQ_STAT_STARTED]++; trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2661,9 +2691,15 @@ __acquires(&pool->lock) */ static void process_scheduled_works(struct worker *worker) { - while (!list_empty(&worker->scheduled)) { - struct work_struct *work = list_first_entry(&worker->scheduled, - struct work_struct, entry); + struct work_struct *work; + bool first = true; + + while ((work = list_first_entry_or_null(&worker->scheduled, + struct work_struct, entry))) { + if (first) { + worker->pool->watchdog_ts = jiffies; + first = false; + } process_one_work(worker, work); } } @@ -2744,17 +2780,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); - pool->watchdog_ts = jiffies; - - if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { - /* optimization path, not strictly necessary */ - process_one_work(worker, work); - if (unlikely(!list_empty(&worker->scheduled))) - process_scheduled_works(worker); - } else { - move_linked_works(work, &worker->scheduled, NULL); + if (assign_work(work, worker, NULL)) process_scheduled_works(worker); - } } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP); @@ -2798,7 +2825,6 @@ static int rescuer_thread(void *__rescuer) { struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; - struct list_head *scheduled = 
&rescuer->scheduled; bool should_stop; set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2829,7 +2855,6 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; - bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2844,18 +2869,14 @@ repeat: * Slurp in all works issued via this workqueue and * process'em. */ - WARN_ON_ONCE(!list_empty(scheduled)); + WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) { - if (get_work_pwq(work) == pwq) { - if (first) - pool->watchdog_ts = jiffies; - move_linked_works(work, scheduled, &n); + if (get_work_pwq(work) == pwq && + assign_work(work, rescuer, &n)) pwq->stats[PWQ_STAT_RESCUED]++; - } - first = false; } - if (!list_empty(scheduled)) { + if (!list_empty(&rescuer->scheduled)) { process_scheduled_works(rescuer); /* @@ -2888,12 +2909,10 @@ repeat: put_pwq(pwq); /* - * Leave this pool. If need_more_worker() is %true, notify a - * regular worker; otherwise, we end up with 0 concurrency - * and stalling the execution. + * Leave this pool. Notify regular workers; otherwise, we end up + * with 0 concurrency and stalling the execution. 
*/ - if (need_more_worker(pool)) - wake_up_worker(pool); + kick_pool(pool); raw_spin_unlock_irq(&pool->lock); @@ -3028,7 +3047,6 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, pwq->nr_in_flight[work_color]++; work_flags |= work_color_to_flags(work_color); - debug_work_activate(&barr->work); insert_work(pwq, &barr->work, head, work_flags); } @@ -3691,6 +3709,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); + free_cpumask_var(attrs->__pod_cpumask); kfree(attrs); } } @@ -3712,8 +3731,11 @@ struct workqueue_attrs *alloc_workqueue_attrs(void) goto fail; if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) goto fail; + if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) + goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); + attrs->affn_scope = WQ_AFFN_DFL; return attrs; fail: free_workqueue_attrs(attrs); @@ -3725,12 +3747,26 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); + cpumask_copy(to->__pod_cpumask, from->__pod_cpumask); + to->affn_strict = from->affn_strict; + /* - * Unlike hash and equality test, this function doesn't ignore - * ->no_numa as it is used for both pool and wq attrs. Instead, - * get_unbound_pool() explicitly clears ->no_numa after copying. + * Unlike hash and equality test, copying shouldn't ignore wq-only + * fields as copying is used for both pool and wq attrs. Instead, + * get_unbound_pool() explicitly clears the fields. */ - to->no_numa = from->no_numa; + to->affn_scope = from->affn_scope; + to->ordered = from->ordered; +} + +/* + * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the + * comments in 'struct workqueue_attrs' definition. 
+ */ +static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) +{ + attrs->affn_scope = WQ_AFFN_NR_TYPES; + attrs->ordered = false; } /* hash value of the content of @attr */ @@ -3741,6 +3777,9 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) hash = jhash_1word(attrs->nice, hash); hash = jhash(cpumask_bits(attrs->cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash(cpumask_bits(attrs->__pod_cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash_1word(attrs->affn_strict, hash); return hash; } @@ -3752,9 +3791,57 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, return false; if (!cpumask_equal(a->cpumask, b->cpumask)) return false; + if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) + return false; + if (a->affn_strict != b->affn_strict) + return false; return true; } +/* Update @attrs with actually available CPUs */ +static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, + const cpumask_t *unbound_cpumask) +{ + /* + * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If + * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to + * @unbound_cpumask. 
+ */ + cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask); + if (unlikely(cpumask_empty(attrs->cpumask))) + cpumask_copy(attrs->cpumask, unbound_cpumask); +} + +/* find wq_pod_type to use for @attrs */ +static const struct wq_pod_type * +wqattrs_pod_type(const struct workqueue_attrs *attrs) +{ + enum wq_affn_scope scope; + struct wq_pod_type *pt; + + /* to synchronize access to wq_affn_dfl */ + lockdep_assert_held(&wq_pool_mutex); + + if (attrs->affn_scope == WQ_AFFN_DFL) + scope = wq_affn_dfl; + else + scope = attrs->affn_scope; + + pt = &wq_pod_types[scope]; + + if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) && + likely(pt->nr_pods)) + return pt; + + /* + * Before workqueue_init_topology(), only SYSTEM is available which is + * initialized in workqueue_init_early(). + */ + pt = &wq_pod_types[WQ_AFFN_SYSTEM]; + BUG_ON(!pt->nr_pods); + return pt; +} + /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize @@ -3793,6 +3880,9 @@ static int init_worker_pool(struct worker_pool *pool) pool->attrs = alloc_workqueue_attrs(); if (!pool->attrs) return -ENOMEM; + + wqattrs_clear_for_pool(pool->attrs); + return 0; } @@ -3840,12 +3930,8 @@ static void rcu_free_wq(struct rcu_head *rcu) container_of(rcu, struct workqueue_struct, rcu); wq_free_lockdep(wq); - - if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_pwqs); - else - free_workqueue_attrs(wq->unbound_attrs); - + free_percpu(wq->cpu_pwq); + free_workqueue_attrs(wq->unbound_attrs); kfree(wq); } @@ -3872,10 +3958,8 @@ static void rcu_free_pool(struct rcu_head *rcu) static void put_unbound_pool(struct worker_pool *pool) { DECLARE_COMPLETION_ONSTACK(detach_completion); - struct list_head cull_list; struct worker *worker; - - INIT_LIST_HEAD(&cull_list); + LIST_HEAD(cull_list); lockdep_assert_held(&wq_pool_mutex); @@ -3959,10 +4043,10 @@ static void put_unbound_pool(struct worker_pool *pool) */ static struct worker_pool *get_unbound_pool(const struct 
workqueue_attrs *attrs) { + struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - int node; - int target_node = NUMA_NO_NODE; + int pod, node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); @@ -3974,31 +4058,22 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) } } - /* if cpumask is contained inside a NUMA node, we belong to that node */ - if (wq_numa_enabled) { - for_each_node(node) { - if (cpumask_subset(attrs->cpumask, - wq_numa_possible_cpumask[node])) { - target_node = node; - break; - } + /* If __pod_cpumask is contained inside a NUMA pod, that's our node */ + for (pod = 0; pod < pt->nr_pods; pod++) { + if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) { + node = pt->pod_node[pod]; + break; } } /* nope, create a new one */ - pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node); + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node); if (!pool || init_worker_pool(pool) < 0) goto fail; - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ + pool->node = node; copy_workqueue_attrs(pool->attrs, attrs); - pool->node = target_node; - - /* - * no_numa isn't a worker_pool attribute, always clear it. See - * 'struct workqueue_attrs' comments for detail. - */ - pool->attrs->no_numa = false; + wqattrs_clear_for_pool(pool->attrs); if (worker_pool_assign_id(pool) < 0) goto fail; @@ -4024,34 +4099,33 @@ static void rcu_free_pwq(struct rcu_head *rcu) } /* - * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt - * and needs to be destroyed. + * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero + * refcnt and needs to be destroyed. 
*/ -static void pwq_unbound_release_workfn(struct work_struct *work) +static void pwq_release_workfn(struct kthread_work *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, - unbound_release_work); + release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; bool is_last = false; /* - * when @pwq is not linked, it doesn't hold any reference to the + * When @pwq is not linked, it doesn't hold any reference to the * @wq, and @wq is invalid to access. */ if (!list_empty(&pwq->pwqs_node)) { - if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) - return; - mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); mutex_unlock(&wq->mutex); } - mutex_lock(&wq_pool_mutex); - put_unbound_pool(pool); - mutex_unlock(&wq_pool_mutex); + if (wq->flags & WQ_UNBOUND) { + mutex_lock(&wq_pool_mutex); + put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + } call_rcu(&pwq->rcu, rcu_free_pwq); @@ -4095,24 +4169,13 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) * is updated and visible. */ if (!freezable || !workqueue_freezing) { - bool kick = false; - pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->inactive_works) && - pwq->nr_active < pwq->max_active) { + pwq->nr_active < pwq->max_active) pwq_activate_first_inactive(pwq); - kick = true; - } - /* - * Need to kick a worker after thawed or an unbound wq's - * max_active is bumped. In realtime scenarios, always kicking a - * worker will cause interference on the isolated cpu cores, so - * let's kick iff work items were activated. 
- */ - if (kick) - wake_up_worker(pwq->pool); + kick_pool(pwq->pool); } else { pwq->max_active = 0; } @@ -4135,7 +4198,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); - INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); + kthread_init_work(&pwq->release_work, pwq_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ @@ -4183,61 +4246,49 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, } /** - * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node + * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue - * @node: the target NUMA node + * @cpu: the target CPU * @cpu_going_down: if >= 0, the CPU to consider as offline - * @cpumask: outarg, the resulting cpumask - * - * Calculate the cpumask a workqueue with @attrs should use on @node. If - * @cpu_going_down is >= 0, that cpu is considered offline during - * calculation. The result is stored in @cpumask. * - * If NUMA affinity is not enabled, @attrs->cpumask is always used. If - * enabled and @node has online CPUs requested by @attrs, the returned - * cpumask is the intersection of the possible CPUs of @node and - * @attrs->cpumask. + * Calculate the cpumask a workqueue with @attrs should use on @pod. If + * @cpu_going_down is >= 0, that cpu is considered offline during calculation. + * The result is stored in @attrs->__pod_cpumask. * - * The caller is responsible for ensuring that the cpumask of @node stays - * stable. + * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled + * and @pod has online CPUs requested by @attrs, the returned cpumask is the + * intersection of the possible CPUs of @pod and @attrs->cpumask. 
* - * Return: %true if the resulting @cpumask is different from @attrs->cpumask, - * %false if equal. + * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ -static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, - int cpu_going_down, cpumask_t *cpumask) +static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, + int cpu_going_down) { - if (!wq_numa_enabled || attrs->no_numa) - goto use_dfl; + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int pod = pt->cpu_pod[cpu]; - /* does @node have any online CPUs @attrs wants? */ - cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); + /* does @pod have any online CPUs @attrs wants? */ + cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); + cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask); if (cpu_going_down >= 0) - cpumask_clear_cpu(cpu_going_down, cpumask); + cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask); - if (cpumask_empty(cpumask)) - goto use_dfl; + if (cpumask_empty(attrs->__pod_cpumask)) { + cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); + return; + } - /* yeap, return possible CPUs in @node that @attrs wants */ - cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + /* yeap, return possible CPUs in @pod that @attrs wants */ + cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]); - if (cpumask_empty(cpumask)) { + if (cpumask_empty(attrs->__pod_cpumask)) pr_warn_once("WARNING: workqueue cpumask: online intersect > " "possible intersect\n"); - return false; - } - - return !cpumask_equal(cpumask, attrs->cpumask); - -use_dfl: - cpumask_copy(cpumask, attrs->cpumask); - return false; } -/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ -static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, - int node, - struct pool_workqueue *pwq) +/* install @pwq into @wq's cpu_pwq and return the old pwq */ 
+static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, + int cpu, struct pool_workqueue *pwq) { struct pool_workqueue *old_pwq; @@ -4247,8 +4298,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, /* link_pwq() can handle duplicate calls */ link_pwq(pwq); - old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); - rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); + rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq); return old_pwq; } @@ -4265,10 +4316,10 @@ struct apply_wqattrs_ctx { static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) { if (ctx) { - int node; + int cpu; - for_each_node(node) - put_pwq_unlocked(ctx->pwq_tbl[node]); + for_each_possible_cpu(cpu) + put_pwq_unlocked(ctx->pwq_tbl[cpu]); put_pwq_unlocked(ctx->dfl_pwq); free_workqueue_attrs(ctx->attrs); @@ -4284,76 +4335,64 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, const cpumask_var_t unbound_cpumask) { struct apply_wqattrs_ctx *ctx; - struct workqueue_attrs *new_attrs, *tmp_attrs; - int node; + struct workqueue_attrs *new_attrs; + int cpu; lockdep_assert_held(&wq_pool_mutex); - ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); + if (WARN_ON(attrs->affn_scope < 0 || + attrs->affn_scope >= WQ_AFFN_NR_TYPES)) + return ERR_PTR(-EINVAL); + + ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(); - tmp_attrs = alloc_workqueue_attrs(); - if (!ctx || !new_attrs || !tmp_attrs) + if (!ctx || !new_attrs) goto out_free; /* - * Calculate the attrs of the default pwq with unbound_cpumask - * which is wq_unbound_cpumask or to set to wq_unbound_cpumask. - * If the user configured cpumask doesn't overlap with the - * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. 
- */ - copy_workqueue_attrs(new_attrs, attrs); - cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask); - if (unlikely(cpumask_empty(new_attrs->cpumask))) - cpumask_copy(new_attrs->cpumask, unbound_cpumask); - - /* - * We may create multiple pwqs with differing cpumasks. Make a - * copy of @new_attrs which will be modified and used to obtain - * pools. - */ - copy_workqueue_attrs(tmp_attrs, new_attrs); - - /* * If something goes wrong during CPU up/down, we'll fall back to * the default pwq covering whole @attrs->cpumask. Always create * it even if we don't use it immediately. */ + copy_workqueue_attrs(new_attrs, attrs); + wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); + cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); if (!ctx->dfl_pwq) goto out_free; - for_each_node(node) { - if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) { - ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); - if (!ctx->pwq_tbl[node]) - goto out_free; - } else { + for_each_possible_cpu(cpu) { + if (new_attrs->ordered) { ctx->dfl_pwq->refcnt++; - ctx->pwq_tbl[node] = ctx->dfl_pwq; + ctx->pwq_tbl[cpu] = ctx->dfl_pwq; + } else { + wq_calc_pod_cpumask(new_attrs, cpu, -1); + ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); + if (!ctx->pwq_tbl[cpu]) + goto out_free; } } /* save the user configured attrs and sanitize it. 
*/ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->attrs = new_attrs; ctx->wq = wq; - free_workqueue_attrs(tmp_attrs); return ctx; out_free: - free_workqueue_attrs(tmp_attrs); free_workqueue_attrs(new_attrs); apply_wqattrs_cleanup(ctx); - return NULL; + return ERR_PTR(-ENOMEM); } /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) { - int node; + int cpu; /* all pwqs have been created successfully, let's install'em */ mutex_lock(&ctx->wq->mutex); @@ -4361,9 +4400,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwq and install the new one */ - for_each_node(node) - ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node, - ctx->pwq_tbl[node]); + for_each_possible_cpu(cpu) + ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, + ctx->pwq_tbl[cpu]); /* @dfl_pwq might not have been used, ensure it's linked */ link_pwq(ctx->dfl_pwq); @@ -4403,8 +4442,8 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, } ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); - if (!ctx) - return -ENOMEM; + if (IS_ERR(ctx)) + return PTR_ERR(ctx); /* the ctx has been prepared successfully, let's commit it */ apply_wqattrs_commit(ctx); @@ -4418,12 +4457,11 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * @wq: the target workqueue * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() * - * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA - * machines, this function maps a separate pwq to each NUMA node with - * possibles CPUs in @attrs->cpumask so that work items are affine to the - * NUMA node it was issued on. Older pwqs are released as in-flight work - * items finish. 
Note that a work item which repeatedly requeues itself - * back-to-back will stay on its current pwq. + * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps + * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that + * work items are affine to the pod it was issued on. Older pwqs are released as + * in-flight work items finish. Note that a work item which repeatedly requeues + * itself back-to-back will stay on its current pwq. * * Performs GFP_KERNEL allocations. * @@ -4446,40 +4484,37 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, } /** - * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug + * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug * @wq: the target workqueue - * @cpu: the CPU coming up or going down + * @cpu: the CPU to update pool association for + * @hotplug_cpu: the CPU coming up or going down * @online: whether @cpu is coming up or going down * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and - * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of + * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of * @wq accordingly. * - * If NUMA affinity can't be adjusted due to memory allocation failure, it - * falls back to @wq->dfl_pwq which may not be optimal but is always - * correct. - * - * Note that when the last allowed CPU of a NUMA node goes offline for a - * workqueue with a cpumask spanning multiple nodes, the workers which were - * already executing the work items for the workqueue will lose their CPU - * affinity and may execute on any CPU. This is similar to how per-cpu - * workqueues behave on CPU_DOWN. If a workqueue user wants strict - * affinity, it's the user's responsibility to flush the work item from - * CPU_DOWN_PREPARE. 
+ * + * If pod affinity can't be adjusted due to memory allocation failure, it falls + * back to @wq->dfl_pwq which may not be optimal but is always correct. + * + * Note that when the last allowed CPU of a pod goes offline for a workqueue + * with a cpumask spanning multiple pods, the workers which were already + * executing the work items for the workqueue will lose their CPU affinity and + * may execute on any CPU. This is similar to how per-cpu workqueues behave on + * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's + * responsibility to flush the work item from CPU_DOWN_PREPARE. */ -static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, - bool online) +static void wq_update_pod(struct workqueue_struct *wq, int cpu, + int hotplug_cpu, bool online) { - int node = cpu_to_node(cpu); - int cpu_off = online ? -1 : cpu; + int off_cpu = online ? -1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; - cpumask_t *cpumask; lockdep_assert_held(&wq_pool_mutex); - if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) || - wq->unbound_attrs->no_numa) + if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered) return; /* @@ -4487,36 +4522,29 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. */ - target_attrs = wq_update_unbound_numa_attrs_buf; - cpumask = target_attrs->cpumask; + target_attrs = wq_update_pod_attrs_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); - pwq = unbound_pwq_by_node(wq, node); + wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); - /* - * Let's determine what needs to be done. If the target cpumask is - * different from the default pwq's, we need to compare it to @pwq's - * and create a new one if they don't match. If the target cpumask - * equals the default pwq's, the default pwq should be used. 
- */ - if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) { - if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) - return; - } else { - goto use_dfl_pwq; - } + /* nothing to do if the target cpumask matches the current pwq */ + wq_calc_pod_cpumask(target_attrs, cpu, off_cpu); + pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), + lockdep_is_held(&wq_pool_mutex)); + if (wqattrs_equal(target_attrs, pwq->pool->attrs)) + return; /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { - pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", wq->name); goto use_dfl_pwq; } /* Install the new pwq. */ mutex_lock(&wq->mutex); - old_pwq = numa_pwq_tbl_install(wq, node, pwq); + old_pwq = install_unbound_pwq(wq, cpu, pwq); goto out_unlock; use_dfl_pwq: @@ -4524,7 +4552,7 @@ use_dfl_pwq: raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); - old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); + old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); @@ -4535,21 +4563,26 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; - if (!(wq->flags & WQ_UNBOUND)) { - wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); - if (!wq->cpu_pwqs) - return -ENOMEM; + wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); + if (!wq->cpu_pwq) + goto enomem; + if (!(wq->flags & WQ_UNBOUND)) { for_each_possible_cpu(cpu) { - struct pool_workqueue *pwq = - per_cpu_ptr(wq->cpu_pwqs, cpu); - struct worker_pool *cpu_pools = - per_cpu(cpu_worker_pools, cpu); + struct pool_workqueue **pwq_p = + per_cpu_ptr(wq->cpu_pwq, cpu); + struct worker_pool *pool = + &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]); - init_pwq(pwq, wq, &cpu_pools[highpri]); + 
*pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, + pool->node); + if (!*pwq_p) + goto enomem; + + init_pwq(*pwq_p, wq, pool); mutex_lock(&wq->mutex); - link_pwq(pwq); + link_pwq(*pwq_p); mutex_unlock(&wq->mutex); } return 0; @@ -4568,18 +4601,25 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) cpus_read_unlock(); return ret; + +enomem: + if (wq->cpu_pwq) { + for_each_possible_cpu(cpu) + kfree(*per_cpu_ptr(wq->cpu_pwq, cpu)); + free_percpu(wq->cpu_pwq); + wq->cpu_pwq = NULL; + } + return -ENOMEM; } static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { - int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; - - if (max_active < 1 || max_active > lim) + if (max_active < 1 || max_active > WQ_MAX_ACTIVE) pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", - max_active, name, 1, lim); + max_active, name, 1, WQ_MAX_ACTIVE); - return clamp_val(max_active, 1, lim); + return clamp_val(max_active, 1, WQ_MAX_ACTIVE); } /* @@ -4602,7 +4642,7 @@ static int init_rescuer(struct workqueue_struct *wq) } rescuer->rescue_wq = wq; - rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); + rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", @@ -4623,17 +4663,15 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...) { - size_t tbl_size = 0; va_list args; struct workqueue_struct *wq; struct pool_workqueue *pwq; /* - * Unbound && max_active == 1 used to imply ordered, which is no - * longer the case on NUMA machines due to per-node pools. While + * Unbound && max_active == 1 used to imply ordered, which is no longer + * the case on many machines due to per-pod pools. 
While * alloc_ordered_workqueue() is the right way to create an ordered - * workqueue, keep the previous behavior to avoid subtle breakages - * on NUMA. + * workqueue, keep the previous behavior to avoid subtle breakages. */ if ((flags & WQ_UNBOUND) && max_active == 1) flags |= __WQ_ORDERED; @@ -4643,10 +4681,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, flags |= WQ_UNBOUND; /* allocate wq and format name */ - if (flags & WQ_UNBOUND) - tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]); - - wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; @@ -4741,7 +4776,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; - int node; + int cpu; /* * Remove it from sysfs first so that sanity check failure doesn't @@ -4800,33 +4835,23 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); - if (!(wq->flags & WQ_UNBOUND)) { - wq_unregister_lockdep(wq); - /* - * The base ref is never dropped on per-cpu pwqs. Directly - * schedule RCU free. - */ - call_rcu(&wq->rcu, rcu_free_wq); - } else { - /* - * We're the sole accessor of @wq at this point. Directly - * access numa_pwq_tbl[] and dfl_pwq to put the base refs. - * @wq will be freed when the last pwq is released. - */ - for_each_node(node) { - pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); - RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); - put_pwq_unlocked(pwq); - } + /* + * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq + * to put the base refs. @wq will be auto-destroyed from the last + * pwq_put. RCU read lock prevents @wq from going away from under us. + */ + rcu_read_lock(); - /* - * Put dfl_pwq. @wq may be freed any time after dfl_pwq is - * put. Don't access it afterwards. 
- */ - pwq = wq->dfl_pwq; - wq->dfl_pwq = NULL; + for_each_possible_cpu(cpu) { + pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); + RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL); put_pwq_unlocked(pwq); } + + put_pwq_unlocked(wq->dfl_pwq); + wq->dfl_pwq = NULL; + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(destroy_workqueue); @@ -4903,10 +4928,11 @@ bool current_is_workqueue_rescuer(void) * unreliable and only useful as advisory hints or for debugging. * * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. - * Note that both per-cpu and unbound workqueues may be associated with - * multiple pool_workqueues which have separate congested states. A - * workqueue being congested on one CPU doesn't mean the workqueue is also - * contested on other CPUs / NUMA nodes. + * + * With the exception of ordered workqueues, all workqueues have per-cpu + * pool_workqueues, each with its own congested state. A workqueue being + * congested on one CPU doesn't mean that the workqueue is contested on any + * other CPUs. * * Return: * %true if congested, %false otherwise. @@ -4922,12 +4948,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); - if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - else - pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - + pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); ret = !list_empty(&pwq->inactive_works); + preempt_enable(); rcu_read_unlock(); @@ -5402,7 +5425,7 @@ static void unbind_workers(int cpu) * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. 
*/ - wake_up_worker(pool); + kick_pool(pool); raw_spin_unlock_irq(&pool->lock); @@ -5435,7 +5458,7 @@ static void rebind_workers(struct worker_pool *pool) for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, pool->cpu); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, - pool->attrs->cpumask) < 0); + pool_allowed_cpus(pool)) < 0); } raw_spin_lock_irq(&pool->lock); @@ -5529,9 +5552,18 @@ int workqueue_online_cpu(unsigned int cpu) mutex_unlock(&wq_pool_attach_mutex); } - /* update NUMA affinity of unbound workqueues */ - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, true); + /* update pod affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) { + struct workqueue_attrs *attrs = wq->unbound_attrs; + + if (attrs) { + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int tcpu; + + for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) + wq_update_pod(wq, tcpu, cpu, true); + } + } mutex_unlock(&wq_pool_mutex); return 0; @@ -5547,10 +5579,19 @@ int workqueue_offline_cpu(unsigned int cpu) unbind_workers(cpu); - /* update NUMA affinity of unbound workqueues */ + /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, false); + list_for_each_entry(wq, &workqueues, list) { + struct workqueue_attrs *attrs = wq->unbound_attrs; + + if (attrs) { + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int tcpu; + + for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) + wq_update_pod(wq, tcpu, cpu, false); + } + } mutex_unlock(&wq_pool_mutex); return 0; @@ -5746,8 +5787,8 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) continue; ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); - if (!ctx) { - ret = -ENOMEM; + if (IS_ERR(ctx)) { + ret = PTR_ERR(ctx); break; } @@ -5805,21 +5846,72 @@ out_unlock: return ret; } +static int parse_affn_scope(const char *val) +{ 
+ int i; + + for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { + if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) + return i; + } + return -EINVAL; +} + +static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) +{ + struct workqueue_struct *wq; + int affn, cpu; + + affn = parse_affn_scope(val); + if (affn < 0) + return affn; + if (affn == WQ_AFFN_DFL) + return -EINVAL; + + cpus_read_lock(); + mutex_lock(&wq_pool_mutex); + + wq_affn_dfl = affn; + + list_for_each_entry(wq, &workqueues, list) { + for_each_online_cpu(cpu) { + wq_update_pod(wq, cpu, cpu, true); + } + } + + mutex_unlock(&wq_pool_mutex); + cpus_read_unlock(); + + return 0; +} + +static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp) +{ + return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]); +} + +static const struct kernel_param_ops wq_affn_dfl_ops = { + .set = wq_affn_dfl_set, + .get = wq_affn_dfl_get, +}; + +module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); + #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the * following attributes. * - * per_cpu RO bool : whether the workqueue is per-cpu or unbound - * max_active RW int : maximum number of in-flight work items + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items * * Unbound workqueues have the following extra attributes. 
* - * pool_ids RO int : the associated pool IDs for each node - * nice RW int : nice value of the workers - * cpumask RW mask : bitmask of allowed CPUs for the workers - * numa RW bool : whether enable NUMA affinity + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) + * affinity_strict RW bool : worker CPU affinity is strict */ struct wq_device { struct workqueue_struct *wq; @@ -5872,28 +5964,6 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); -static ssize_t wq_pool_ids_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - const char *delim = ""; - int node, written = 0; - - cpus_read_lock(); - rcu_read_lock(); - for_each_node(node) { - written += scnprintf(buf + written, PAGE_SIZE - written, - "%s%d:%d", delim, node, - unbound_pwq_by_node(wq, node)->pool->id); - delim = " "; - } - written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock(); - cpus_read_unlock(); - - return written; -} - static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -5984,50 +6054,84 @@ out_unlock: return ret ?: count; } -static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t wq_affn_scope_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - !wq->unbound_attrs->no_numa); + if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) + written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n", + wq_affn_names[WQ_AFFN_DFL], + wq_affn_names[wq_affn_dfl]); + else + written = scnprintf(buf, PAGE_SIZE, "%s\n", + wq_affn_names[wq->unbound_attrs->affn_scope]); mutex_unlock(&wq->mutex); return written; } -static ssize_t 
wq_numa_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t wq_affn_scope_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; - int v, ret = -ENOMEM; + int affn, ret = -ENOMEM; - apply_wqattrs_lock(); + affn = parse_affn_scope(buf); + if (affn < 0) + return affn; + apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - goto out_unlock; - - ret = -EINVAL; - if (sscanf(buf, "%d", &v) == 1) { - attrs->no_numa = !v; + if (attrs) { + attrs->affn_scope = affn; ret = apply_workqueue_attrs_locked(wq, attrs); } + apply_wqattrs_unlock(); + free_workqueue_attrs(attrs); + return ret ?: count; +} -out_unlock: +static ssize_t wq_affinity_strict_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + wq->unbound_attrs->affn_strict); +} + +static ssize_t wq_affinity_strict_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret = -ENOMEM; + + if (sscanf(buf, "%d", &v) != 1) + return -EINVAL; + + apply_wqattrs_lock(); + attrs = wq_sysfs_prep_attrs(wq); + if (attrs) { + attrs->affn_strict = (bool)v; + ret = apply_workqueue_attrs_locked(wq, attrs); + } apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), - __ATTR(numa, 0644, wq_numa_show, wq_numa_store), + __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), + __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store), __ATTR_NULL, }; 
@@ -6393,62 +6497,19 @@ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ -static void __init wq_numa_init(void) -{ - cpumask_var_t *tbl; - int node, cpu; - - if (num_possible_nodes() <= 1) - return; - - if (wq_disable_numa) { - pr_info("workqueue: NUMA affinity support disabled\n"); - return; - } - - for_each_possible_cpu(cpu) { - if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { - pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); - return; - } - } - - wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_unbound_numa_attrs_buf); - - /* - * We want masks of possible CPUs of each node which isn't readily - * available. Build one from cpu_to_node() which should have been - * fully initialized by now. - */ - tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL); - BUG_ON(!tbl); - - for_each_node(node) - BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, - node_online(node) ? node : NUMA_NO_NODE)); - - for_each_possible_cpu(cpu) { - node = cpu_to_node(cpu); - cpumask_set_cpu(cpu, tbl[node]); - } - - wq_numa_possible_cpumask = tbl; - wq_numa_enabled = true; -} - /** * workqueue_init_early - early init for workqueue subsystem * - * This is the first half of two-staged workqueue subsystem initialization - * and invoked as soon as the bare basics - memory allocation, cpumasks and - * idr are up. It sets up all the data structures and system workqueues - * and allows early boot code to create workqueues and queue/cancel work - * items. Actual work item execution starts only after kthreads can be - * created and scheduled right before early initcalls. + * This is the first step of three-staged workqueue subsystem initialization and + * invoked as soon as the bare basics - memory allocation, cpumasks and idr are + * up. It sets up all the data structures and system workqueues and allows early + * boot code to create workqueues and queue/cancel work items. 
Actual work item + * execution starts only after kthreads can be created and scheduled right + * before early initcalls. */ void __init workqueue_init_early(void) { + struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; @@ -6458,8 +6519,30 @@ void __init workqueue_init_early(void) cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + if (!cpumask_empty(&wq_cmdline_cpumask)) + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); + wq_update_pod_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!wq_update_pod_attrs_buf); + + /* initialize WQ_AFFN_SYSTEM pods */ + pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL); + pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL); + pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); + BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod); + + BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); + + wq_update_pod_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!wq_update_pod_attrs_buf); + + pt->nr_pods = 1; + cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); + pt->pod_node[0] = NUMA_NO_NODE; + pt->cpu_pod[0] = 0; + /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; @@ -6469,7 +6552,9 @@ void __init workqueue_init_early(void) BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); + cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); pool->attrs->nice = std_nice[i++]; + pool->attrs->affn_strict = true; pool->node = cpu_to_node(cpu); /* alloc pool ID */ @@ -6490,11 +6575,10 @@ void __init workqueue_init_early(void) /* * An ordered wq should have only one pwq as ordering is * guaranteed by max_active which is enforced by pwqs. 
- * Turn off NUMA so that dfl_pwq is used for all nodes. */ BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; - attrs->no_numa = true; + attrs->ordered = true; ordered_wq_attrs[i] = attrs; } @@ -6502,7 +6586,7 @@ void __init workqueue_init_early(void) system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, - WQ_UNBOUND_MAX_ACTIVE); + WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", @@ -6525,6 +6609,9 @@ static void __init wq_cpu_intensive_thresh_init(void) if (wq_cpu_intensive_thresh_us != ULONG_MAX) return; + pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + BUG_ON(IS_ERR(pwq_release_worker)); + /* * The default of 10ms is derived from the fact that most modern (as of * 2023) processors can do a lot in 10ms and that it's just below what @@ -6555,11 +6642,11 @@ static void __init wq_cpu_intensive_thresh_init(void) /** * workqueue_init - bring workqueue subsystem fully online * - * This is the latter half of two-staged workqueue subsystem initialization - * and invoked as soon as kthreads can be created and scheduled. - * Workqueues have been created and work items queued on them, but there - * are no kworkers executing the work items yet. Populate the worker pools - * with the initial workers and enable future kworker creations. + * This is the second step of three-staged workqueue subsystem initialization + * and invoked as soon as kthreads can be created and scheduled. Workqueues have + * been created and work items queued on them, but there are no kworkers + * executing the work items yet. Populate the worker pools with the initial + * workers and enable future kworker creations. 
 */ void __init workqueue_init(void) { @@ -6569,19 +6656,12 @@ void __init workqueue_init(void) wq_cpu_intensive_thresh_init(); - /* - * It'd be simpler to initialize NUMA in workqueue_init_early() but - * CPU to node mapping may not be available that early on some - * archs such as power and arm64. As per-cpu pools created - * previously could be missing node hint and unbound pools NUMA - * affinity, fix them up. - * - * Also, while iterating workqueues, create rescuers if requested. - */ - wq_numa_init(); - mutex_lock(&wq_pool_mutex); + /* + * Per-cpu pools created earlier could be missing node hint. Fix them + * up. Also, create a rescuer for workqueues that requested it. + */ for_each_possible_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { pool->node = cpu_to_node(cpu); @@ -6589,7 +6669,6 @@ void __init workqueue_init(void) } list_for_each_entry(wq, &workqueues, list) { - wq_update_unbound_numa(wq, smp_processor_id(), true); WARN(init_rescuer(wq), "workqueue: failed to create early rescuer for %s", wq->name); @@ -6613,9 +6692,114 @@ void __init workqueue_init(void) } /* - * Despite the naming, this is a no-op function which is here only for avoiding - * link error. Since compile-time warning may fail to catch, we will need to - * emit run-time warning from __flush_workqueue(). + * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to + * @cpus_share_pod(). Each subset of CPUs that share a pod is assigned a unique + * and consecutive pod ID. The rest of @pt is initialized accordingly. 
+ */ +static void __init init_pod_type(struct wq_pod_type *pt, + bool (*cpus_share_pod)(int, int)) +{ + int cur, pre, cpu, pod; + + pt->nr_pods = 0; + + /* init @pt->cpu_pod[] according to @cpus_share_pod() */ + pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); + BUG_ON(!pt->cpu_pod); + + for_each_possible_cpu(cur) { + for_each_possible_cpu(pre) { + if (pre >= cur) { + pt->cpu_pod[cur] = pt->nr_pods++; + break; + } + if (cpus_share_pod(cur, pre)) { + pt->cpu_pod[cur] = pt->cpu_pod[pre]; + break; + } + } + } + + /* init the rest to match @pt->cpu_pod[] */ + pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL); + pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL); + BUG_ON(!pt->pod_cpus || !pt->pod_node); + + for (pod = 0; pod < pt->nr_pods; pod++) + BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL)); + + for_each_possible_cpu(cpu) { + cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]); + pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu); + } +} + +static bool __init cpus_dont_share(int cpu0, int cpu1) +{ + return false; +} + +static bool __init cpus_share_smt(int cpu0, int cpu1) +{ +#ifdef CONFIG_SCHED_SMT + return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); +#else + return false; +#endif +} + +static bool __init cpus_share_numa(int cpu0, int cpu1) +{ + return cpu_to_node(cpu0) == cpu_to_node(cpu1); +} + +/** + * workqueue_init_topology - initialize CPU pods for unbound workqueues + * + * This is the third step of three-staged workqueue subsystem initialization and + * invoked after SMP and topology information are fully initialized. It + * initializes the unbound CPU pods accordingly. 
*/ -void __warn_flushing_systemwide_wq(void) { } +void __init workqueue_init_topology(void) +{ + struct workqueue_struct *wq; + int cpu; + + init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); + init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); + init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); + init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); + + mutex_lock(&wq_pool_mutex); + + /* + * Workqueues allocated earlier would have all CPUs sharing the default + * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU + * combinations to apply per-pod sharing. + */ + list_for_each_entry(wq, &workqueues, list) { + for_each_online_cpu(cpu) { + wq_update_pod(wq, cpu, cpu, true); + } + } + + mutex_unlock(&wq_pool_mutex); +} + +void __warn_flushing_systemwide_wq(void) +{ + pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); + dump_stack(); +} EXPORT_SYMBOL(__warn_flushing_systemwide_wq); + +static int __init workqueue_unbound_cpus_setup(char *str) +{ + if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) { + cpumask_clear(&wq_cmdline_cpumask); + pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n"); + } + + return 1; +} +__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 6b1d66e28269..f6275944ada7 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -48,7 +48,7 @@ struct worker { /* A: runs through worker->node */ unsigned long last_active; /* K: last active timestamp */ - unsigned int flags; /* X: flags */ + unsigned int flags; /* L: flags */ int id; /* I: worker id */ /* |