From 3bd916ee0ecbbdd902fc24845f2fef332b2a310c Mon Sep 17 00:00:00 2001 From: Yonghong Song <yhs@fb.com> Date: Fri, 11 Feb 2022 11:49:48 -0800 Subject: bpf: Emit bpf_timer in vmlinux BTF Currently the following code in check_and_init_map_value() *(struct bpf_timer *)(dst + map->timer_off) = (struct bpf_timer){}; can help generate bpf_timer definition in vmlinuxBTF. But the code above may not zero the whole structure due to anonymour members and that code will be replaced by memset in the subsequent patch and bpf_timer definition will disappear from vmlinuxBTF. Let us emit the type explicitly so bpf program can continue to use it from vmlinux.h. Signed-off-by: Yonghong Song <yhs@fb.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Link: https://lore.kernel.org/bpf/20220211194948.3141529-1-yhs@fb.com --- kernel/bpf/helpers.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01cfdf40c838..55c084251fab 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2,6 +2,7 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/bpf-cgroup.h> #include <linux/rcupdate.h> #include <linux/random.h> @@ -1075,6 +1076,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) void *key; u32 idx; + BTF_TYPE_EMIT(struct bpf_timer); callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; -- cgit v1.2.3-70-g09d2 From ddbd89deb7d32b1fbb879f48d68fda1a8ac58e8e Mon Sep 17 00:00:00 2001 From: Halil Pasic <pasic@linux.ibm.com> Date: Fri, 11 Feb 2022 02:12:52 +0100 Subject: swiotlb: fix info leak with DMA_FROM_DEVICE The problem I'm addressing was discovered by the LTP test covering cve-2018-1000204. A short description of what happens follows: 1) The test case issues a command code 00 (TEST UNIT READY) via the SG_IO interface with: dxfer_len == 524288, dxdfer_dir == SG_DXFER_FROM_DEV and a corresponding dxferp. The peculiar thing about this is that TUR is not reading from the device. 2) In sg_start_req() the invocation of blk_rq_map_user() effectively bounces the user-space buffer. As if the device was to transfer into it. Since commit a45b599ad808 ("scsi: sg: allocate with __GFP_ZERO in sg_build_indirect()") we make sure this first bounce buffer is allocated with GFP_ZERO. 3) For the rest of the story we keep ignoring that we have a TUR, so the device won't touch the buffer we prepare as if the we had a DMA_FROM_DEVICE type of situation. My setup uses a virtio-scsi device and the buffer allocated by SG is mapped by the function virtqueue_add_split() which uses DMA_FROM_DEVICE for the "in" sgs (here scatter-gather and not scsi generics). This mapping involves bouncing via the swiotlb (we need swiotlb to do virtio in protected guest like s390 Secure Execution, or AMD SEV). 4) When the SCSI TUR is done, we first copy back the content of the second (that is swiotlb) bounce buffer (which most likely contains some previous IO data), to the first bounce buffer, which contains all zeros. Then we copy back the content of the first bounce buffer to the user-space buffer. 5) The test case detects that the buffer, which it zero-initialized, ain't all zeros and fails. One can argue that this is an swiotlb problem, because without swiotlb we leak all zeros, and the swiotlb should be transparent in a sense that it does not affect the outcome (if all other participants are well behaved). Copying the content of the original buffer into the swiotlb buffer is the only way I can think of to make swiotlb transparent in such scenarios. So let's do just that if in doubt, but allow the driver to tell us that the whole mapped buffer is going to be overwritten, in which case we can preserve the old behavior and avoid the performance impact of the extra bounce. Signed-off-by: Halil Pasic <pasic@linux.ibm.com> Signed-off-by: Christoph Hellwig <hch@lst.de> --- Documentation/core-api/dma-attributes.rst | 8 ++++++++ include/linux/dma-mapping.h | 8 ++++++++ kernel/dma/swiotlb.c | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1887d92e8e92..17706dc91ec9 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,3 +130,11 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). + +DMA_ATTR_OVERWRITE +------------------ + +This is a hint to the DMA-mapping subsystem that the device is expected to +overwrite the entire mapped size, thus the caller does not require any of the +previous buffer contents to be preserved. This allows bounce-buffering +implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index dca2b1355bb1..6150d11a607e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,6 +61,14 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * This is a hint to the DMA-mapping subsystem that the device is expected + * to overwrite the entire mapped size, thus the caller does not require any + * of the previous buffer contents to be preserved. This allows + * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. + */ +#define DMA_ATTR_OVERWRITE (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f1e7ea160b43..bfc56cb21705 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -628,7 +628,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || + dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } -- cgit v1.2.3-70-g09d2 From 05c7b7a92cc87ff8d7fde189d0fade250697573c Mon Sep 17 00:00:00 2001 From: Zhang Qiao <zhangqiao22@huawei.com> Date: Fri, 21 Jan 2022 18:12:10 +0800 Subject: cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As previously discussed(https://lkml.org/lkml/2022/1/20/51), cpuset_attach() is affected with similar cpu hotplug race, as follow scenario: cpuset_attach() cpu hotplug --------------------------- ---------------------- down_write(cpuset_rwsem) guarantee_online_cpus() // (load cpus_attach) sched_cpu_deactivate set_cpu_active() // will change cpu_active_mask set_cpus_allowed_ptr(cpus_attach) __set_cpus_allowed_ptr_locked() // (if the intersection of cpus_attach and cpu_active_mask is empty, will return -EINVAL) up_write(cpuset_rwsem) To avoid races such as described above, protect cpuset_attach() call with cpu_hotplug_lock. Fixes: be367d099270 ("cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time") Cc: stable@vger.kernel.org # v2.6.32+ Reported-by: Zhao Gongyi <zhaogongyi@huawei.com> Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com> Acked-by: Waiman Long <longman@redhat.com> Reviewed-by: Michal Koutný <mkoutny@suse.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4c7254e8f49a..97c53f3cc917 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2289,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2342,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); + cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ -- cgit v1.2.3-70-g09d2 From 45ce4b4f9009102cd9f581196d480a59208690c1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi <memxor@gmail.com> Date: Thu, 17 Feb 2022 01:49:43 +0530 Subject: bpf: Fix crash due to out of bounds access into reg2btf_ids. When commit e6ac2450d6de ("bpf: Support bpf program calling kernel function") added kfunc support, it defined reg2btf_ids as a cheap way to translate the verifier reg type to the appropriate btf_vmlinux BTF ID, however commit c25b2ae13603 ("bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL") moved the __BPF_REG_TYPE_MAX from the last member of bpf_reg_type enum to after the base register types, and defined other variants using type flag composition. However, now, the direct usage of reg->type to index into reg2btf_ids may no longer fall into __BPF_REG_TYPE_MAX range, and hence lead to out of bounds access and kernel crash on dereference of bad pointer. Fixes: c25b2ae13603 ("bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL") Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Link: https://lore.kernel.org/bpf/20220216201943.624869-1-memxor@gmail.com --- kernel/bpf/btf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e16dafeb2450..3e23b3fa79ff 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5688,7 +5688,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, } if (check_ptr_off_reg(env, reg, regno)) return -EINVAL; - } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) { + } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || + (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) { const struct btf_type *reg_ref_t; const struct btf *reg_btf; const char *reg_ref_tname; @@ -5706,7 +5707,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_id = reg->btf_id; } else { reg_btf = btf_vmlinux; - reg_ref_id = *reg2btf_ids[reg->type]; + reg_ref_id = *reg2btf_ids[base_type(reg->type)]; } reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, -- cgit v1.2.3-70-g09d2 From 75134f16e7dd0007aa474b281935c5f42e79f2c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Thu, 17 Feb 2022 10:19:02 -0800 Subject: bpf: Add schedule points in batch ops syzbot reported various soft lockups caused by bpf batch operations. INFO: task kworker/1:1:27 blocked for more than 140 seconds. INFO: task hung in rcu_barrier Nothing prevents batch ops to process huge amount of data, we need to add schedule points in them. Note that maybe_wait_bpf_programs(map) calls from generic_map_delete_batch() can be factorized by moving the call after the loop. This will be done later in -next tree once we get this fix merged, unless there is strong opinion doing this optimization sooner. Fixes: aa2e93b8e58e ("bpf: Add generic support for update and delete batch ops") Fixes: cb4d03ab499d ("bpf: Add generic support for lookup batch op") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Reviewed-by: Stanislav Fomichev <sdf@google.com> Acked-by: Brian Vazquez <brianvv@google.com> Link: https://lore.kernel.org/bpf/20220217181902.808742-1-eric.dumazet@gmail.com --- kernel/bpf/syscall.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fa4505f9b611..ca70fe6fba38 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1355,6 +1355,7 @@ int generic_map_delete_batch(struct bpf_map *map, maybe_wait_bpf_programs(map); if (err) break; + cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; @@ -1412,6 +1413,7 @@ int generic_map_update_batch(struct bpf_map *map, if (err) break; + cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) @@ -1509,6 +1511,7 @@ int generic_map_lookup_batch(struct bpf_map *map, swap(prev_key, key); retry = MAP_LOOKUP_RETRIES; cp++; + cond_resched(); } if (err == -EFAULT) -- cgit v1.2.3-70-g09d2 From 44a3918c8245ab10c6c9719dd12e7a8d291980d8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf <jpoimboe@redhat.com> Date: Fri, 18 Feb 2022 11:49:08 -0800 Subject: x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting With unprivileged eBPF enabled, eIBRS (without retpoline) is vulnerable to Spectre v2 BHB-based attacks. When both are enabled, print a warning message and report it in the 'spectre_v2' sysfs vulnerabilities file. Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> Signed-off-by: Borislav Petkov <bp@suse.de> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/cpu/bugs.c | 35 +++++++++++++++++++++++++++++------ include/linux/bpf.h | 11 +++++++++++ kernel/sysctl.c | 7 +++++++ 3 files changed, 47 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 79c52dd6c597..0a4267c63d3b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,6 +16,7 @@ #include <linux/prctl.h> #include <linux/sched/smt.h> #include <linux/pgtable.h> +#include <linux/bpf.h> #include <asm/spec-ctrl.h> #include <asm/cmdline.h> @@ -650,6 +651,16 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && !new_state) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); +} +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -994,6 +1005,9 @@ static void __init spectre_v2_select_mitigation(void) break; } + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + if (spectre_v2_in_eibrs_mode(mode)) { /* Force it so VMEXIT will restore correctly */ x86_spec_ctrl_base |= SPEC_CTRL_IBRS; @@ -1780,6 +1794,20 @@ static char *ibpb_state(void) return ""; } +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + + return sprintf(buf, "%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + spectre_v2_module_string()); +} + static ssize_t srbds_show_state(char *buf) { return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); @@ -1805,12 +1833,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - spectre_v2_module_string()); + return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fa517ae604ad..1f56806d8eb9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1793,6 +1793,11 @@ struct bpf_core_ctx { int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn); +static inline bool unprivileged_ebpf_enabled(void) +{ + return !sysctl_unprivileged_bpf_disabled; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2012,6 +2017,12 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, { return NULL; } + +static inline bool unprivileged_ebpf_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ae443b2882e..730ab56d9e92 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -180,6 +180,10 @@ static int bpf_stats_handler(struct ctl_table *table, int write, return ret; } +void __weak unpriv_ebpf_notify(int new_state) +{ +} + static int bpf_unpriv_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -197,6 +201,9 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write, return -EPERM; *(int *)table->data = unpriv_enable; } + + unpriv_ebpf_notify(unpriv_enable); + return ret; } #endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ -- cgit v1.2.3-70-g09d2 From 6d3971dab239e7db1691690a02ce6becf30689cb Mon Sep 17 00:00:00 2001 From: Christian Brauner <brauner@kernel.org> Date: Mon, 21 Feb 2022 16:16:39 +0100 Subject: cgroup: clarify cgroup_css_set_fork() With recent fixes for the permission checking when moving a task into a cgroup using a file descriptor to a cgroup's cgroup.procs file and calling write() it seems a good idea to clarify CLONE_INTO_CGROUP permission checking with a comment. Cc: Tejun Heo <tj@kernel.org> Cc: <cgroups@vger.kernel.org> Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup/cgroup.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b31e1465868a..77702e089d6a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6161,6 +6161,20 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) if (ret) goto err; + /* + * Spawning a task directly into a cgroup works by passing a file + * descriptor to the target cgroup directory. This can even be an O_PATH + * file descriptor. But it can never be a cgroup.procs file descriptor. + * This was done on purpose so spawning into a cgroup could be + * conceptualized as an atomic + * + * fd = openat(dfd_cgroup, "cgroup.procs", ...); + * write(fd, <child-pid>, ...); + * + * sequence, i.e. it's a shorthand for the caller opening and writing + * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us + * to always use the caller's credentials. + */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, !(kargs->flags & CLONE_THREAD), current->nsproxy->cgroup_ns); -- cgit v1.2.3-70-g09d2 From 467a726b754f474936980da793b4ff2ec3e382a7 Mon Sep 17 00:00:00 2001 From: Michal Koutný <mkoutny@suse.com> Date: Thu, 17 Feb 2022 17:11:28 +0100 Subject: cgroup-v1: Correct privileges check in release_agent writes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idea is to check: a) the owning user_ns of cgroup_ns, b) capabilities in init_user_ns. The commit 24f600856418 ("cgroup-v1: Require capabilities to set release_agent") got this wrong in the write handler of release_agent since it checked user_ns of the opener (may be different from the owning user_ns of cgroup_ns). Secondly, to avoid possibly confused deputy, the capability of the opener must be checked. Fixes: 24f600856418 ("cgroup-v1: Require capabilities to set release_agent") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/stable/20220216121142.GB30035@blackbody.suse.cz/ Signed-off-by: Michal Koutný <mkoutny@suse.com> Reviewed-by: Masami Ichikawa(CIP) <masami.ichikawa@cybertrust.co.jp> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup/cgroup-v1.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 0e877dbcfeea..afc6c0e9c966 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -546,6 +546,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; + struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); @@ -553,8 +554,9 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, * Release agent gets called with all capabilities, * require capabilities to set release agent. */ - if ((of->file->f_cred->user_ns != &init_user_ns) || - !capable(CAP_SYS_ADMIN)) + ctx = of->priv; + if ((ctx->ns->user_ns != &init_user_ns) || + !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM; cgrp = cgroup_kn_lock_live(of->kn, false); -- cgit v1.2.3-70-g09d2 From c70cd039f1d779126347a896a58876782dcc5284 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Date: Wed, 16 Feb 2022 11:17:53 +0800 Subject: cpuset: Fix kernel-doc Fix the following W=1 kernel warnings: kernel/cgroup/cpuset.c:3718: warning: expecting prototype for cpuset_memory_pressure_bump(). Prototype was for __cpuset_memory_pressure_bump() instead. kernel/cgroup/cpuset.c:3568: warning: expecting prototype for cpuset_node_allowed(). Prototype was for __cpuset_node_allowed() instead. Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup/cpuset.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 97c53f3cc917..5de18448016c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3524,8 +3524,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) return cs; } -/** - * cpuset_node_allowed - Can we allocate on a memory node? +/* + * __cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -3696,8 +3696,8 @@ void cpuset_print_current_mems_allowed(void) int cpuset_memory_pressure_enabled __read_mostly; -/** - * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. +/* + * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. * * Keep a running average of the rate of synchronous (direct) * page reclaim efforts initiated by tasks in each cpuset. @@ -3712,7 +3712,7 @@ int cpuset_memory_pressure_enabled __read_mostly; * "memory_pressure". Value displayed is an integer * representing the recent rate of entry into the synchronous * (direct) page reclaim by any task attached to the cpuset. - **/ + */ void __cpuset_memory_pressure_bump(void) { -- cgit v1.2.3-70-g09d2 From ce33c845b030c9cf768370c951bc699470b09fa7 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira <bristot@kernel.org> Date: Sun, 20 Feb 2022 23:49:57 +0100 Subject: tracing: Dump stacktrace trigger to the corresponding instance The stacktrace event trigger is not dumping the stacktrace to the instance where it was enabled, but to the global "instance." Use the private_data, pointing to the trigger file, to figure out the corresponding trace instance, and use it in the trigger action, like snapshot_trigger does. Link: https://lkml.kernel.org/r/afbb0b4f18ba92c276865bc97204d438473f4ebc.1645396236.git.bristot@kernel.org Cc: stable@vger.kernel.org Fixes: ae63b31e4d0e2 ("tracing: Separate out trace events from global variables") Reviewed-by: Tom Zanussi <zanussi@kernel.org> Tested-by: Tom Zanussi <zanussi@kernel.org> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace_events_trigger.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d00fee705f9c..e0d50c9577f3 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1540,7 +1540,12 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - trace_dump_stack(STACK_SKIP); + struct trace_event_file *file = data->private_data; + + if (file) + __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + else + trace_dump_stack(STACK_SKIP); } static void -- cgit v1.2.3-70-g09d2 From 0ac983f512033cb7b5e210c9589768ad25b1e36b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" <ebiederm@xmission.com> Date: Thu, 24 Feb 2022 08:32:28 -0600 Subject: ucounts: Fix systemd LimitNPROC with private users regression Long story short recursively enforcing RLIMIT_NPROC when it is not enforced on the process that creates a new user namespace, causes currently working code to fail. There is no reason to enforce RLIMIT_NPROC recursively when we don't enforce it normally so update the code to detect this case. I would like to simply use capable(CAP_SYS_RESOURCE) to detect when RLIMIT_NPROC is not enforced upon the caller. Unfortunately because RLIMIT_NPROC is charged and checked for enforcement based upon the real uid, using capable() which is euid based is inconsistent with reality. Come as close as possible to testing for capable(CAP_SYS_RESOURCE) by testing for when the real uid would match the conditions when CAP_SYS_RESOURCE would be present if the real uid was the effective uid. Reported-by: Etienne Dechamps <etienne@edechamps.fr> Link: https://bugzilla.kernel.org/show_bug.cgi?id=215596 Link: https://lkml.kernel.org/r/e9589141-cfeb-90cd-2d0e-83a62787239a@edechamps.fr Link: https://lkml.kernel.org/r/87sfs8jmpz.fsf_-_@email.froward.int.ebiederm.org Cc: stable@vger.kernel.org Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Reviewed-by: Kees Cook <keescook@chromium.org> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> --- kernel/user_namespace.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 6b2e3ca7ee99..5481ba44a8d6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->user_ns = user_ns; } +static unsigned long enforced_nproc_rlimit(void) +{ + unsigned long limit = RLIM_INFINITY; + + /* Is RLIMIT_NPROC currently enforced? */ + if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || + (current_user_ns() != &init_user_ns)) + limit = rlimit(RLIMIT_NPROC); + + return limit; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -122,7 +134,7 @@ int create_user_ns(struct cred *new) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); -- cgit v1.2.3-70-g09d2 From 302e9edd54985f584cfc180098f3554774126969 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" <rostedt@goodmis.org> Date: Wed, 23 Feb 2022 22:38:37 -0500 Subject: tracing: Have traceon and traceoff trigger honor the instance If a trigger is set on an event to disable or enable tracing within an instance, then tracing should be disabled or enabled in the instance and not at the top level, which is confusing to users. Link: https://lkml.kernel.org/r/20220223223837.14f94ec3@rorschach.local.home Cc: stable@vger.kernel.org Fixes: ae63b31e4d0e2 ("tracing: Separate out trace events from global variables") Tested-by: Daniel Bristot de Oliveira <bristot@kernel.org> Reviewed-by: Tom Zanussi <zanussi@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace_events_trigger.c | 52 ++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index e0d50c9577f3..efe563140f27 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1295,6 +1295,16 @@ traceon_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { + struct trace_event_file *file = data->private_data; + + if (file) { + if (tracer_tracing_is_on(file->tr)) + return; + + tracer_tracing_on(file->tr); + return; + } + if (tracing_is_on()) return; @@ -1306,8 +1316,15 @@ traceon_count_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - if (tracing_is_on()) - return; + struct trace_event_file *file = data->private_data; + + if (file) { + if (tracer_tracing_is_on(file->tr)) + return; + } else { + if (tracing_is_on()) + return; + } if (!data->count) return; @@ -1315,7 +1332,10 @@ traceon_count_trigger(struct event_trigger_data *data, if (data->count != -1) (data->count)--; - tracing_on(); + if (file) + tracer_tracing_on(file->tr); + else + tracing_on(); } static void @@ -1323,6 +1343,16 @@ traceoff_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { + struct trace_event_file *file = data->private_data; + + if (file) { + if (!tracer_tracing_is_on(file->tr)) + return; + + tracer_tracing_off(file->tr); + return; + } + if (!tracing_is_on()) return; @@ -1334,8 +1364,15 @@ traceoff_count_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - if (!tracing_is_on()) - return; + struct trace_event_file *file = data->private_data; + + if (file) { + if (!tracer_tracing_is_on(file->tr)) + return; + } else { + if (!tracing_is_on()) + return; + } if (!data->count) return; @@ -1343,7 +1380,10 @@ traceoff_count_trigger(struct event_trigger_data *data, if (data->count != -1) (data->count)--; - tracing_off(); + if (file) + tracer_tracing_off(file->tr); + else + tracing_off(); } static int -- cgit v1.2.3-70-g09d2 From b61edd57740de5895f44f2ea417b164d9e1708bb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" <rostedt@goodmis.org> Date: Fri, 18 Feb 2022 19:00:57 -0500 Subject: eprobes: Remove redundant event type information Currently, the event probes save the type of the event they are attached to when recording the event. For example: # echo 'e:switch sched/sched_switch prev_state=$prev_state prev_prio=$prev_prio next_pid=$next_pid next_prio=$next_prio' > dynamic_events # cat events/eprobes/switch/format name: switch ID: 1717 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:unsigned int __probe_type; offset:8; size:4; signed:0; field:u64 prev_state; offset:12; size:8; signed:0; field:u64 prev_prio; offset:20; size:8; signed:0; field:u64 next_pid; offset:28; size:8; signed:0; field:u64 next_prio; offset:36; size:8; signed:0; print fmt: "(%u) prev_state=0x%Lx prev_prio=0x%Lx next_pid=0x%Lx next_prio=0x%Lx", REC->__probe_type, REC->prev_state, REC->prev_prio, REC->next_pid, REC->next_prio The __probe_type adds 4 bytes to every event. One of the reasons for creating eprobes is to limit what is traced in an event to be able to limit what is written into the ring buffer. Having this redundant 4 bytes to every event takes away from this. The event that is recorded can be retrieved from the event probe itself, that is available when the trace is happening. For user space tools, it could simply read the dynamic_event file to find the event they are for. So there is really no reason to write this information into the ring buffer for every event. Link: https://lkml.kernel.org/r/20220218190057.2f5a19a8@gandalf.local.home Acked-by: Masami Hiramatsu <mhiramat@kernel.org> Reviewed-by: Joel Fernandes <joel@joelfernandes.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace.h | 1 - kernel/trace/trace_eprobe.c | 16 +++++++--------- kernel/trace/trace_probe.c | 10 +++++----- kernel/trace/trace_probe.h | 1 - 4 files changed, 12 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d038ddbf1bea..c5b09c31e077 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -136,7 +136,6 @@ struct kprobe_trace_entry_head { struct eprobe_trace_entry_head { struct trace_entry ent; - unsigned int type; }; struct kretprobe_trace_entry_head { diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 191db32dec46..541aa13581b9 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -242,7 +242,6 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) static int eprobe_event_define_fields(struct trace_event_call *event_call) { - int ret; struct eprobe_trace_entry_head field; struct trace_probe *tp; @@ -250,8 +249,6 @@ static int eprobe_event_define_fields(struct trace_event_call *event_call) if (WARN_ON_ONCE(!tp)) return -ENOENT; - DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0); - return traceprobe_define_arg_fields(event_call, sizeof(field), tp); } @@ -270,7 +267,9 @@ print_eprobe_event(struct trace_iterator *iter, int flags, struct trace_event_call *pevent; struct trace_event *probed_event; struct trace_seq *s = &iter->seq; + struct trace_eprobe *ep; struct trace_probe *tp; + unsigned int type; field = (struct eprobe_trace_entry_head *)iter->ent; tp = trace_probe_primary_from_call( @@ -278,15 +277,18 @@ print_eprobe_event(struct trace_iterator *iter, int flags, if (WARN_ON_ONCE(!tp)) goto out; + ep = container_of(tp, struct trace_eprobe, tp); + type = ep->event->event.type; + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - probed_event = ftrace_find_event(field->type); + probed_event = ftrace_find_event(type); if (probed_event) { pevent = container_of(probed_event, struct trace_event_call, event); trace_seq_printf(s, "%s.%s", pevent->class->system, trace_event_name(pevent)); } else { - trace_seq_printf(s, "%u", field->type); + trace_seq_printf(s, "%u", type); } trace_seq_putc(s, ')'); @@ -498,10 +500,6 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) return; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); - if (edata->ep->event) - entry->type = edata->ep->event->event.type; - else - entry->type = 0; store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 73d90179b51b..80863c6508e5 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -871,15 +871,15 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, switch (ptype) { case PROBE_PRINT_NORMAL: fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; + arg = ", REC->" FIELD_STRING_IP; break; case PROBE_PRINT_RETURN: fmt = "(%lx <- %lx)"; - arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + arg = ", REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; break; case PROBE_PRINT_EVENT: - fmt = "(%u)"; - arg = "REC->" FIELD_STRING_TYPE; + fmt = ""; + arg = ""; break; default: WARN_ON_ONCE(1); @@ -903,7 +903,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, parg->type->fmt); } - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", arg); for (i = 0; i < tp->nr_args; i++) { parg = tp->args + i; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 99e7a5df025e..92cc149af0fd 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,7 +38,6 @@ #define FIELD_STRING_IP "__probe_ip" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" -#define FIELD_STRING_TYPE "__probe_type" #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ -- cgit v1.2.3-70-g09d2 From bc82c38a6933aab308387d4aca47e0a05de7b553 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Fri, 11 Feb 2022 08:10:18 +0100 Subject: tracing: Uninline trace_trigger_soft_disabled() partly On a powerpc32 build with CONFIG_CC_OPTIMISE_FOR_SIZE, the inline keyword is not honored and trace_trigger_soft_disabled() appears approx 50 times in vmlinux. Adding -Winline to the build, the following message appears: ./include/linux/trace_events.h:712:1: error: inlining failed in call to 'trace_trigger_soft_disabled': call is unlikely and code size would grow [-Werror=inline] That function is rather big for an inlined function: c003df60 <trace_trigger_soft_disabled>: c003df60: 94 21 ff f0 stwu r1,-16(r1) c003df64: 7c 08 02 a6 mflr r0 c003df68: 90 01 00 14 stw r0,20(r1) c003df6c: bf c1 00 08 stmw r30,8(r1) c003df70: 83 e3 00 24 lwz r31,36(r3) c003df74: 73 e9 01 00 andi. r9,r31,256 c003df78: 41 82 00 10 beq c003df88 <trace_trigger_soft_disabled+0x28> c003df7c: 38 60 00 00 li r3,0 c003df80: 39 61 00 10 addi r11,r1,16 c003df84: 4b fd 60 ac b c0014030 <_rest32gpr_30_x> c003df88: 73 e9 00 80 andi. r9,r31,128 c003df8c: 7c 7e 1b 78 mr r30,r3 c003df90: 41 a2 00 14 beq c003dfa4 <trace_trigger_soft_disabled+0x44> c003df94: 38 c0 00 00 li r6,0 c003df98: 38 a0 00 00 li r5,0 c003df9c: 38 80 00 00 li r4,0 c003dfa0: 48 05 c5 f1 bl c009a590 <event_triggers_call> c003dfa4: 73 e9 00 40 andi. r9,r31,64 c003dfa8: 40 82 00 28 bne c003dfd0 <trace_trigger_soft_disabled+0x70> c003dfac: 73 ff 02 00 andi. r31,r31,512 c003dfb0: 41 82 ff cc beq c003df7c <trace_trigger_soft_disabled+0x1c> c003dfb4: 80 01 00 14 lwz r0,20(r1) c003dfb8: 83 e1 00 0c lwz r31,12(r1) c003dfbc: 7f c3 f3 78 mr r3,r30 c003dfc0: 83 c1 00 08 lwz r30,8(r1) c003dfc4: 7c 08 03 a6 mtlr r0 c003dfc8: 38 21 00 10 addi r1,r1,16 c003dfcc: 48 05 6f 6c b c0094f38 <trace_event_ignore_this_pid> c003dfd0: 38 60 00 01 li r3,1 c003dfd4: 4b ff ff ac b c003df80 <trace_trigger_soft_disabled+0x20> However it is located in a hot path so inlining it is important. But forcing inlining of the entire function by using __always_inline leads to increasing the text size by approx 20 kbytes. Instead, split the fonction in two parts, one part with the likely fast path, flagged __always_inline, and a second part out of line. With this change, on a powerpc32 with CONFIG_CC_OPTIMISE_FOR_SIZE vmlinux text increases by only 1,4 kbytes, which is partly compensated by a decrease of vmlinux data by 7 kbytes. On ppc64_defconfig which has CONFIG_CC_OPTIMISE_FOR_SPEED, this change reduces vmlinux text by more than 30 kbytes. Link: https://lkml.kernel.org/r/69ce0986a52d026d381d612801d978aa4f977460.1644563295.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- include/linux/trace_events.h | 22 ++++++++++++---------- kernel/trace/trace_events_trigger.c | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 70c069aef02c..dcea51fb60e2 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -699,6 +699,8 @@ event_triggers_post_call(struct trace_event_file *file, bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); +bool __trace_trigger_soft_disabled(struct trace_event_file *file); + /** * trace_trigger_soft_disabled - do triggers and test if soft disabled * @file: The file pointer of the event to test @@ -708,20 +710,20 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); * triggers that require testing the fields, it will return true, * otherwise false. */ -static inline bool +static __always_inline bool trace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; - if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { - if (eflags & EVENT_FILE_FL_TRIGGER_MODE) - event_triggers_call(file, NULL, NULL, NULL); - if (eflags & EVENT_FILE_FL_SOFT_DISABLED) - return true; - if (eflags & EVENT_FILE_FL_PID_FILTER) - return trace_event_ignore_this_pid(file); - } - return false; + if (likely(!(eflags & (EVENT_FILE_FL_TRIGGER_MODE | + EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (likely(eflags & EVENT_FILE_FL_TRIGGER_COND)) + return false; + + return __trace_trigger_soft_disabled(file); } #ifdef CONFIG_BPF_EVENTS diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index efe563140f27..7eb9d04f1c2e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -84,6 +84,20 @@ event_triggers_call(struct trace_event_file *file, } EXPORT_SYMBOL_GPL(event_triggers_call); +bool __trace_trigger_soft_disabled(struct trace_event_file *file) +{ + unsigned long eflags = file->flags; + + if (eflags & EVENT_FILE_FL_TRIGGER_MODE) + event_triggers_call(file, NULL, NULL, NULL); + if (eflags & EVENT_FILE_FL_SOFT_DISABLED) + return true; + if (eflags & EVENT_FILE_FL_PID_FILTER) + return trace_event_ignore_this_pid(file); + return false; +} +EXPORT_SYMBOL_GPL(__trace_trigger_soft_disabled); + /** * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event -- cgit v1.2.3-70-g09d2 From 7acf3a127bb7c65ff39099afd78960e77b2ca5de Mon Sep 17 00:00:00 2001 From: Sven Schnelle <svens@linux.ibm.com> Date: Mon, 14 Feb 2022 14:44:56 +0100 Subject: tracing: Ensure trace buffer is at least 4096 bytes large Booting the kernel with 'trace_buf_size=1' give a warning at boot during the ftrace selftests: [ 0.892809] Running postponed tracer tests: [ 0.892893] Testing tracer function: [ 0.901899] Callback from call_rcu_tasks_trace() invoked. [ 0.983829] Callback from call_rcu_tasks_rude() invoked. [ 1.072003] .. bad ring buffer .. corrupted trace buffer .. [ 1.091944] Callback from call_rcu_tasks() invoked. [ 1.097695] PASSED [ 1.097701] Testing dynamic ftrace: .. filter failed count=0 ..FAILED! [ 1.353474] ------------[ cut here ]------------ [ 1.353478] WARNING: CPU: 0 PID: 1 at kernel/trace/trace.c:1951 run_tracer_selftest+0x13c/0x1b0 Therefore enforce a minimum of 4096 bytes to make the selftest pass. Link: https://lkml.kernel.org/r/20220214134456.1751749-1-svens@linux.ibm.com Signed-off-by: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7c2578efde26..3050892d1812 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1474,10 +1474,12 @@ static int __init set_buf_size(char *str) if (!str) return 0; buf_size = memparse(str, &str); - /* nr_entries can not be zero */ - if (buf_size == 0) - return 0; - trace_buf_size = buf_size; + /* + * nr_entries can not be zero and the startup + * tests require some buffer space. Therefore + * ensure we have at least 4096 bytes of buffer. + */ + trace_buf_size = max(4096UL, buf_size); return 1; } __setup("trace_buf_size=", set_buf_size); -- cgit v1.2.3-70-g09d2 From ab2f993c01f261aa3eeb8842842ff38bff7806b6 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor <nathan@kernel.org> Date: Mon, 14 Feb 2022 12:28:47 -0700 Subject: ftrace: Remove unused ftrace_startup_enable() stub When building with clang + CONFIG_DYNAMIC_FTRACE=n + W=1, there is a warning: kernel/trace/ftrace.c:7194:20: error: unused function 'ftrace_startup_enable' [-Werror,-Wunused-function] static inline void ftrace_startup_enable(int command) { } ^ 1 error generated. Clang warns on instances of static inline functions in .c files with W=1 after commit 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build"). The ftrace_startup_enable() stub has been unused since commit e1effa0144a1 ("ftrace: Annotate the ops operation on update"), where its use outside of the CONFIG_DYNAMIC_TRACE section was replaced by ftrace_startup_all(). Remove it to resolve the warning. Link: https://lkml.kernel.org/r/20220214192847.488166-1-nathan@kernel.org Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Nathan Chancellor <nathan@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9feb197b2da..a4b462b6f944 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7191,7 +7191,6 @@ static int __init ftrace_nodyn_init(void) core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } -static inline void ftrace_startup_enable(int command) { } static inline void ftrace_startup_all(int command) { } # define ftrace_startup_sysctl() do { } while (0) -- cgit v1.2.3-70-g09d2 From dd990352f01ee9a6c6eee152e5d11c021caccfe4 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira <bristot@kernel.org> Date: Fri, 18 Feb 2022 16:17:38 +0100 Subject: tracing/osnoise: Make osnoise_main to sleep for microseconds osnoise's runtime and period are in the microseconds scale, but it is currently sleeping in the millisecond's scale. This behavior roots in the usage of hwlat as the skeleton for osnoise. Make osnoise to sleep in the microseconds scale. Also, move the sleep to a specialized function. Link: https://lkml.kernel.org/r/302aa6c7bdf2d131719b22901905e9da122a11b2.1645197336.git.bristot@kernel.org Cc: Ingo Molnar <mingo@redhat.com> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace_osnoise.c | 53 ++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 870a08da5b48..cfddb30e65ab 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1436,6 +1436,37 @@ out: static struct cpumask osnoise_cpumask; static struct cpumask save_cpumask; +/* + * osnoise_sleep - sleep until the next period + */ +static void osnoise_sleep(void) +{ + u64 interval; + ktime_t wake_time; + + mutex_lock(&interface_lock); + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + mutex_unlock(&interface_lock); + + /* + * differently from hwlat_detector, the osnoise tracer can run + * without a pause because preemption is on. + */ + if (!interval) { + /* Let synchronize_rcu_tasks() make progress */ + cond_resched_tasks_rcu_qs(); + return; + } + + wake_time = ktime_add_us(ktime_get(), interval); + __set_current_state(TASK_INTERRUPTIBLE); + + while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) { + if (kthread_should_stop()) + break; + } +} + /* * osnoise_main - The osnoise detection kernel thread * @@ -1444,30 +1475,10 @@ static struct cpumask save_cpumask; */ static int osnoise_main(void *data) { - u64 interval; while (!kthread_should_stop()) { - run_osnoise(); - - mutex_lock(&interface_lock); - interval = osnoise_data.sample_period - osnoise_data.sample_runtime; - mutex_unlock(&interface_lock); - - do_div(interval, USEC_PER_MSEC); - - /* - * differently from hwlat_detector, the osnoise tracer can run - * without a pause because preemption is on. - */ - if (interval < 1) { - /* Let synchronize_rcu_tasks() make progress */ - cond_resched_tasks_rcu_qs(); - continue; - } - - if (msleep_interruptible(interval)) - break; + osnoise_sleep(); } return 0; -- cgit v1.2.3-70-g09d2 From c5229a0bd47814770c895e94fbc97ad21819abfe Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Mon, 20 Dec 2021 16:38:06 +0000 Subject: tracing: Fix selftest config check for function graph start up test CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is required to test direct tramp. Link: https://lkml.kernel.org/r/bdc7e594e13b0891c1d61bc8d56c94b1890eaed7.1640017960.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace_selftest.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index afd937a46496..abcadbe933bb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -784,9 +784,7 @@ static struct fgraph_ops fgraph_ops __initdata = { .retfunc = &trace_graph_return, }; -#if defined(CONFIG_DYNAMIC_FTRACE) && \ - defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) -#define TEST_DIRECT_TRAMP +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS noinline __noclone static void trace_direct_tramp(void) { } #endif @@ -849,7 +847,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, goto out; } -#ifdef TEST_DIRECT_TRAMP +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); -- cgit v1.2.3-70-g09d2 From 30939293262eb433c960c4532a0d59c4073b2b84 Mon Sep 17 00:00:00 2001 From: Yu Kuai <yukuai3@huawei.com> Date: Mon, 28 Feb 2022 11:43:54 +0800 Subject: blktrace: fix use after free for struct blk_trace When tracing the whole disk, 'dropped' and 'msg' will be created under 'q->debugfs_dir' and 'bt->dir' is NULL, thus blk_trace_free() won't remove those files. What's worse, the following UAF can be triggered because of accessing stale 'dropped' and 'msg': ================================================================== BUG: KASAN: use-after-free in blk_dropped_read+0x89/0x100 Read of size 4 at addr ffff88816912f3d8 by task blktrace/1188 CPU: 27 PID: 1188 Comm: blktrace Not tainted 5.17.0-rc4-next-20220217+ #469 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-4 Call Trace: <TASK> dump_stack_lvl+0x34/0x44 print_address_description.constprop.0.cold+0xab/0x381 ? blk_dropped_read+0x89/0x100 ? blk_dropped_read+0x89/0x100 kasan_report.cold+0x83/0xdf ? blk_dropped_read+0x89/0x100 kasan_check_range+0x140/0x1b0 blk_dropped_read+0x89/0x100 ? blk_create_buf_file_callback+0x20/0x20 ? kmem_cache_free+0xa1/0x500 ? do_sys_openat2+0x258/0x460 full_proxy_read+0x8f/0xc0 vfs_read+0xc6/0x260 ksys_read+0xb9/0x150 ? vfs_write+0x3d0/0x3d0 ? fpregs_assert_state_consistent+0x55/0x60 ? exit_to_user_mode_prepare+0x39/0x1e0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fbc080d92fd Code: ce 20 00 00 75 10 b8 00 00 00 00 0f 05 48 3d 01 f0 ff ff 73 31 c3 48 83 1 RSP: 002b:00007fbb95ff9cb0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 00007fbb95ff9dc0 RCX: 00007fbc080d92fd RDX: 0000000000000100 RSI: 00007fbb95ff9cc0 RDI: 0000000000000045 RBP: 0000000000000045 R08: 0000000000406299 R09: 00000000fffffffd R10: 000000000153afa0 R11: 0000000000000293 R12: 00007fbb780008c0 R13: 00007fbb78000938 R14: 0000000000608b30 R15: 00007fbb780029c8 </TASK> Allocated by task 1050: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 do_blk_trace_setup+0xcb/0x410 __blk_trace_setup+0xac/0x130 blk_trace_ioctl+0xe9/0x1c0 blkdev_ioctl+0xf1/0x390 __x64_sys_ioctl+0xa5/0xe0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 1050: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0x103/0x180 kfree+0x9a/0x4c0 __blk_trace_remove+0x53/0x70 blk_trace_ioctl+0x199/0x1c0 blkdev_common_ioctl+0x5e9/0xb30 blkdev_ioctl+0x1a5/0x390 __x64_sys_ioctl+0xa5/0xe0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88816912f380 which belongs to the cache kmalloc-96 of size 96 The buggy address is located 88 bytes inside of 96-byte region [ffff88816912f380, ffff88816912f3e0) The buggy address belongs to the page: page:000000009a1b4e7c refcount:1 mapcount:0 mapping:0000000000000000 index:0x0f flags: 0x17ffffc0000200(slab|node=0|zone=2|lastcpupid=0x1fffff) raw: 0017ffffc0000200 ffffea00044f1100 dead000000000002 ffff88810004c780 raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88816912f280: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff88816912f300: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc >ffff88816912f380: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ^ ffff88816912f400: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff88816912f480: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ================================================================== Fixes: c0ea57608b69 ("blktrace: remove debugfs file dentries from struct blk_trace") Signed-off-by: Yu Kuai <yukuai3@huawei.com> Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Link: https://lore.kernel.org/r/20220228034354.4047385-1-yukuai3@huawei.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- kernel/trace/blktrace.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index af68a67179b4..21dea90eaa93 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -310,10 +310,20 @@ record_it: local_irq_restore(flags); } -static void blk_trace_free(struct blk_trace *bt) +static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) { relay_close(bt->rchan); - debugfs_remove(bt->dir); + + /* + * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created + * under 'q->debugfs_dir', thus lookup and remove them. + */ + if (!bt->dir) { + debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir)); + debugfs_remove(debugfs_lookup("msg", q->debugfs_dir)); + } else { + debugfs_remove(bt->dir); + } free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -335,10 +345,10 @@ static void put_probe_ref(void) mutex_unlock(&blk_probe_mutex); } -static void blk_trace_cleanup(struct blk_trace *bt) +static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); put_probe_ref(); } @@ -352,7 +362,7 @@ static int __blk_trace_remove(struct request_queue *q) return -EINVAL; if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(bt); + blk_trace_cleanup(q, bt); return 0; } @@ -572,7 +582,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = 0; err: if (ret) - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } @@ -1616,7 +1626,7 @@ static int blk_trace_remove_queue(struct request_queue *q) put_probe_ref(); synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); return 0; } @@ -1647,7 +1657,7 @@ static int blk_trace_setup_queue(struct request_queue *q, return 0; free_bt: - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } -- cgit v1.2.3-70-g09d2 From 1d1898f65616c4601208963c3376c1d828cbf2c7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" <rostedt@goodmis.org> Date: Tue, 1 Mar 2022 22:29:04 -0500 Subject: tracing/histogram: Fix sorting on old "cpu" value When trying to add a histogram against an event with the "cpu" field, it was impossible due to "cpu" being a keyword to key off of the running CPU. So to fix this, it was changed to "common_cpu" to match the other generic fields (like "common_pid"). But since some scripts used "cpu" for keying off of the CPU (for events that did not have "cpu" as a field, which is most of them), a backward compatibility trick was added such that if "cpu" was used as a key, and the event did not have "cpu" as a field name, then it would fallback and switch over to "common_cpu". This fix has a couple of subtle bugs. One was that when switching over to "common_cpu", it did not change the field name, it just set a flag. But the code still found a "cpu" field. The "cpu" field is used for filtering and is returned when the event does not have a "cpu" field. This was found by: # cd /sys/kernel/tracing # echo hist:key=cpu,pid:sort=cpu > events/sched/sched_wakeup/trigger # cat events/sched/sched_wakeup/hist Which showed the histogram unsorted: { cpu: 19, pid: 1175 } hitcount: 1 { cpu: 6, pid: 239 } hitcount: 2 { cpu: 23, pid: 1186 } hitcount: 14 { cpu: 12, pid: 249 } hitcount: 2 { cpu: 3, pid: 994 } hitcount: 5 Instead of hard coding the "cpu" checks, take advantage of the fact that trace_event_field_field() returns a special field for "cpu" and "CPU" if the event does not have "cpu" as a field. This special field has the "filter_type" of "FILTER_CPU". Check that to test if the returned field is of the CPU type instead of doing the string compare. Also, fix the sorting bug by testing for the hist_field flag of HIST_FIELD_FL_CPU when setting up the sort routine. Otherwise it will use the special CPU field to know what compare routine to use, and since that special field does not have a size, it returns tracing_map_cmp_none. Cc: stable@vger.kernel.org Fixes: 1e3bac71c505 ("tracing/histogram: Rename "cpu" to "common_cpu"") Reported-by: Daniel Bristot de Oliveira <bristot@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace_events_hist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ada87bfb5bb8..dc7f733b4cb3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2289,9 +2289,9 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, /* * For backward compatibility, if field_name * was "cpu", then we treat this the same as - * common_cpu. + * common_cpu. This also works for "CPU". */ - if (strcmp(field_name, "cpu") == 0) { + if (field && field->filter_type == FILTER_CPU) { *flags |= HIST_FIELD_FL_CPU; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, @@ -4832,7 +4832,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) cmp_fn = tracing_map_cmp_none; - else if (!field) + else if (!field || hist_field->flags & HIST_FIELD_FL_CPU) cmp_fn = tracing_map_cmp_num(hist_field->size, hist_field->is_signed); else if (is_string_field(field)) -- cgit v1.2.3-70-g09d2 From 1d02b444b8d1345ea4708db3bab4db89a7784b55 Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Wed, 2 Mar 2022 19:17:44 -0800 Subject: tracing: Fix return value of __setup handlers __setup() handlers should generally return 1 to indicate that the boot options have been handled. Using invalid option values causes the entire kernel boot option string to be reported as Unknown and added to init's environment strings, polluting it. Unknown kernel command line parameters "BOOT_IMAGE=/boot/bzImage-517rc6 kprobe_event=p,syscall_any,$arg1 trace_options=quiet trace_clock=jiffies", will be passed to user space. Run /sbin/init as init process with arguments: /sbin/init with environment: HOME=/ TERM=linux BOOT_IMAGE=/boot/bzImage-517rc6 kprobe_event=p,syscall_any,$arg1 trace_options=quiet trace_clock=jiffies Return 1 from the __setup() handlers so that init's environment is not polluted with kernel boot options. Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Link: https://lkml.kernel.org/r/20220303031744.32356-1-rdunlap@infradead.org Cc: stable@vger.kernel.org Fixes: 7bcfaf54f591 ("tracing: Add trace_options kernel command line parameter") Fixes: e1e232ca6b8f ("tracing: Add trace_clock=<clock> kernel parameter") Fixes: 970988e19eb0 ("tracing/kprobe: Add kprobe_event= boot parameter") Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Reported-by: Igor Zhbanov <i.zhbanov@omprussia.ru> Acked-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3050892d1812..eb44418574f9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -235,7 +235,7 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); - return 0; + return 1; } __setup("trace_options=", set_trace_boot_options); @@ -246,7 +246,7 @@ static int __init set_trace_boot_clock(char *str) { strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; - return 0; + return 1; } __setup("trace_clock=", set_trace_boot_clock); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 508f14af4f2c..b62fd785b599 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -32,7 +32,7 @@ static int __init set_kprobe_boot_events(char *str) strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); disable_tracing_selftest("running kprobe events"); - return 0; + return 1; } __setup("kprobe_event=", set_kprobe_boot_events); -- cgit v1.2.3-70-g09d2 From 5c26f6ac9416b63d093e29c30e79b3297e425472 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan <surenb@google.com> Date: Fri, 4 Mar 2022 20:28:51 -0800 Subject: mm: refactor vm_area_struct::anon_vma_name usage code Avoid mixing strings and their anon_vma_name referenced pointers by using struct anon_vma_name whenever possible. This simplifies the code and allows easier sharing of anon_vma_name structures when they represent the same name. [surenb@google.com: fix comment] Link: https://lkml.kernel.org/r/20220223153613.835563-1-surenb@google.com Link: https://lkml.kernel.org/r/20220224231834.1481408-1-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Suggested-by: Matthew Wilcox <willy@infradead.org> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Colin Cross <ccross@google.com> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Kees Cook <keescook@chromium.org> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Christian Brauner <brauner@kernel.org> Cc: Alexey Gladkov <legion@kernel.org> Cc: Sasha Levin <sashal@kernel.org> Cc: Chris Hyser <chris.hyser@oracle.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Peter Collingbourne <pcc@google.com> Cc: Xiaofeng Cao <caoxiaofeng@yulong.com> Cc: David Hildenbrand <david@redhat.com> Cc: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/proc/task_mmu.c | 6 ++-- fs/userfaultfd.c | 6 ++-- include/linux/mm.h | 7 ++-- include/linux/mm_inline.h | 87 ++++++++++++++++++++++++++++++++--------------- include/linux/mm_types.h | 5 ++- kernel/fork.c | 4 +-- kernel/sys.c | 19 +++++++---- mm/madvise.c | 87 ++++++++++++++++------------------------------- mm/mempolicy.c | 2 +- mm/mlock.c | 2 +- mm/mmap.c | 12 +++---- mm/mprotect.c | 2 +- 12 files changed, 125 insertions(+), 114 deletions(-) (limited to 'kernel') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6e97ed775074..2c48b1eaaa9c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -309,7 +309,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { - const char *anon_name; + struct anon_vma_name *anon_name; if (!mm) { name = "[vdso]"; @@ -327,10 +327,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - anon_name = vma_anon_name(vma); + anon_name = anon_vma_name(vma); if (anon_name) { seq_pad(m, ' '); - seq_printf(m, "[anon:%s]", anon_name); + seq_printf(m, "[anon:%s]", anon_name->name); } } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e26b10132d47..8e03b3d3f5fa 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -878,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX, vma_anon_name(vma)); + NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) vma = prev; else @@ -1438,7 +1438,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), - vma_anon_name(vma)); + anon_vma_name(vma)); if (prev) { vma = prev; goto next; @@ -1615,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX, vma_anon_name(vma)); + NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; goto next; diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..5744a3fc4716 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2626,7 +2626,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx, const char *); + struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -3372,11 +3372,12 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, const char *name); + unsigned long len_in, + struct anon_vma_name *anon_name); #else static inline int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, const char *name) { + unsigned long len_in, struct anon_vma_name *anon_name) { return 0; } #endif diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b725839dfe71..dd3accaa4e6d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -140,50 +140,81 @@ static __always_inline void del_page_from_lru_list(struct page *page, #ifdef CONFIG_ANON_VMA_NAME /* - * mmap_lock should be read-locked when calling vma_anon_name() and while using - * the returned pointer. + * mmap_lock should be read-locked when calling anon_vma_name(). Caller should + * either keep holding the lock while using the returned pointer or it should + * raise anon_vma_name refcount before releasing the lock. */ -extern const char *vma_anon_name(struct vm_area_struct *vma); +extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); +extern struct anon_vma_name *anon_vma_name_alloc(const char *name); +extern void anon_vma_name_free(struct kref *kref); -/* - * mmap_lock should be read-locked for orig_vma->vm_mm. - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be - * isolated. - */ -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma); +/* mmap_lock should be read-locked */ +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_get(&anon_name->kref); +} -/* - * mmap_lock should be write-locked or vma should have been isolated under - * write-locked mmap_lock protection. - */ -extern void free_vma_anon_name(struct vm_area_struct *vma); +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_put(&anon_name->kref, anon_vma_name_free); +} -/* mmap_lock should be read-locked */ -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + struct anon_vma_name *anon_name = anon_vma_name(orig_vma); + + if (anon_name) { + anon_vma_name_get(anon_name); + new_vma->anon_name = anon_name; + } +} + +static inline void free_anon_vma_name(struct vm_area_struct *vma) { - const char *vma_name = vma_anon_name(vma); + /* + * Not using anon_vma_name because it generates a warning if mmap_lock + * is not held, which might be the case here. + */ + if (!vma->vm_file) + anon_vma_name_put(vma->anon_name); +} - /* either both NULL, or pointers to same string */ - if (vma_name == name) +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + if (anon_name1 == anon_name2) return true; - return name && vma_name && !strcmp(name, vma_name); + return anon_name1 && anon_name2 && + !strcmp(anon_name1->name, anon_name2->name); } + #else /* CONFIG_ANON_VMA_NAME */ -static inline const char *vma_anon_name(struct vm_area_struct *vma) +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { return NULL; } -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) {} -static inline void free_vma_anon_name(struct vm_area_struct *vma) {} -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) + +static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + return NULL; +} + +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_anon_vma_name(struct vm_area_struct *vma) {} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) { return true; } + #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5140e5feb486..0f549870da6a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -416,7 +416,10 @@ struct vm_area_struct { struct rb_node rb; unsigned long rb_subtree_last; } shared; - /* Serialized by mmap_sem. */ + /* + * Serialized by mmap_sem. Never use directly because it is + * valid only when vm_file is NULL. Use anon_vma_name instead. + */ struct anon_vma_name *anon_name; }; diff --git a/kernel/fork.c b/kernel/fork.c index a024bf6254df..f1e89007f228 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -366,14 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; - dup_vma_anon_name(orig, new); + dup_anon_vma_name(orig, new); } return new; } void vm_area_free(struct vm_area_struct *vma) { - free_vma_anon_name(vma); + free_anon_vma_name(vma); kmem_cache_free(vm_area_cachep, vma); } diff --git a/kernel/sys.c b/kernel/sys.c index 97dc9e5d6bf9..5b0e172c4d47 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/utsname.h> #include <linux/mman.h> #include <linux/reboot.h> @@ -2286,15 +2287,16 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr, { struct mm_struct *mm = current->mm; const char __user *uname; - char *name, *pch; + struct anon_vma_name *anon_name = NULL; int error; switch (opt) { case PR_SET_VMA_ANON_NAME: uname = (const char __user *)arg; if (uname) { - name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + char *name, *pch; + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); if (IS_ERR(name)) return PTR_ERR(name); @@ -2304,15 +2306,18 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr, return -EINVAL; } } - } else { - /* Reset the name */ - name = NULL; + /* anon_vma has its own copy */ + anon_name = anon_vma_name_alloc(name); + kfree(name); + if (!anon_name) + return -ENOMEM; + } mmap_write_lock(mm); - error = madvise_set_anon_name(mm, addr, size, name); + error = madvise_set_anon_name(mm, addr, size, anon_name); mmap_write_unlock(mm); - kfree(name); + anon_vma_name_put(anon_name); break; default: error = -EINVAL; diff --git a/mm/madvise.c b/mm/madvise.c index 5604064df464..081b1cded21e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -65,7 +65,7 @@ static int madvise_need_mmap_write(int behavior) } #ifdef CONFIG_ANON_VMA_NAME -static struct anon_vma_name *anon_vma_name_alloc(const char *name) +struct anon_vma_name *anon_vma_name_alloc(const char *name) { struct anon_vma_name *anon_name; size_t count; @@ -81,78 +81,49 @@ static struct anon_vma_name *anon_vma_name_alloc(const char *name) return anon_name; } -static void vma_anon_name_free(struct kref *kref) +void anon_vma_name_free(struct kref *kref) { struct anon_vma_name *anon_name = container_of(kref, struct anon_vma_name, kref); kfree(anon_name); } -static inline bool has_vma_anon_name(struct vm_area_struct *vma) +struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { - return !vma->vm_file && vma->anon_name; -} - -const char *vma_anon_name(struct vm_area_struct *vma) -{ - if (!has_vma_anon_name(vma)) - return NULL; - mmap_assert_locked(vma->vm_mm); - return vma->anon_name->name; -} - -void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) -{ - if (!has_vma_anon_name(orig_vma)) - return; - - kref_get(&orig_vma->anon_name->kref); - new_vma->anon_name = orig_vma->anon_name; -} - -void free_vma_anon_name(struct vm_area_struct *vma) -{ - struct anon_vma_name *anon_name; - - if (!has_vma_anon_name(vma)) - return; + if (vma->vm_file) + return NULL; - anon_name = vma->anon_name; - vma->anon_name = NULL; - kref_put(&anon_name->kref, vma_anon_name_free); + return vma->anon_name; } /* mmap_lock should be write-locked */ -static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) { - const char *anon_name; + struct anon_vma_name *orig_name = anon_vma_name(vma); - if (!name) { - free_vma_anon_name(vma); + if (!anon_name) { + vma->anon_name = NULL; + anon_vma_name_put(orig_name); return 0; } - anon_name = vma_anon_name(vma); - if (anon_name) { - /* Same name, nothing to do here */ - if (!strcmp(name, anon_name)) - return 0; + if (anon_vma_name_eq(orig_name, anon_name)) + return 0; - free_vma_anon_name(vma); - } - vma->anon_name = anon_vma_name_alloc(name); - if (!vma->anon_name) - return -ENOMEM; + anon_vma_name_get(anon_name); + vma->anon_name = anon_name; + anon_vma_name_put(orig_name); return 0; } #else /* CONFIG_ANON_VMA_NAME */ -static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) { - if (name) + if (anon_name) return -EINVAL; return 0; @@ -165,13 +136,13 @@ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long new_flags, - const char *name) + struct anon_vma_name *anon_name) { struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; - if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) { + if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; return 0; } @@ -179,7 +150,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, name); + vma->vm_userfaultfd_ctx, anon_name); if (*prev) { vma = *prev; goto success; @@ -209,7 +180,7 @@ success: */ vma->vm_flags = new_flags; if (!vma->vm_file) { - error = replace_vma_anon_name(vma, name); + error = replace_anon_vma_name(vma, anon_name); if (error) return error; } @@ -1041,7 +1012,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, } error = madvise_update_vma(vma, prev, start, end, new_flags, - vma_anon_name(vma)); + anon_vma_name(vma)); out: /* @@ -1225,7 +1196,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long name) + unsigned long anon_name) { int error; @@ -1234,7 +1205,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (const char *)name); + (struct anon_vma_name *)anon_name); /* * madvise() returns EAGAIN if kernel resources, such as @@ -1246,7 +1217,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, } int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, const char *name) + unsigned long len_in, struct anon_vma_name *anon_name) { unsigned long end; unsigned long len; @@ -1266,7 +1237,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, (unsigned long)name, + return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 028e8dd82b44..69284d3b5e53 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -814,7 +814,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, - vma_anon_name(vma)); + anon_vma_name(vma)); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/mlock.c b/mm/mlock.c index 8f584eddd305..25934e7db3e1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, vma_anon_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index d445c1b9d606..f61a15474dd6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1031,7 +1031,7 @@ again: static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1049,7 +1049,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; - if (!is_same_vma_anon_name(vma, anon_name)) + if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) return 0; return 1; } @@ -1084,7 +1084,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { @@ -1106,7 +1106,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { @@ -1167,7 +1167,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -3256,7 +3256,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, vma_anon_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index 5ca3fbcb1495..2887644fd150 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, vma_anon_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); -- cgit v1.2.3-70-g09d2 From d1eff16d727ff257b706d32114d3881f67cc9c75 Mon Sep 17 00:00:00 2001 From: Qian Cai <quic_qiancai@quicinc.com> Date: Fri, 4 Mar 2022 20:29:10 -0800 Subject: configs/debug: set CONFIG_DEBUG_INFO=y properly CONFIG_DEBUG_INFO can't be set by user directly, so set CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y instead. Otherwise, we end up with no debuginfo in vmlinux which is a big no-no for kernel debugging. Link: https://lkml.kernel.org/r/20220301202920.18488-1-quic_qiancai@quicinc.com Signed-off-by: Qian Cai <quic_qiancai@quicinc.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/configs/debug.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index e9ffb0cc1eec..07df6d93c4df 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -16,7 +16,7 @@ CONFIG_SYMBOLIC_ERRNAME=y # # Compile-time checks and compiler options # -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_FRAME_WARN=2048 CONFIG_SECTION_MISMATCH_WARN_ONLY=y -- cgit v1.2.3-70-g09d2 From aa6f8dcbab473f3a3c7454b74caa46d36cdc5d13 Mon Sep 17 00:00:00 2001 From: Halil Pasic <pasic@linux.ibm.com> Date: Sat, 5 Mar 2022 18:07:14 +0100 Subject: swiotlb: rework "fix info leak with DMA_FROM_DEVICE" Unfortunately, we ended up merging an old version of the patch "fix info leak with DMA_FROM_DEVICE" instead of merging the latest one. Christoph (the swiotlb maintainer), he asked me to create an incremental fix (after I have pointed this out the mix up, and asked him for guidance). So here we go. The main differences between what we got and what was agreed are: * swiotlb_sync_single_for_device is also required to do an extra bounce * We decided not to introduce DMA_ATTR_OVERWRITE until we have exploiters * The implantation of DMA_ATTR_OVERWRITE is flawed: DMA_ATTR_OVERWRITE must take precedence over DMA_ATTR_SKIP_CPU_SYNC Thus this patch removes DMA_ATTR_OVERWRITE, and makes swiotlb_sync_single_for_device() bounce unconditionally (that is, also when dir == DMA_TO_DEVICE) in order do avoid synchronising back stale data from the swiotlb buffer. Let me note, that if the size used with dma_sync_* API is less than the size used with dma_[un]map_*, under certain circumstances we may still end up with swiotlb not being transparent. In that sense, this is no perfect fix either. To get this bullet proof, we would have to bounce the entire mapping/bounce buffer. For that we would have to figure out the starting address, and the size of the mapping in swiotlb_sync_single_for_device(). While this does seem possible, there seems to be no firm consensus on how things are supposed to work. Signed-off-by: Halil Pasic <pasic@linux.ibm.com> Fixes: ddbd89deb7d3 ("swiotlb: fix info leak with DMA_FROM_DEVICE") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- Documentation/core-api/dma-attributes.rst | 8 -------- include/linux/dma-mapping.h | 8 -------- kernel/dma/swiotlb.c | 23 +++++++++++++++-------- 3 files changed, 15 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 17706dc91ec9..1887d92e8e92 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,11 +130,3 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). - -DMA_ATTR_OVERWRITE ------------------- - -This is a hint to the DMA-mapping subsystem that the device is expected to -overwrite the entire mapped size, thus the caller does not require any of the -previous buffer contents to be preserved. This allows bounce-buffering -implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 6150d11a607e..dca2b1355bb1 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,14 +61,6 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) -/* - * This is a hint to the DMA-mapping subsystem that the device is expected - * to overwrite the entire mapped size, thus the caller does not require any - * of the previous buffer contents to be preserved. This allows - * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. - */ -#define DMA_ATTR_OVERWRITE (1UL << 10) - /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index bfc56cb21705..6db1c475ec82 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -627,10 +627,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || - dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. + */ + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } @@ -697,10 +701,13 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) - swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); + /* + * Unconditional bounce is necessary to avoid corruption on + * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite + * the whole lengt of the bounce buffer. + */ + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + BUG_ON(!valid_dma_direction(dir)); } void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, -- cgit v1.2.3-70-g09d2 From f0cfe17bcc1dd2f0872966b554a148e888833ee9 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira <bristot@kernel.org> Date: Wed, 9 Mar 2022 14:13:02 +0100 Subject: tracing/osnoise: Do not unregister events twice Nicolas reported that using: # trace-cmd record -e all -M 10 -p osnoise --poll Resulted in the following kernel warning: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1217 at kernel/tracepoint.c:404 tracepoint_probe_unregister+0x280/0x370 [...] CPU: 0 PID: 1217 Comm: trace-cmd Not tainted 5.17.0-rc6-next-20220307-nico+ #19 RIP: 0010:tracepoint_probe_unregister+0x280/0x370 [...] CR2: 00007ff919b29497 CR3: 0000000109da4005 CR4: 0000000000170ef0 Call Trace: <TASK> osnoise_workload_stop+0x36/0x90 tracing_set_tracer+0x108/0x260 tracing_set_trace_write+0x94/0xd0 ? __check_object_size.part.0+0x10a/0x150 ? selinux_file_permission+0x104/0x150 vfs_write+0xb5/0x290 ksys_write+0x5f/0xe0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7ff919a18127 [...] ---[ end trace 0000000000000000 ]--- The warning complains about an attempt to unregister an unregistered tracepoint. This happens on trace-cmd because it first stops tracing, and then switches the tracer to nop. Which is equivalent to: # cd /sys/kernel/tracing/ # echo osnoise > current_tracer # echo 0 > tracing_on # echo nop > current_tracer The osnoise tracer stops the workload when no trace instance is actually collecting data. This can be caused both by disabling tracing or disabling the tracer itself. To avoid unregistering events twice, use the existing trace_osnoise_callback_enabled variable to check if the events (and the workload) are actually active before trying to deactivate them. Link: https://lore.kernel.org/all/c898d1911f7f9303b7e14726e7cc9678fbfb4a0e.camel@redhat.com/ Link: https://lkml.kernel.org/r/938765e17d5a781c2df429a98f0b2e7cc317b022.1646823913.git.bristot@kernel.org Cc: stable@vger.kernel.org Cc: Marcelo Tosatti <mtosatti@redhat.com> Fixes: 2fac8d6486d5 ("tracing/osnoise: Allow multiple instances of the same tracer") Reported-by: Nicolas Saenz Julienne <nsaenzju@redhat.com> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace_osnoise.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index cfddb30e65ab..2aa3efdca755 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2200,6 +2200,17 @@ static void osnoise_workload_stop(void) if (osnoise_has_registered_instances()) return; + /* + * If callbacks were already disabled in a previous stop + * call, there is no need to disable then again. + * + * For instance, this happens when tracing is stopped via: + * echo 0 > tracing_on + * echo nop > current_tracer. + */ + if (!trace_osnoise_callback_enabled) + return; + trace_osnoise_callback_enabled = false; /* * Make sure that ftrace_nmi_enter/exit() see -- cgit v1.2.3-70-g09d2 From caf4c86bf136845982c5103b2661751b40c474c0 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne <nsaenzju@redhat.com> Date: Mon, 7 Mar 2022 19:07:40 +0100 Subject: tracing/osnoise: Force quiescent states while tracing At the moment running osnoise on a nohz_full CPU or uncontested FIFO priority and a PREEMPT_RCU kernel might have the side effect of extending grace periods too much. This will entice RCU to force a context switch on the wayward CPU to end the grace period, all while introducing unwarranted noise into the tracer. This behaviour is unavoidable as overly extending grace periods might exhaust the system's memory. This same exact problem is what extended quiescent states (EQS) were created for, conversely, rcu_momentary_dyntick_idle() emulates them by performing a zero duration EQS. So let's make use of it. In the common case rcu_momentary_dyntick_idle() is fairly inexpensive: atomically incrementing a local per-CPU counter and doing a store. So it shouldn't affect osnoise's measurements (which has a 1us granularity), so we'll call it unanimously. The uncommon case involve calling rcu_momentary_dyntick_idle() after having the osnoise process: - Receive an expedited quiescent state IPI with preemption disabled or during an RCU critical section. (activates rdp->cpu_no_qs.b.exp code-path). - Being preempted within in an RCU critical section and having the subsequent outermost rcu_read_unlock() called with interrupts disabled. (t->rcu_read_unlock_special.b.blocked code-path). Neither of those are possible at the moment, and are unlikely to be in the future given the osnoise's loop design. On top of this, the noise generated by the situations described above is unavoidable, and if not exposed by rcu_momentary_dyntick_idle() will be eventually seen in subsequent rcu_read_unlock() calls or schedule operations. Link: https://lkml.kernel.org/r/20220307180740.577607-1-nsaenzju@redhat.com Cc: stable@vger.kernel.org Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com> Acked-by: Paul E. McKenney <paulmck@kernel.org> Acked-by: Daniel Bristot de Oliveira <bristot@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace_osnoise.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 2aa3efdca755..5e3c62a08fc0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1386,6 +1386,26 @@ static int run_osnoise(void) osnoise_stop_tracing(); } + /* + * In some cases, notably when running on a nohz_full CPU with + * a stopped tick PREEMPT_RCU has no way to account for QSs. + * This will eventually cause unwarranted noise as PREEMPT_RCU + * will force preemption as the means of ending the current + * grace period. We avoid this problem by calling + * rcu_momentary_dyntick_idle(), which performs a zero duration + * EQS allowing PREEMPT_RCU to end the current grace period. + * This call shouldn't be wrapped inside an RCU critical + * section. + * + * Note that in non PREEMPT_RCU kernels QSs are handled through + * cond_resched() + */ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { + local_irq_disable(); + rcu_momentary_dyntick_idle(); + local_irq_enable(); + } + /* * For the non-preemptive kernel config: let threads runs, if * they so wish. -- cgit v1.2.3-70-g09d2 From 78cbc6513217b00be6a9904415ef7ff3619eb035 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Date: Mon, 7 Mar 2022 08:43:03 +0800 Subject: ftrace: Fix some W=1 warnings in kernel doc comments Clean up the following clang-w1 warning: kernel/trace/ftrace.c:7827: warning: Function parameter or member 'ops' not described in 'unregister_ftrace_function'. kernel/trace/ftrace.c:7805: warning: Function parameter or member 'ops' not described in 'register_ftrace_function'. Link: https://lkml.kernel.org/r/20220307004303.26399-1-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a4b462b6f944..6105b7036482 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7790,7 +7790,7 @@ int ftrace_is_dead(void) /** * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. + * @ops: ops structure that holds the function for profiling. * * Register a function to be called by all functions in the * kernel. @@ -7817,7 +7817,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_function); /** * unregister_ftrace_function - unregister a function for profiling. - * @ops - ops structure that holds the function to unregister + * @ops: ops structure that holds the function to unregister * * Unregister a function that was added to be called by ftrace profiling. */ -- cgit v1.2.3-70-g09d2 From c993ee0f9f81caf5767a50d1faeba39a0dc82af2 Mon Sep 17 00:00:00 2001 From: David Howells <dhowells@redhat.com> Date: Fri, 11 Mar 2022 13:23:31 +0000 Subject: watch_queue: Fix filter limit check In watch_queue_set_filter(), there are a couple of places where we check that the filter type value does not exceed what the type_filter bitmap can hold. One place calculates the number of bits by: if (tf[i].type >= sizeof(wfilter->type_filter) * 8) which is fine, but the second does: if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) which is not. This can lead to a couple of out-of-bounds writes due to a too-large type: (1) __set_bit() on wfilter->type_filter (2) Writing more elements in wfilter->filters[] than we allocated. Fix this by just using the proper WATCH_TYPE__NR instead, which is the number of types we actually know about. The bug may cause an oops looking something like: BUG: KASAN: slab-out-of-bounds in watch_queue_set_filter+0x659/0x740 Write of size 4 at addr ffff88800d2c66bc by task watch_queue_oob/611 ... Call Trace: <TASK> dump_stack_lvl+0x45/0x59 print_address_description.constprop.0+0x1f/0x150 ... kasan_report.cold+0x7f/0x11b ... watch_queue_set_filter+0x659/0x740 ... __x64_sys_ioctl+0x127/0x190 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Allocated by task 611: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 watch_queue_set_filter+0x23a/0x740 __x64_sys_ioctl+0x127/0x190 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88800d2c66a0 which belongs to the cache kmalloc-32 of size 32 The buggy address is located 28 bytes inside of 32-byte region [ffff88800d2c66a0, ffff88800d2c66c0) Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn <jannh@google.com> Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/watch_queue.h | 3 ++- kernel/watch_queue.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h index c994d1b2cdba..3b9a40ae8bdb 100644 --- a/include/linux/watch_queue.h +++ b/include/linux/watch_queue.h @@ -28,7 +28,8 @@ struct watch_type_filter { struct watch_filter { union { struct rcu_head rcu; - unsigned long type_filter[2]; /* Bitmask of accepted types */ + /* Bitmask of accepted types */ + DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ struct watch_type_filter filters[]; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 9c9eb20dd2c5..427b0318e303 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -320,7 +320,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, tf[i].info_mask & WATCH_INFO_LENGTH) goto err_filter; /* Ignore any unknown types */ - if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + if (tf[i].type >= WATCH_TYPE__NR) continue; nr_filter++; } @@ -336,7 +336,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, q = wfilter->filters; for (i = 0; i < filter.nr_filters; i++) { - if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) + if (tf[i].type >= WATCH_TYPE__NR) continue; q->type = tf[i].type; -- cgit v1.2.3-70-g09d2 From c1853fbadcba1497f4907971e7107888e0714c81 Mon Sep 17 00:00:00 2001 From: David Howells <dhowells@redhat.com> Date: Fri, 11 Mar 2022 13:23:46 +0000 Subject: watch_queue: Fix to release page in ->release() When a pipe ring descriptor points to a notification message, the refcount on the backing page is incremented by the generic get function, but the release function, which marks the bitmap, doesn't drop the page ref. Fix this by calling generic_pipe_buf_release() at the end of watch_queue_pipe_buf_release(). Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn <jannh@google.com> Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/watch_queue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 427b0318e303..dfb3a7e28280 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -54,6 +54,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, bit += page->index; set_bit(bit, wqueue->notes_bitmap); + generic_pipe_buf_release(pipe, buf); } // No try_steal function => no stealing -- cgit v1.2.3-70-g09d2 From 96a4d8912b28451cd62825fd7caa0e66e091d938 Mon Sep 17 00:00:00 2001 From: David Howells <dhowells@redhat.com> Date: Fri, 11 Mar 2022 13:24:08 +0000 Subject: watch_queue: Fix to always request a pow-of-2 pipe ring size The pipe ring size must always be a power of 2 as the head and tail pointers are masked off by AND'ing with the size of the ring - 1. watch_queue_set_size(), however, lets you specify any number of notes between 1 and 511. This number is passed through to pipe_resize_ring() without checking/forcing its alignment. Fix this by rounding the number of slots required up to the nearest power of two. The request is meant to guarantee that at least that many notifications can be generated before the queue is full, so rounding down isn't an option, but, alternatively, it may be better to give an error if we aren't allowed to allocate that much ring space. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn <jannh@google.com> Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index dfb3a7e28280..4bcd400984a7 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -244,7 +244,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; } - ret = pipe_resize_ring(pipe, nr_notes); + ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; -- cgit v1.2.3-70-g09d2 From a66bd7575b5f449ee0ba20cfd21c3bc5b04ef361 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Date: Fri, 11 Mar 2022 13:24:15 +0000 Subject: watch_queue: Use the bitmap API when applicable Use bitmap_alloc() to simplify code, improve the semantic and reduce some open-coded arithmetic in allocator arguments. Also change a memset(0xff) into an equivalent bitmap_fill() to keep consistency. Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/watch_queue.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 4bcd400984a7..5b516eb2c7cc 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -220,7 +220,6 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) struct page **pages; unsigned long *bitmap; unsigned long user_bufs; - unsigned int bmsize; int ret, i, nr_pages; if (!wqueue) @@ -259,13 +258,11 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; } - bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG; - bmsize *= sizeof(unsigned long); - bitmap = kmalloc(bmsize, GFP_KERNEL); + bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); if (!bitmap) goto error_p; - memset(bitmap, 0xff, bmsize); + bitmap_fill(bitmap, nr_notes); wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; -- cgit v1.2.3-70-g09d2 From 3b4c0371928c17af03e8397ac842346624017ce6 Mon Sep 17 00:00:00 2001 From: David Howells <dhowells@redhat.com> Date: Fri, 11 Mar 2022 13:24:22 +0000 Subject: watch_queue: Fix the alloc bitmap size to reflect notes allocated Currently, watch_queue_set_size() sets the number of notes available in wqueue->nr_notes according to the number of notes allocated, but sets the size of the bitmap to the unrounded number of notes originally asked for. Fix this by setting the bitmap size to the number of notes we're actually going to make available (ie. the number allocated). Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn <jannh@google.com> Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/watch_queue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 5b516eb2c7cc..9c476d2cbac0 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -243,6 +243,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; } + nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; @@ -266,7 +267,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; - wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + wqueue->nr_notes = nr_notes; return 0; error_p: -- cgit v1.2.3-70-g09d2 From 7ea1a0124b6da246b5bc8c66cddaafd36acf3ecb Mon Sep 17 00:00:00 2001 From: David Howells <dhowells@redhat.com> Date: Fri, 11 Mar 2022 13:24:29 +0000 Subject: watch_queue: Free the alloc bitmap when the watch_queue is torn down Free the watch_queue note allocation bitmap when the watch_queue is destroyed. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn <jannh@google.com> Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/watch_queue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 9c476d2cbac0..c12267ccc70e 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -370,6 +370,7 @@ static void __put_watch_queue(struct kref *kref) for (i = 0; i < wqueue->nr_pages; i++) __free_page(wqueue->notes[i]); + bitmap_free(wqueue->notes_bitmap); wfilter = rcu_access_pointer(wqueue->filter); if (wfilter) -- cgit v1.2.3-70-g09d2 From 2ed147f015af2b48f41c6f0b6746aa9ea85c19f3 Mon Sep 17 00:00:00 2001 From: David Howells <dhowells@redhat.com> Date: Fri, 11 Mar 2022 13:24:36 +0000 Subject: watch_queue: Fix lack of barrier/sync/lock between post and read There's nothing to synchronise post_one_notification() versus pipe_read(). Whilst posting is done under pipe->rd_wait.lock, the reader only takes pipe->mutex which cannot bar notification posting as that may need to be made from contexts that cannot sleep. Fix this by setting pipe->head with a barrier in post_one_notification() and reading pipe->head with a barrier in pipe_read(). If that's not sufficient, the rd_wait.lock will need to be taken, possibly in a ->confirm() op so that it only applies to notifications. The lock would, however, have to be dropped before copy_page_to_iter() is invoked. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn <jannh@google.com> Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/pipe.c | 3 ++- kernel/watch_queue.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/pipe.c b/fs/pipe.c index 4eb88bc138bb..2667db9506e2 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -253,7 +253,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) */ was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { - unsigned int head = pipe->head; + /* Read ->head with a barrier vs post_one_notification() */ + unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index c12267ccc70e..37bcd900fd77 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -113,7 +113,7 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->offset = offset; buf->len = len; buf->flags = PIPE_BUF_FLAG_WHOLE; - pipe->head = head + 1; + smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { spin_unlock_irq(&pipe->rd_wait.lock); -- cgit v1.2.3-70-g09d2 From 4edc0760412b0c4ecefc7e02cb855b310b122825 Mon Sep 17 00:00:00 2001 From: David Howells <dhowells@redhat.com> Date: Fri, 11 Mar 2022 13:24:47 +0000 Subject: watch_queue: Make comment about setting ->defunct more accurate watch_queue_clear() has a comment stating that setting ->defunct to true preventing new additions as well as preventing notifications. Whilst the latter is true, the first bit is superfluous since at the time this function is called, the pipe cannot be accessed to add new event sources. Remove the "new additions" bit from the comment. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn <jannh@google.com> Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 37bcd900fd77..00703444a219 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -566,7 +566,7 @@ void watch_queue_clear(struct watch_queue *wqueue) rcu_read_lock(); spin_lock_bh(&wqueue->lock); - /* Prevent new additions and prevent notifications from happening */ + /* Prevent new notifications from being stored. */ wqueue->defunct = true; while (!hlist_empty(&wqueue->watches)) { -- cgit v1.2.3-70-g09d2