From 0658a0961b0ace06b4cf0e1b73a4f20e349f4346 Mon Sep 17 00:00:00 2001 From: Florian Weimer Date: Mon, 8 Nov 2021 18:31:30 -0800 Subject: procfs: do not list TID 0 in /proc//task If a task exits concurrently, task_pid_nr_ns may return 0. [akpm@linux-foundation.org: coding style tweaks] [adobriyan@gmail.com: test that /proc/*/task doesn't contain "0"] Link: https://lkml.kernel.org/r/YV88AnVzHxPafQ9o@localhost.localdomain Link: https://lkml.kernel.org/r/8735pn5dx7.fsf@oldenburg.str.redhat.com Signed-off-by: Florian Weimer Signed-off-by: Alexey Dobriyan Acked-by: Christian Brauner Reviewed-by: Alexey Dobriyan Cc: Kees Cook Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 533d5836eb9a..5541de99809c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3799,7 +3799,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) task = next_tid(task), ctx->pos++) { char name[10 + 1]; unsigned int len; + tid = task_pid_nr_ns(task, ns); + if (!tid) + continue; /* The task has just exited. */ len = snprintf(name, sizeof(name), "%u", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { -- cgit v1.2.3-70-g09d2 From 2c9feeaedfe1f34c4e327cb6f9b8b90052edced1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 8 Nov 2021 18:31:44 -0800 Subject: proc/vmcore: let pfn_is_ram() return a bool The callback should deal with errors internally, it doesn't make sense to expose these via pfn_is_ram(). We'll rework the callbacks next. Right now we consider errors as if "it's RAM"; no functional change. Link: https://lkml.kernel.org/r/20211005121430.30136-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Juergen Gross Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9a15334da208..a9bd80ab670e 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -84,11 +84,11 @@ void unregister_oldmem_pfn_is_ram(void) } EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); -static int pfn_is_ram(unsigned long pfn) +static bool pfn_is_ram(unsigned long pfn) { int (*fn)(unsigned long pfn); /* pfn is ram unless fn() checks pagetype */ - int ret = 1; + bool ret = true; /* * Ask hypervisor if the pfn is really ram. @@ -97,7 +97,7 @@ static int pfn_is_ram(unsigned long pfn) */ fn = oldmem_pfn_is_ram; if (fn) - ret = fn(pfn); + ret = !!fn(pfn); return ret; } @@ -124,7 +124,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, nr_bytes = count; /* If pfn is not ram, return zeros for sparse dump files */ - if (pfn_is_ram(pfn) == 0) + if (!pfn_is_ram(pfn)) memset(buf, 0, nr_bytes); else { if (encrypted) -- cgit v1.2.3-70-g09d2 From cc5f2704c93456e6f1c671c5cf7b582a55df5484 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 8 Nov 2021 18:31:48 -0800 Subject: proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore callbacks Let's support multiple registered callbacks, making sure that registering vmcore callbacks cannot fail. Make the callback return a bool instead of an int, handling how to deal with errors internally. Drop unused HAVE_OLDMEM_PFN_IS_RAM. We soon want to make use of this infrastructure from other drivers: virtio-mem, registering one callback for each virtio-mem device, to prevent reading unplugged virtio-mem memory. Handle it via a generic vmcore_cb structure, prepared for future extensions: for example, once we support virtio-mem on s390x where the vmcore is completely constructed in the second kernel, we want to detect and add plugged virtio-mem memory ranges to the vmcore in order for them to get dumped properly. Handle corner cases that are unexpected and shouldn't happen in sane setups: registering a callback after the vmcore has already been opened (warn only) and unregistering a callback after the vmcore has already been opened (warn and essentially read only zeroes from that point on). Link: https://lkml.kernel.org/r/20211005121430.30136-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Juergen Gross Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/aperture_64.c | 13 +++++- arch/x86/xen/mmu_hvm.c | 11 +++-- fs/proc/vmcore.c | 99 ++++++++++++++++++++++++++++++------------- include/linux/crash_dump.h | 26 ++++++++++-- 4 files changed, 111 insertions(+), 38 deletions(-) (limited to 'fs/proc') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 10562885f5fc..af3ba08b684b 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -73,12 +73,23 @@ static int gart_mem_pfn_is_ram(unsigned long pfn) (pfn >= aperture_pfn_start + aperture_page_count)); } +#ifdef CONFIG_PROC_VMCORE +static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn) +{ + return !!gart_mem_pfn_is_ram(pfn); +} + +static struct vmcore_cb gart_vmcore_cb = { + .pfn_is_ram = gart_oldmem_pfn_is_ram, +}; +#endif + static void __init exclude_from_core(u64 aper_base, u32 aper_order) { aperture_pfn_start = aper_base >> PAGE_SHIFT; aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; #ifdef CONFIG_PROC_VMCORE - WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram)); + register_vmcore_cb(&gart_vmcore_cb); #endif #ifdef CONFIG_PROC_KCORE WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram)); diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c index 6ba8826dcdcc..509bdee3ab90 100644 --- a/arch/x86/xen/mmu_hvm.c +++ b/arch/x86/xen/mmu_hvm.c @@ -12,10 +12,10 @@ * The kdump kernel has to check whether a pfn of the crashed kernel * was a ballooned page. vmcore is using this function to decide * whether to access a pfn of the crashed kernel. - * Returns 0 if the pfn is not backed by a RAM page, the caller may + * Returns "false" if the pfn is not backed by a RAM page, the caller may * handle the pfn special in this case. */ -static int xen_oldmem_pfn_is_ram(unsigned long pfn) +static bool xen_vmcore_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn) { struct xen_hvm_get_mem_type a = { .domid = DOMID_SELF, @@ -24,10 +24,13 @@ static int xen_oldmem_pfn_is_ram(unsigned long pfn) if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) { pr_warn_once("Unexpected HVMOP_get_mem_type failure\n"); - return -ENXIO; + return true; } return a.mem_type != HVMMEM_mmio_dm; } +static struct vmcore_cb xen_vmcore_cb = { + .pfn_is_ram = xen_vmcore_pfn_is_ram, +}; #endif static void xen_hvm_exit_mmap(struct mm_struct *mm) @@ -61,6 +64,6 @@ void __init xen_hvm_init_mmu_ops(void) if (is_pagetable_dying_supported()) pv_ops.mmu.exit_mmap = xen_hvm_exit_mmap; #ifdef CONFIG_PROC_VMCORE - WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram)); + register_vmcore_cb(&xen_vmcore_cb); #endif } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a9bd80ab670e..7a04b2eca287 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -62,46 +62,75 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0); /* Device Dump Size */ static size_t vmcoredd_orig_sz; -/* - * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error - * The called function has to take care of module refcounting. - */ -static int (*oldmem_pfn_is_ram)(unsigned long pfn); - -int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) +static DECLARE_RWSEM(vmcore_cb_rwsem); +/* List of registered vmcore callbacks. */ +static LIST_HEAD(vmcore_cb_list); +/* Whether we had a surprise unregistration of a callback. */ +static bool vmcore_cb_unstable; +/* Whether the vmcore has been opened once. */ +static bool vmcore_opened; + +void register_vmcore_cb(struct vmcore_cb *cb) { - if (oldmem_pfn_is_ram) - return -EBUSY; - oldmem_pfn_is_ram = fn; - return 0; + down_write(&vmcore_cb_rwsem); + INIT_LIST_HEAD(&cb->next); + list_add_tail(&cb->next, &vmcore_cb_list); + /* + * Registering a vmcore callback after the vmcore was opened is + * very unusual (e.g., manual driver loading). + */ + if (vmcore_opened) + pr_warn_once("Unexpected vmcore callback registration\n"); + up_write(&vmcore_cb_rwsem); } -EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); +EXPORT_SYMBOL_GPL(register_vmcore_cb); -void unregister_oldmem_pfn_is_ram(void) +void unregister_vmcore_cb(struct vmcore_cb *cb) { - oldmem_pfn_is_ram = NULL; - wmb(); + down_write(&vmcore_cb_rwsem); + list_del(&cb->next); + /* + * Unregistering a vmcore callback after the vmcore was opened is + * very unusual (e.g., forced driver removal), but we cannot stop + * unregistering. + */ + if (vmcore_opened) { + pr_warn_once("Unexpected vmcore callback unregistration\n"); + vmcore_cb_unstable = true; + } + up_write(&vmcore_cb_rwsem); } -EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); +EXPORT_SYMBOL_GPL(unregister_vmcore_cb); static bool pfn_is_ram(unsigned long pfn) { - int (*fn)(unsigned long pfn); - /* pfn is ram unless fn() checks pagetype */ + struct vmcore_cb *cb; bool ret = true; - /* - * Ask hypervisor if the pfn is really ram. - * A ballooned page contains no data and reading from such a page - * will cause high load in the hypervisor. - */ - fn = oldmem_pfn_is_ram; - if (fn) - ret = !!fn(pfn); + lockdep_assert_held_read(&vmcore_cb_rwsem); + if (unlikely(vmcore_cb_unstable)) + return false; + + list_for_each_entry(cb, &vmcore_cb_list, next) { + if (unlikely(!cb->pfn_is_ram)) + continue; + ret = cb->pfn_is_ram(cb, pfn); + if (!ret) + break; + } return ret; } +static int open_vmcore(struct inode *inode, struct file *file) +{ + down_read(&vmcore_cb_rwsem); + vmcore_opened = true; + up_read(&vmcore_cb_rwsem); + + return 0; +} + /* Reads a page from the oldmem device from given offset. */ ssize_t read_from_oldmem(char *buf, size_t count, u64 *ppos, int userbuf, @@ -117,6 +146,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset = (unsigned long)(*ppos % PAGE_SIZE); pfn = (unsigned long)(*ppos / PAGE_SIZE); + down_read(&vmcore_cb_rwsem); do { if (count > (PAGE_SIZE - offset)) nr_bytes = PAGE_SIZE - offset; @@ -136,8 +166,10 @@ ssize_t read_from_oldmem(char *buf, size_t count, tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); - if (tmp < 0) + if (tmp < 0) { + up_read(&vmcore_cb_rwsem); return tmp; + } } *ppos += nr_bytes; count -= nr_bytes; @@ -147,6 +179,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset = 0; } while (count); + up_read(&vmcore_cb_rwsem); return read; } @@ -537,14 +570,19 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { + int ret; + /* * Check if oldmem_pfn_is_ram was registered to avoid * looping over all pages without a reason. */ - if (oldmem_pfn_is_ram) - return remap_oldmem_pfn_checked(vma, from, pfn, size, prot); + down_read(&vmcore_cb_rwsem); + if (!list_empty(&vmcore_cb_list) || vmcore_cb_unstable) + ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else - return remap_oldmem_pfn_range(vma, from, pfn, size, prot); + ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); + up_read(&vmcore_cb_rwsem); + return ret; } static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) @@ -668,6 +706,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) #endif static const struct proc_ops vmcore_proc_ops = { + .proc_open = open_vmcore, .proc_read = read_vmcore, .proc_lseek = default_llseek, .proc_mmap = mmap_vmcore, diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 2618577a4d6d..0c547d866f1e 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -91,9 +91,29 @@ static inline void vmcore_unusable(void) elfcorehdr_addr = ELFCORE_ADDR_ERR; } -#define HAVE_OLDMEM_PFN_IS_RAM 1 -extern int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)); -extern void unregister_oldmem_pfn_is_ram(void); +/** + * struct vmcore_cb - driver callbacks for /proc/vmcore handling + * @pfn_is_ram: check whether a PFN really is RAM and should be accessed when + * reading the vmcore. Will return "true" if it is RAM or if the + * callback cannot tell. If any callback returns "false", it's not + * RAM and the page must not be accessed; zeroes should be + * indicated in the vmcore instead. For example, a ballooned page + * contains no data and reading from such a page will cause high + * load in the hypervisor. + * @next: List head to manage registered callbacks internally; initialized by + * register_vmcore_cb(). + * + * vmcore callbacks allow drivers managing physical memory ranges to + * coordinate with vmcore handling code, for example, to prevent accessing + * physical memory ranges that should not be accessed when reading the vmcore, + * although included in the vmcore header as memory ranges to dump. + */ +struct vmcore_cb { + bool (*pfn_is_ram)(struct vmcore_cb *cb, unsigned long pfn); + struct list_head next; +}; +extern void register_vmcore_cb(struct vmcore_cb *cb); +extern void unregister_vmcore_cb(struct vmcore_cb *cb); #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return 0; } -- cgit v1.2.3-70-g09d2 From da4d6b9cf80ae5b0083f640133b85b68b53b6497 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 8 Nov 2021 18:32:05 -0800 Subject: proc: allow pid_revalidate() during LOOKUP_RCU Problem Description: When running running ~128 parallel instances of TZ=/etc/localtime ps -fe >/dev/null on a 128CPU machine, the %sys utilization reaches 97%, and perf shows the following code path as being responsible for heavy contention on the d_lockref spinlock: walk_component() lookup_fast() d_revalidate() pid_revalidate() // returns -ECHILD unlazy_child() lockref_get_not_dead(&nd->path.dentry->d_lockref) <-- contention The reason is that pid_revalidate() is triggering a drop from RCU to ref path walk mode. All concurrent path lookups thus try to grab a reference to the dentry for /proc/, before re-executing pid_revalidate() and then stepping into the /proc/$pid directory. Thus there is huge spinlock contention. This patch allows pid_revalidate() to execute in RCU mode, meaning that the path lookup can successfully enter the /proc/$pid directory while still in RCU mode. Later on, the path lookup may still drop into ref mode, but the contention will be much reduced at this point. By applying this patch, %sys utilization falls to around 85% under the same workload, and the number of ps processes executed per unit time increases by 3x-4x. Although this particular workload is a bit contrived, we have seen some large collections of eager monitoring scripts which produced similarly high %sys time due to contention in the /proc directory. As a result this patch, Al noted that several procfs methods which were only called in ref-walk mode could now be called from RCU mode. To ensure that this patch is safe, I audited all the inode get_link and permission() implementations, as well as dentry d_revalidate() implementations, in fs/proc. The purpose here is to ensure that they either are safe to call in RCU (i.e. don't sleep) or correctly bail out of RCU mode if they don't support it. My analysis shows that all at-risk procfs methods are safe to call under RCU, and thus this patch is safe. Procfs RCU-walk Analysis: This analysis is up-to-date with 5.15-rc3. When called under RCU mode, these functions have arguments as follows: * get_link() receives a NULL dentry pointer when called in RCU mode. * permission() receives MAY_NOT_BLOCK in the mode parameter when called from RCU. * d_revalidate() receives LOOKUP_RCU in flags. For the following functions, either they are trivially RCU safe, or they explicitly bail at the beginning of the function when they run: proc_ns_get_link (bails out) proc_get_link (RCU safe) proc_pid_get_link (bails out) map_files_d_revalidate (bails out) map_misc_d_revalidate (bails out) proc_net_d_revalidate (RCU safe) proc_sys_revalidate (bails out, also not under /proc/$pid) tid_fd_revalidate (bails out) proc_sys_permission (not under /proc/$pid) The remainder of the functions require a bit more detail: * proc_fd_permission: RCU safe. All of the body of this function is under rcu_read_lock(), except generic_permission() which declares itself RCU safe in its documentation string. * proc_self_get_link uses GFP_ATOMIC in the RCU case, so it is RCU aware and otherwise looks safe. The same is true of proc_thread_self_get_link. * proc_map_files_get_link: calls ns_capable, which calls capable(), and thus calls into the audit code (see note #1 below). The remainder is just a call to the trivially safe proc_pid_get_link(). * proc_pid_permission: calls ptrace_may_access(), which appears RCU safe, although it does call into the "security_ptrace_access_check()" hook, which looks safe under smack and selinux. Just the audit code is of concern. Also uses get_task_struct() and put_task_struct(), see note #2 below. * proc_tid_comm_permission: Appears safe, though calls put_task_struct (see note #2 below). Note #1: Most of the concern of RCU safety has centered around the audit code. However, since b17ec22fb339 ("selinux: slow_avc_audit has become non-blocking"), it's safe to call this code under RCU. So all of the above are safe by my estimation. Note #2: get_task_struct() and put_task_struct(): The majority of get_task_struct() is under RCU read lock, and in any case it is a simple increment. But put_task_struct() is complex, given that it could at some point free the task struct, and this process has many steps which I couldn't manually verify. However, several other places call put_task_struct() under RCU, so it appears safe to use here too (see kernel/hung_task.c:165 or rcu/tree-stall.h:296) Patch description: pid_revalidate() drops from RCU into REF lookup mode. When many threads are resolving paths within /proc in parallel, this can result in heavy spinlock contention on d_lockref as each thread tries to grab a reference to the /proc dentry (and drop it shortly thereafter). Investigation indicates that it is not necessary to drop RCU in pid_revalidate(), as no RCU data is modified and the function never sleeps. So, remove the LOOKUP_RCU check. Link: https://lkml.kernel.org/r/20211004175629.292270-2-stephen.s.brennan@oracle.com Signed-off-by: Stephen Brennan Cc: Konrad Wilk Cc: Alexander Viro Cc: Matthew Wilcox Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 5541de99809c..264509e584e3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1979,19 +1979,21 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; + int ret = 0; - if (flags & LOOKUP_RCU) - return -ECHILD; - - inode = d_inode(dentry); - task = get_proc_task(inode); + rcu_read_lock(); + inode = d_inode_rcu(dentry); + if (!inode) + goto out; + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { pid_update_inode(task, inode); - put_task_struct(task); - return 1; + ret = 1; } - return 0; +out: + rcu_read_unlock(); + return ret; } static inline bool proc_inode_is_dead(struct inode *inode) -- cgit v1.2.3-70-g09d2