From f2d5dcb48f7ba9e3ff249d58fc1fa963d374e66a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 10 Oct 2023 15:55:49 +0100 Subject: bounds: support non-power-of-two CONFIG_NR_CPUS ilog2() rounds down, so for example when PowerPC 85xx sets CONFIG_NR_CPUS to 24, we will only allocate 4 bits to store the number of CPUs instead of 5. Use bits_per() instead, which rounds up. Found by code inspection. The effect of this would probably be a misaccounting when doing NUMA balancing, so to a user, it would only be a performance penalty. The effects may be more wide-spread; it's hard to tell. Link: https://lkml.kernel.org/r/20231010145549.1244748-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Fixes: 90572890d202 ("mm: numa: Change page last {nid,pid} into {cpu,pid}") Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton --- kernel/bounds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bounds.c b/kernel/bounds.c index b529182e8b04..c5a9fcd2d622 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -19,7 +19,7 @@ int main(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); #ifdef CONFIG_SMP - DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); #ifdef CONFIG_LRU_GEN -- cgit v1.2.3-70-g09d2 From b73aa539a7789d2366aa7ffa627cede77d6f1c4e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 7 Jan 2024 17:16:41 +0800 Subject: panic: suppress gnu_printf warning with GCC 13.2.1 and W=1, there's compiling warning like this: kernel/panic.c: In function `__warn': kernel/panic.c:676:17: warning: function `__warn' might be a candidate for `gnu_printf' format attribute [-Wsuggest-attribute=format] 676 | vprintk(args->fmt, args->args); | ^~~~~~~ The normal __printf(x,y) adding can't fix it. So add workaround which disables -Wsuggest-attribute=format to mute it. Link: https://lkml.kernel.org/r/20240107091641.579849-1-bhe@redhat.com Signed-off-by: Baoquan He Signed-off-by: Andrew Morton --- kernel/panic.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 2807639aab51..d49b68184c56 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -666,8 +666,13 @@ void __warn(const char *file, int line, void *caller, unsigned taint, pr_warn("WARNING: CPU: %d PID: %d at %pS\n", raw_smp_processor_id(), current->pid, caller); +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +#endif if (args) vprintk(args->fmt, args->args); +#pragma GCC diagnostic pop print_modules(); -- cgit v1.2.3-70-g09d2 From 6db9d317833dd89d666ca0373c7b60f413bca9eb Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 15 Jan 2024 14:25:19 +0800 Subject: user_namespace: remove unnecessary NULL values from kbuf kbuf is assigned first, so it does not need to initialize the assignment. Link: https://lkml.kernel.org/r/20240115062519.31298-1-zeming@nfschina.com Signed-off-by: Li zeming Cc: Randy Dunlap Signed-off-by: Andrew Morton --- kernel/user_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ce4d99df5f0e..0b0b95418b16 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -931,7 +931,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; - char *kbuf = NULL, *pos, *next_line; + char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ -- cgit v1.2.3-70-g09d2 From 08701813a1b439f01e52c10e66ee2e99d9d23c6c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Jan 2024 18:16:31 +0100 Subject: ptrace_attach: shift send(SIGSTOP) into ptrace_set_stopped() Turn send_sig_info(SIGSTOP) into send_signal_locked(SIGSTOP) and move it from ptrace_attach() to ptrace_set_stopped(). This looks more logical and avoids lock(siglock) right after unlock(). Link: https://lkml.kernel.org/r/20240122171631.GA29844@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/ptrace.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2fabd497d659..d5f89f9ef29f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -375,10 +375,13 @@ static int check_ptrace_options(unsigned long data) return 0; } -static inline void ptrace_set_stopped(struct task_struct *task) +static inline void ptrace_set_stopped(struct task_struct *task, bool seize) { guard(spinlock)(&task->sighand->siglock); + /* SEIZE doesn't trap tracee on attach */ + if (!seize) + send_signal_locked(SIGSTOP, SEND_SIG_PRIV, task, PIDTYPE_PID); /* * If the task is already STOPPED, set JOBCTL_TRAP_STOP and * TRAPPING, and kick it so that it transits to TRACED. TRAPPING @@ -457,14 +460,8 @@ static int ptrace_attach(struct task_struct *task, long request, return -EPERM; task->ptrace = flags; - ptrace_link(task, current); - - /* SEIZE doesn't trap tracee on attach */ - if (!seize) - send_sig_info(SIGSTOP, SEND_SIG_PRIV, task); - - ptrace_set_stopped(task); + ptrace_set_stopped(task, seize); } } -- cgit v1.2.3-70-g09d2 From 2e3fc6ca521499a985a1829a21759ac25359c0f3 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 2 Feb 2024 21:20:42 +0800 Subject: panic: add option to dump blocked tasks in panic_print For debugging kernel panics and other bugs, there is already an option of panic_print to dump all tasks' call stacks. On today's large servers running many containers, there could be thousands of tasks or more, and this will print out huge amount of call stacks, taking a lot of time (for serial console which is main target user case of panic_print). And in many cases, only those several tasks being blocked are key for the panic, so add an option to only dump blocked tasks' call stacks. [akpm@linux-foundation.org: clarify documentation a little] Link: https://lkml.kernel.org/r/20240202132042.3609657-1-feng.tang@intel.com Signed-off-by: Feng Tang Tested-by: Guilherme G. Piccoli Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Peter Zijlstra (Intel) Cc: Randy Dunlap Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 1 + Documentation/admin-guide/sysctl/kernel.rst | 1 + kernel/panic.c | 4 ++++ 3 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 31b3a25680d0..800b4b5dcbb4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4182,6 +4182,7 @@ bit 4: print ftrace buffer bit 5: print all printk messages in buffer bit 6: print all CPUs backtrace (if available in the arch) + bit 7: print only tasks in uninterruptible (blocked) state *Be aware* that this option may print a _lot_ of lines, so there are risks of losing older messages in the log. Use this option carefully, maybe worth to setup a diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index bc578663619d..a9b71190399d 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -853,6 +853,7 @@ bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer bit 5 print all printk messages in buffer bit 6 print all CPUs backtrace (if available in the arch) +bit 7 print only tasks in uninterruptible (blocked) state ===== ============================================ So for example to print tasks and memory info on panic, user can:: diff --git a/kernel/panic.c b/kernel/panic.c index d49b68184c56..d4cb86d97f8f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -73,6 +73,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_FTRACE_INFO 0x00000010 #define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 #define PANIC_PRINT_ALL_CPU_BT 0x00000040 +#define PANIC_PRINT_BLOCKED_TASKS 0x00000080 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -227,6 +228,9 @@ static void panic_print_sys_info(bool console_flush) if (panic_print & PANIC_PRINT_FTRACE_INFO) ftrace_dump(DUMP_ALL); + + if (panic_print & PANIC_PRINT_BLOCKED_TASKS) + show_state_filter(TASK_UNINTERRUPTIBLE); } void check_panic_on_warn(const char *origin) -- cgit v1.2.3-70-g09d2 From 49fd5f5ac4b59dcd53b5788d56d4ae7a8a1e1434 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Feb 2024 17:56:47 +0100 Subject: get_signal: don't abuse ksig->info.si_signo and ksig->sig Patch series "get_signal: minor cleanups and fix". Lets remove this clear_siginfo() right now. It is incomplete (and thus looks confusing) and unnecessary. Also, PF_USER_WORKER's already don't get a fully initialized ksig anyway. This patch (of 3): Cleanup and preparation for the next changes. get_signal() uses signr or ksig->info.si_signo or ksig->sig in a chaotic way, this looks confusing. Change it to always use signr. Link: https://lkml.kernel.org/r/20240226165612.GA20787@redhat.com Link: https://lkml.kernel.org/r/20240226165647.GA20826@redhat.com Signed-off-by: Oleg Nesterov Cc: Christian Brauner Cc: Eric W. Biederman Cc: Peter Collingbourne Cc: Wen Yang Signed-off-by: Andrew Morton --- kernel/signal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c9c57d053ce4..09a6dd07cf6b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2842,7 +2842,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(ksig->info.si_signo))) { + if (likely(do_signal_stop(signr))) { /* It released the siglock. */ goto relock; } @@ -2866,7 +2866,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(ksig->info.si_signo); + print_fatal_signal(signr); proc_coredump_connector(current); /* * If it was able to dump core, this kills all @@ -2890,7 +2890,7 @@ relock: /* * Death signals, no core dump. */ - do_group_exit(ksig->info.si_signo); + do_group_exit(signr); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); @@ -2900,7 +2900,7 @@ out: if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) hide_si_addr_tag_bits(ksig); - return ksig->sig > 0; + return signr > 0; } /** -- cgit v1.2.3-70-g09d2 From dd69edd643a8263f9a96d0a3a82d8d50d9df9b48 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Feb 2024 17:56:50 +0100 Subject: get_signal: hide_si_addr_tag_bits: fix the usage of uninitialized ksig ksig->ka and ksig->info are not initialized if get_signal() returns 0 or if the caller is PF_USER_WORKER. Check signr != 0 before SA_EXPOSE_TAGBITS and move the "out" label down. The latter means that ksig->sig won't be initialized if a PF_USER_WORKER thread gets a fatal signal but this is fine, PF_USER_WORKER's don't use ksig. And there is nothing new, in this case ksig->ka and ksig-info are not initialized anyway. Add a comment. Link: https://lkml.kernel.org/r/20240226165650.GA20829@redhat.com Signed-off-by: Oleg Nesterov Cc: Christian Brauner Cc: Eric W. Biederman Cc: Peter Collingbourne Cc: Wen Yang Signed-off-by: Andrew Morton --- kernel/signal.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 09a6dd07cf6b..a69d3069067a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2881,8 +2881,9 @@ relock: /* * PF_USER_WORKER threads will catch and exit on fatal signals - * themselves. They have cleanup that must be performed, so - * we cannot call do_exit() on their behalf. + * themselves. They have cleanup that must be performed, so we + * cannot call do_exit() on their behalf. Note that ksig won't + * be properly initialized, PF_USER_WORKER's shouldn't use it. */ if (current->flags & PF_USER_WORKER) goto out; @@ -2894,12 +2895,12 @@ relock: /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); -out: + ksig->sig = signr; - if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) + if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) hide_si_addr_tag_bits(ksig); - +out: return signr > 0; } -- cgit v1.2.3-70-g09d2 From a436184e3bfb14b3c38e6ed0c2e7f6d810312c4f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Feb 2024 17:56:53 +0100 Subject: get_signal: don't initialize ksig->info if SIGNAL_GROUP_EXIT/group_exec_task This initialization is incomplete and unnecessary, neither do_group_exit() nor PF_USER_WORKER need ksig->info. Link: https://lkml.kernel.org/r/20240226165653.GA20834@redhat.com Signed-off-by: Oleg Nesterov Cc: Christian Brauner Cc: Eric W. Biederman Cc: Peter Collingbourne Cc: Wen Yang Signed-off-by: Andrew Morton --- kernel/signal.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a69d3069067a..9c6a5ccac328 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2727,12 +2727,15 @@ relock: /* Has this task already been marked for death? */ if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->group_exec_task) { - clear_siginfo(&ksig->info); - ksig->info.si_signo = signr = SIGKILL; + signr = SIGKILL; sigdelset(¤t->pending.signal, SIGKILL); trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, - &sighand->action[SIGKILL - 1]); + &sighand->action[SIGKILL-1]); recalc_sigpending(); + /* + * implies do_group_exit() or return to PF_USER_WORKER, + * no need to initialize ksig->info/etc. + */ goto fatal; } -- cgit v1.2.3-70-g09d2 From 4bb7be96fc8871f37ff705e02443a0526fb4df8d Mon Sep 17 00:00:00 2001 From: "yang.zhang" Date: Thu, 22 Feb 2024 17:21:19 +0800 Subject: kexec: copy only happens before uchunk goes to zero When loading segments, ubytes is <= mbytes. When ubytes is exhausted, there could be remaining mbytes. Then in the while loop, the buf pointer advancing with mchunk will causing meaningless reading even though it doesn't harm. So let's change to make sure that all of the copying and the rest only happens before uchunk goes to zero. Link: https://lkml.kernel.org/r/20240222092119.5602-1-gaoshanliukou@163.com Signed-off-by: yang.zhang Acked-by: Baoquan He Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d08fc7b5db97..2fc3d0e3715a 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -800,22 +800,24 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } kunmap_local(ptr); if (result) { result = -EFAULT; goto out; } - ubytes -= uchunk; maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; mbytes -= mchunk; cond_resched(); @@ -866,11 +868,18 @@ static int kimage_load_crash_segment(struct kimage *image, memset(ptr + uchunk, 0, mchunk - uchunk); } - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } kexec_flush_icache_page(page); kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); @@ -878,12 +887,7 @@ static int kimage_load_crash_segment(struct kimage *image, result = -EFAULT; goto out; } - ubytes -= uchunk; maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; mbytes -= mchunk; cond_resched(); -- cgit v1.2.3-70-g09d2 From 75060b6ead0e93b7b43a451ba1e13c49b1aa2025 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 6 Mar 2024 07:49:16 +0100 Subject: watchdog/core: remove sysctl handlers from public header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions are only used in the file where they are defined. Remove them from the header and make them static. Also guard proc_soft_watchdog with a #define-guard as it is not used otherwise. Link: https://lkml.kernel.org/r/20240306-const-sysctl-prep-watchdog-v1-1-bd45da3a41cf@weissschuh.net Signed-off-by: Thomas Weißschuh Signed-off-by: Andrew Morton --- include/linux/nmi.h | 7 ------- kernel/watchdog.c | 22 ++++++++++++---------- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index e92e378df000..f53438eae815 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -216,13 +216,6 @@ void watchdog_update_hrtimer_threshold(u64 period); static inline void watchdog_update_hrtimer_threshold(u64 period) { } #endif -struct ctl_table; -int proc_watchdog(struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_nmi_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); -int proc_soft_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); -int proc_watchdog_thresh(struct ctl_table *, int , void *, size_t *, loff_t *); -int proc_watchdog_cpumask(struct ctl_table *, int, void *, size_t *, loff_t *); - #ifdef CONFIG_HAVE_ACPI_APEI_NMI #include #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 81a8862295d6..d7b2125503af 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -796,8 +796,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, /* * /proc/sys/kernel/watchdog */ -int proc_watchdog(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_watchdog(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED | WATCHDOG_SOFTOCKUP_ENABLED, @@ -807,8 +807,8 @@ int proc_watchdog(struct ctl_table *table, int write, /* * /proc/sys/kernel/nmi_watchdog */ -int proc_nmi_watchdog(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_nmi_watchdog(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { if (!watchdog_hardlockup_available && write) return -ENOTSUPP; @@ -816,21 +816,23 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, table, write, buffer, lenp, ppos); } +#ifdef CONFIG_SOFTLOCKUP_DETECTOR /* * /proc/sys/kernel/soft_watchdog */ -int proc_soft_watchdog(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_soft_watchdog(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } +#endif /* * /proc/sys/kernel/watchdog_thresh */ -int proc_watchdog_thresh(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_watchdog_thresh(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -852,8 +854,8 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * user to specify a mask that will include cpus that have not yet * been brought online, if desired. */ -int proc_watchdog_cpumask(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_watchdog_cpumask(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int err; -- cgit v1.2.3-70-g09d2