From d741bf41d7c7db4898bacfcb020353cddc032fd8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:24 +0900 Subject: kprobes: Remove kretprobe hash The kretprobe hash is mostly superfluous, replace it with a per-task variable. This gets rid of the task hash and it's related locking. Note that this may change the kprobes module-exported API for kretprobe handlers. If any out-of-tree kretprobe user uses ri->rp, use get_kretprobe(ri) instead. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870620431.1229682.16325792502413731312.stgit@devnote2 --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 49677d668de4..53a1f508a097 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2161,6 +2161,10 @@ static __latent_entropy struct task_struct *copy_process( INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; +#ifdef CONFIG_KRETPROBES + p->kretprobe_instances.first = NULL; +#endif + /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed -- cgit v1.2.3-70-g09d2 From b4e00444cab4c3f3fec876dc0cccc8cbb0d1a948 Mon Sep 17 00:00:00 2001 From: Eddy Wu Date: Sat, 7 Nov 2020 14:47:22 +0800 Subject: fork: fix copy_process(CLONE_PARENT) race with the exiting ->real_parent current->group_leader->exit_signal may change during copy_process() if current->real_parent exits. Move the assignment inside tasklist_lock to avoid the race. Signed-off-by: Eddy Wu Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..6d266388d380 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2167,14 +2167,9 @@ static __latent_entropy struct task_struct *copy_process( /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { - p->exit_signal = -1; p->group_leader = current->group_leader; p->tgid = current->tgid; } else { - if (clone_flags & CLONE_PARENT) - p->exit_signal = current->group_leader->exit_signal; - else - p->exit_signal = args->exit_signal; p->group_leader = p; p->tgid = p->pid; } @@ -2218,9 +2213,14 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; + if (clone_flags & CLONE_THREAD) + p->exit_signal = -1; + else + p->exit_signal = current->group_leader->exit_signal; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; + p->exit_signal = args->exit_signal; } klp_copy_process(p); -- cgit v1.2.3-70-g09d2 From 23d67a54857a768acdb0804cdd6037c324a50ecd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:00 -0500 Subject: seccomp: Migrate to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SECCOMP, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-5-krisman@collabora.com --- include/asm-generic/syscall.h | 2 +- include/linux/entry-common.h | 8 ++------ include/linux/seccomp.h | 2 +- include/linux/thread_info.h | 6 ++++++ kernel/entry/common.c | 2 +- kernel/fork.c | 2 +- kernel/seccomp.c | 6 +++--- 7 files changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index f3135e734387..524d8e68ff5e 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -135,7 +135,7 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %TIF_SECCOMP. + * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 3fe8f868f15e..fa3cdb102dbf 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -21,10 +21,6 @@ # define _TIF_SYSCALL_TRACEPOINT (0) #endif -#ifndef _TIF_SECCOMP -# define _TIF_SECCOMP (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -49,7 +45,7 @@ #endif #define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) @@ -64,7 +60,7 @@ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK) -#define SYSCALL_WORK_ENTER (0) +#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP) #define SYSCALL_WORK_EXIT (0) /* diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 02aef2844c38..47763f3999f7 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -42,7 +42,7 @@ struct seccomp { extern int __secure_computing(const struct seccomp_data *sd); static inline int secure_computing(void) { - if (unlikely(test_thread_flag(TIF_SECCOMP))) + if (unlikely(test_syscall_work(SECCOMP))) return __secure_computing(NULL); return 0; } diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 0e9fb15d6b42..a308ba4ef07b 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -35,6 +35,12 @@ enum { GOOD_STACK, }; +enum syscall_work_bit { + SYSCALL_WORK_BIT_SECCOMP, +}; + +#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) + #include #ifdef __KERNEL__ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e7a11e38daba..5747a6eb2c48 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -54,7 +54,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, } /* Do seccomp after ptrace, to catch any tracer changes. */ - if (ti_work & _TIF_SECCOMP) { + if (work & SYSCALL_WORK_SECCOMP) { ret = __secure_computing(NULL); if (ret == -1L) return ret; diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..bc5b1090f415 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1625,7 +1625,7 @@ static void copy_seccomp(struct task_struct *p) * to manually enable the seccomp thread flag here. */ if (p->seccomp.mode != SECCOMP_MODE_DISABLED) - set_tsk_thread_flag(p, TIF_SECCOMP); + set_task_syscall_work(p, SECCOMP); #endif } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 8ad7a293255a..f67e92d11ad7 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -356,14 +356,14 @@ static inline void seccomp_assign_mode(struct task_struct *task, task->seccomp.mode = seccomp_mode; /* - * Make sure TIF_SECCOMP cannot be set before the mode (and + * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and * filter) is set. */ smp_mb__before_atomic(); /* Assume default seccomp processes want spec flaw mitigation. */ if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0) arch_seccomp_spec_mitigate(task); - set_tsk_thread_flag(task, TIF_SECCOMP); + set_task_syscall_work(task, SECCOMP); } #ifdef CONFIG_SECCOMP_FILTER @@ -929,7 +929,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* * Make sure that any changes to mode from another thread have - * been seen after TIF_SECCOMP was seen. + * been seen after SYSCALL_WORK_SECCOMP was seen. */ rmb(); -- cgit v1.2.3-70-g09d2 From 64c19ba29b66e98af9306b4a7525fb22c895d252 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:02 -0500 Subject: ptrace: Migrate to use SYSCALL_TRACE flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_TRACE, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-7-krisman@collabora.com --- include/asm-generic/syscall.h | 15 ++++++++------- include/linux/entry-common.h | 10 ++++++---- include/linux/thread_info.h | 2 ++ include/linux/tracehook.h | 17 +++++++++-------- kernel/entry/common.c | 4 ++-- kernel/fork.c | 2 +- kernel/ptrace.c | 6 +++--- 7 files changed, 31 insertions(+), 25 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 524d8e68ff5e..ed94e5658d0c 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -43,7 +43,7 @@ int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); * @regs: task_pt_regs() of @task * * It's only valid to call this when @task is stopped for system - * call exit tracing (due to TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), + * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), * after tracehook_report_syscall_entry() returned nonzero to prevent * the system call from taking place. * @@ -63,7 +63,7 @@ void syscall_rollback(struct task_struct *task, struct pt_regs *regs); * Returns 0 if the system call succeeded, or -ERRORCODE if it failed. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); @@ -76,7 +76,7 @@ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); * This value is meaningless if syscall_get_error() returned nonzero. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); @@ -93,7 +93,7 @@ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); * code; the user sees a failed system call with this errno code. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val); @@ -108,7 +108,7 @@ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, * @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args); @@ -123,7 +123,7 @@ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, * The first argument gets value @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, const unsigned long *args); @@ -135,7 +135,8 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %SYSCALL_WORK_SECCOMP. + * call, due to %SYSCALL_WORK_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or + * %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 2a01eee2dbba..ae426ab9c372 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -41,7 +41,7 @@ #endif #define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + (_TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) @@ -53,12 +53,14 @@ #endif #define SYSCALL_EXIT_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + (_TIF_SYSCALL_AUDIT | \ ARCH_SYSCALL_EXIT_WORK) #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ - SYSCALL_WORK_SYSCALL_TRACEPOINT) -#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT) + SYSCALL_WORK_SYSCALL_TRACEPOINT | \ + SYSCALL_WORK_SYSCALL_TRACE) +#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ + SYSCALL_WORK_SYSCALL_TRACE) /* * TIF flags handled in exit_to_user_mode_loop() diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index c232043c12d3..761a4590d554 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -38,10 +38,12 @@ enum { enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, + SYSCALL_WORK_BIT_SYSCALL_TRACE, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) +#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #include diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index f7d82e4fafd6..3f20368afe9e 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -83,11 +83,12 @@ static inline int ptrace_report_syscall(struct pt_regs *regs, * tracehook_report_syscall_entry - task is about to attempt a system call * @regs: user register state of current task * - * This will be called if %TIF_SYSCALL_TRACE or %TIF_SYSCALL_EMU have been set, - * when the current task has just entered the kernel for a system call. - * Full user register state is available here. Changing the values - * in @regs can affect the system call number and arguments to be tried. - * It is safe to block here, preventing the system call from beginning. + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or + * %TIF_SYSCALL_EMU have been set, when the current task has just + * entered the kernel for a system call. Full user register state is + * available here. Changing the values in @regs can affect the system + * call number and arguments to be tried. It is safe to block here, + * preventing the system call from beginning. * * Returns zero normally, or nonzero if the calling arch code should abort * the system call. That must prevent normal entry so no system call is @@ -109,15 +110,15 @@ static inline __must_check int tracehook_report_syscall_entry( * @regs: user register state of current task * @step: nonzero if simulating single-step or block-step * - * This will be called if %TIF_SYSCALL_TRACE has been set, when the - * current task has just finished an attempted system call. Full + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when + * the current task has just finished an attempted system call. Full * user register state is available here. It is safe to block here, * preventing signals from being processed. * * If @step is nonzero, this report is also in lieu of the normal * trap that would follow the system call instruction because * user_enable_block_step() or user_enable_single_step() was used. - * In this case, %TIF_SYSCALL_TRACE might not be set. + * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set. * * Called without locks, just before checking for pending signals. */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f651967847ec..917328a9edaa 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,7 +47,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, long ret = 0; /* Handle ptrace */ - if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + if (work & SYSCALL_WORK_SYSCALL_TRACE || ti_work & _TIF_SYSCALL_EMU) { ret = arch_syscall_enter_tracehook(regs); if (ret || (ti_work & _TIF_SYSCALL_EMU)) return -1L; @@ -237,7 +237,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, trace_sys_exit(regs, syscall_get_return_value(current, regs)); step = report_single_step(ti_work); - if (step || ti_work & _TIF_SYSCALL_TRACE) + if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); } diff --git a/kernel/fork.c b/kernel/fork.c index bc5b1090f415..99f68c20f2ff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2158,7 +2158,7 @@ static __latent_entropy struct task_struct *copy_process( * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); - clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); + clear_task_syscall_work(p, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43d6179508d6..55a2bc3186a7 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -117,7 +117,7 @@ void __ptrace_unlink(struct task_struct *child) const struct cred *old_cred; BUG_ON(!child->ptrace); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_task_syscall_work(child, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif @@ -812,9 +812,9 @@ static int ptrace_resume(struct task_struct *child, long request, return -EIO; if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + set_task_syscall_work(child, SYSCALL_TRACE); else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_task_syscall_work(child, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) -- cgit v1.2.3-70-g09d2 From 64eb35f701f04b30706e21d1b02636b5d31a37d2 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:03 -0500 Subject: ptrace: Migrate TIF_SYSCALL_EMU to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_EMU, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-8-krisman@collabora.com --- include/linux/entry-common.h | 8 ++------ include/linux/thread_info.h | 2 ++ include/linux/tracehook.h | 2 +- kernel/entry/common.c | 19 ++++++++++--------- kernel/fork.c | 4 ++-- kernel/ptrace.c | 10 +++++----- 6 files changed, 22 insertions(+), 23 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index ae426ab9c372..b30f82bed92b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -13,10 +13,6 @@ * Define dummy _TIF work flags if not defined by the architecture or for * disabled functionality. */ -#ifndef _TIF_SYSCALL_EMU -# define _TIF_SYSCALL_EMU (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -42,7 +38,6 @@ #define SYSCALL_ENTER_WORK \ (_TIF_SYSCALL_AUDIT | \ - _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) /* @@ -58,7 +53,8 @@ #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ - SYSCALL_WORK_SYSCALL_TRACE) + SYSCALL_WORK_SYSCALL_TRACE | \ + SYSCALL_WORK_SYSCALL_EMU) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 761a4590d554..85b8a4216168 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -39,11 +39,13 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, + SYSCALL_WORK_BIT_SYSCALL_EMU, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) +#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #include diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 3f20368afe9e..54b925224a13 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -84,7 +84,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs, * @regs: user register state of current task * * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or - * %TIF_SYSCALL_EMU have been set, when the current task has just + * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just * entered the kernel for a system call. Full user register state is * available here. Changing the values in @regs can affect the system * call number and arguments to be tried. It is safe to block here, diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 917328a9edaa..90533f34ea99 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,9 +47,9 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, long ret = 0; /* Handle ptrace */ - if (work & SYSCALL_WORK_SYSCALL_TRACE || ti_work & _TIF_SYSCALL_EMU) { + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { ret = arch_syscall_enter_tracehook(regs); - if (ret || (ti_work & _TIF_SYSCALL_EMU)) + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) return -1L; } @@ -208,21 +208,22 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) } #ifndef _TIF_SINGLESTEP -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { return false; } #else /* - * If TIF_SYSCALL_EMU is set, then the only reason to report is when + * If SYSCALL_EMU is set, then the only reason to report is when * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall * instruction has been already reported in syscall_enter_from_user_mode(). */ -#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) - -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { - return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP; + if (!(work & SYSCALL_WORK_SYSCALL_EMU)) + return false; + + return !!(current_thread_info()->flags & _TIF_SINGLESTEP); } #endif @@ -236,7 +237,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) trace_sys_exit(regs, syscall_get_return_value(current, regs)); - step = report_single_step(ti_work); + step = report_single_step(work); if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); } diff --git a/kernel/fork.c b/kernel/fork.c index 99f68c20f2ff..02b689a23457 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2159,8 +2159,8 @@ static __latent_entropy struct task_struct *copy_process( */ user_disable_single_step(p); clear_task_syscall_work(p, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(p, SYSCALL_EMU); #endif clear_tsk_latency_tracing(p); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 55a2bc3186a7..237bcd6d255c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -118,8 +118,8 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); clear_task_syscall_work(child, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(child, SYSCALL_EMU); #endif child->parent = child->real_parent; @@ -816,11 +816,11 @@ static int ptrace_resume(struct task_struct *child, long request, else clear_task_syscall_work(child, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + set_task_syscall_work(child, SYSCALL_EMU); else - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_task_syscall_work(child, SYSCALL_EMU); #endif if (is_singleblock(request)) { -- cgit v1.2.3-70-g09d2 From 5fbda3ecd14a5343644979c98d6eb65b7e7de9d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:43 +0100 Subject: sched: highmem: Store local kmaps in task struct Instead of storing the map per CPU provide and use per task storage. That prepares for local kmaps which are preemptible. The context switch code is preparatory and not yet in use because kmap_atomic() runs with preemption disabled. Will be made usable in the next step. The context switch logic is safe even when an interrupt happens after clearing or before restoring the kmaps. The kmap index in task struct is not modified so any nesting kmap in an interrupt will use unused indices and on return the counter is the same as before. Also add an assert into the return to user space code. Going back to user space with an active kmap local is a nono. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.372935758@linutronix.de --- include/linux/highmem-internal.h | 10 ++++ include/linux/sched.h | 9 ++++ kernel/entry/common.c | 2 + kernel/fork.c | 1 + kernel/sched/core.c | 25 ++++++++++ mm/highmem.c | 99 ++++++++++++++++++++++++++++++++++++---- 6 files changed, 136 insertions(+), 10 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 6ceed907b14e..c5a22177db85 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -9,6 +9,16 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot); void kunmap_local_indexed(void *vaddr); +void kmap_local_fork(struct task_struct *tsk); +void __kmap_local_sched_out(void); +void __kmap_local_sched_in(void); +static inline void kmap_assert_nomap(void) +{ + DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); +} +#else +static inline void kmap_local_fork(struct task_struct *tsk) { } +static inline void kmap_assert_nomap(void) { } #endif #ifdef CONFIG_HIGHMEM diff --git a/include/linux/sched.h b/include/linux/sched.h index a33f35f68060..8946911cee9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include #include #include +#include /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -629,6 +630,13 @@ struct wake_q_node { struct wake_q_node *next; }; +struct kmap_ctrl { +#ifdef CONFIG_KMAP_LOCAL + int idx; + pte_t pteval[KM_MAX_IDX]; +#endif +}; + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -1294,6 +1302,7 @@ struct task_struct { unsigned int sequential_io; unsigned int sequential_io_avg; #endif + struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 2b8366693d5c..4ae1fe0898e9 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -194,6 +195,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) /* Ensure that the address limit is intact and no locks are held */ addr_limit_user_check(); + kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); } diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..17dcd1817799 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -930,6 +930,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) account_kernel_stack(tsk, 1); kcov_task_init(tsk); + kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c962922784d1..953abdbe1472 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4094,6 +4094,22 @@ static inline void finish_lock_switch(struct rq *rq) # define finish_arch_post_lock_switch() do { } while (0) #endif +static inline void kmap_local_sched_out(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_out(); +#endif +} + +static inline void kmap_local_sched_in(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_in(); +#endif +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -4116,6 +4132,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, perf_event_task_sched_out(prev, next); rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); + kmap_local_sched_out(); prepare_task(next); prepare_arch_switch(next); } @@ -4182,6 +4199,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) finish_lock_switch(rq); finish_arch_post_lock_switch(); kcov_finish_switch(current); + /* + * kmap_local_sched_out() is invoked with rq::lock held and + * interrupts disabled. There is no requirement for that, but the + * sched out code does not have an interrupt enabled section. + * Restoring the maps on sched in does not require interrupts being + * disabled either. + */ + kmap_local_sched_in(); fire_sched_in_preempt_notifiers(current); /* diff --git a/mm/highmem.c b/mm/highmem.c index 39aaca1a1ece..d1ef06aa6de6 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -365,8 +365,6 @@ EXPORT_SYMBOL(kunmap_high); #include -static DEFINE_PER_CPU(int, __kmap_local_idx); - /* * With DEBUG_KMAP_LOCAL the stack depth is doubled and every second * slot is unused which acts as a guard page @@ -379,23 +377,21 @@ static DEFINE_PER_CPU(int, __kmap_local_idx); static inline int kmap_local_idx_push(void) { - int idx = __this_cpu_add_return(__kmap_local_idx, KM_INCR) - 1; - WARN_ON_ONCE(in_irq() && !irqs_disabled()); - BUG_ON(idx >= KM_MAX_IDX); - return idx; + current->kmap_ctrl.idx += KM_INCR; + BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); + return current->kmap_ctrl.idx - 1; } static inline int kmap_local_idx(void) { - return __this_cpu_read(__kmap_local_idx) - 1; + return current->kmap_ctrl.idx - 1; } static inline void kmap_local_idx_pop(void) { - int idx = __this_cpu_sub_return(__kmap_local_idx, KM_INCR); - - BUG_ON(idx < 0); + current->kmap_ctrl.idx -= KM_INCR; + BUG_ON(current->kmap_ctrl.idx < 0); } #ifndef arch_kmap_local_post_map @@ -464,6 +460,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) pteval = pfn_pte(pfn, prot); set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); arch_kmap_local_post_map(vaddr, pteval); + current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; preempt_enable(); return (void *)vaddr; @@ -522,10 +519,92 @@ void kunmap_local_indexed(void *vaddr) arch_kmap_local_pre_unmap(addr); pte_clear(&init_mm, addr, kmap_pte - idx); arch_kmap_local_post_unmap(addr); + current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); kmap_local_idx_pop(); preempt_enable(); } EXPORT_SYMBOL(kunmap_local_indexed); + +/* + * Invoked before switch_to(). This is safe even when during or after + * clearing the maps an interrupt which needs a kmap_local happens because + * the task::kmap_ctrl.idx is not modified by the unmapping code so a + * nested kmap_local will use the next unused index and restore the index + * on unmap. The already cleared kmaps of the outgoing task are irrelevant + * because the interrupt context does not know about them. The same applies + * when scheduling back in for an interrupt which happens before the + * restore is complete. + */ +void __kmap_local_sched_out(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Clear kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* + * This is a horrible hack for XTENSA to calculate the + * coloured PTE index. Uses the PFN encoded into the pteval + * and the map index calculation because the actual mapped + * virtual address is not stored in task::kmap_ctrl. + * For any sane architecture this is optimized out. + */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte - idx); + arch_kmap_local_post_unmap(addr); + } +} + +void __kmap_local_sched_in(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Restore kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* See comment in __kmap_local_sched_out() */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte_at(&init_mm, addr, kmap_pte - idx, pteval); + arch_kmap_local_post_map(addr, pteval); + } +} + +void kmap_local_fork(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->kmap_ctrl.idx)) + memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl)); +} + #endif #if defined(HASHED_PAGE_VIRTUAL) -- cgit v1.2.3-70-g09d2 From 1446e1df9eb183fdf81c3f0715402f1d7595d4cb Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:34 -0500 Subject: kernel: Implement selective syscall userspace redirection Introduce a mechanism to quickly disable/enable syscall handling for a specific process and redirect to userspace via SIGSYS. This is useful for processes with parts that require syscall redirection and parts that don't, but who need to perform this boundary crossing really fast, without paying the cost of a system call to reconfigure syscall handling on each boundary transition. This is particularly important for Windows games running over Wine. The proposed interface looks like this: prctl(PR_SET_SYSCALL_USER_DISPATCH, , , , [selector]) The range [,+) is a part of the process memory map that is allowed to by-pass the redirection code and dispatch syscalls directly, such that in fast paths a process doesn't need to disable the trap nor the kernel has to check the selector. This is essential to return from SIGSYS to a blocked area without triggering another SIGSYS from rt_sigreturn. selector is an optional pointer to a char-sized userspace memory region that has a key switch for the mechanism. This key switch is set to either PR_SYS_DISPATCH_ON, PR_SYS_DISPATCH_OFF to enable and disable the redirection without calling the kernel. The feature is meant to be set per-thread and it is disabled on fork/clone/execv. Internally, this doesn't add overhead to the syscall hot path, and it requires very little per-architecture support. I avoided using seccomp, even though it duplicates some functionality, due to previous feedback that maybe it shouldn't mix with seccomp since it is not a security mechanism. And obviously, this should never be considered a security mechanism, since any part of the program can by-pass it by using the syscall dispatcher. For the sysinfo benchmark, which measures the overhead added to executing a native syscall that doesn't require interception, the overhead using only the direct dispatcher region to issue syscalls is pretty much irrelevant. The overhead of using the selector goes around 40ns for a native (unredirected) syscall in my system, and it is (as expected) dominated by the supervisor-mode user-address access. In fact, with SMAP off, the overhead is consistently less than 5ns on my test box. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Acked-by: Kees Cook Link: https://lore.kernel.org/r/20201127193238.821364-4-krisman@collabora.com --- fs/exec.c | 3 + include/linux/sched.h | 2 + include/linux/syscall_user_dispatch.h | 40 +++++++++++++ include/linux/thread_info.h | 2 + include/uapi/linux/prctl.h | 5 ++ kernel/entry/Makefile | 2 +- kernel/entry/common.h | 7 +++ kernel/entry/syscall_user_dispatch.c | 104 ++++++++++++++++++++++++++++++++++ kernel/fork.c | 1 + kernel/sys.c | 5 ++ 10 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 include/linux/syscall_user_dispatch.h create mode 100644 kernel/entry/common.h create mode 100644 kernel/entry/syscall_user_dispatch.c (limited to 'kernel/fork.c') diff --git a/fs/exec.c b/fs/exec.c index 547a2390baf5..aee36e5733ce 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -1302,6 +1303,8 @@ int begin_new_exec(struct linux_binprm * bprm) flush_thread(); me->personality &= ~bprm->per_clear; + clear_syscall_work_syscall_user_dispatch(me); + /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..5a24a033b3f8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -965,6 +966,7 @@ struct task_struct { unsigned int sessionid; #endif struct seccomp seccomp; + struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h new file mode 100644 index 000000000000..a0ae443fb7df --- /dev/null +++ b/include/linux/syscall_user_dispatch.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Collabora Ltd. + */ +#ifndef _SYSCALL_USER_DISPATCH_H +#define _SYSCALL_USER_DISPATCH_H + +#include + +#ifdef CONFIG_GENERIC_ENTRY + +struct syscall_user_dispatch { + char __user *selector; + unsigned long offset; + unsigned long len; + bool on_dispatch; +}; + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector); + +#define clear_syscall_work_syscall_user_dispatch(tsk) \ + clear_task_syscall_work(tsk, SYSCALL_USER_DISPATCH) + +#else +struct syscall_user_dispatch {}; + +static inline int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + return -EINVAL; +} + +static inline void clear_syscall_work_syscall_user_dispatch(struct task_struct *tsk) +{ +} + +#endif /* CONFIG_GENERIC_ENTRY */ + +#endif /* _SYSCALL_USER_DISPATCH_H */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ca80a214df09..c8a974cead73 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -42,6 +42,7 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, SYSCALL_WORK_BIT_SYSCALL_AUDIT, + SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) @@ -49,6 +50,7 @@ enum syscall_work_bit { #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) +#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) #endif #include diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 7f0827705c9a..90deb41c8a34 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -247,4 +247,9 @@ struct prctl_mm_map { #define PR_SET_IO_FLUSHER 57 #define PR_GET_IO_FLUSHER 58 +/* Dispatch syscalls to a userspace handler */ +#define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 34c8a3f1c735..095c775e001e 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -9,5 +9,5 @@ KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/common.h b/kernel/entry/common.h new file mode 100644 index 000000000000..f6e6d02f07fe --- /dev/null +++ b/kernel/entry/common.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _COMMON_H +#define _COMMON_H + +bool syscall_user_dispatch(struct pt_regs *regs); + +#endif diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c new file mode 100644 index 000000000000..b0338a5625d9 --- /dev/null +++ b/kernel/entry/syscall_user_dispatch.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Collabora Ltd. + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "common.h" + +static void trigger_sigsys(struct pt_regs *regs) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSYS; + info.si_code = SYS_USER_DISPATCH; + info.si_call_addr = (void __user *)KSTK_EIP(current); + info.si_errno = 0; + info.si_arch = syscall_get_arch(current); + info.si_syscall = syscall_get_nr(current, regs); + + force_sig_info(&info); +} + +bool syscall_user_dispatch(struct pt_regs *regs) +{ + struct syscall_user_dispatch *sd = ¤t->syscall_dispatch; + char state; + + if (likely(instruction_pointer(regs) - sd->offset < sd->len)) + return false; + + if (unlikely(arch_syscall_is_vdso_sigreturn(regs))) + return false; + + if (likely(sd->selector)) { + /* + * access_ok() is performed once, at prctl time, when + * the selector is loaded by userspace. + */ + if (unlikely(__get_user(state, sd->selector))) + do_exit(SIGSEGV); + + if (likely(state == PR_SYS_DISPATCH_OFF)) + return false; + + if (state != PR_SYS_DISPATCH_ON) + do_exit(SIGSYS); + } + + sd->on_dispatch = true; + syscall_rollback(current, regs); + trigger_sigsys(regs); + + return true; +} + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + switch (mode) { + case PR_SYS_DISPATCH_OFF: + if (offset || len || selector) + return -EINVAL; + break; + case PR_SYS_DISPATCH_ON: + /* + * Validate the direct dispatcher region just for basic + * sanity against overflow and a 0-sized dispatcher + * region. If the user is able to submit a syscall from + * an address, that address is obviously valid. + */ + if (offset && offset + len <= offset) + return -EINVAL; + + if (selector && !access_ok(selector, sizeof(*selector))) + return -EFAULT; + + break; + default: + return -EINVAL; + } + + current->syscall_dispatch.selector = selector; + current->syscall_dispatch.offset = offset; + current->syscall_dispatch.len = len; + current->syscall_dispatch.on_dispatch = false; + + if (mode == PR_SYS_DISPATCH_ON) + set_syscall_work(SYSCALL_USER_DISPATCH); + else + clear_syscall_work(SYSCALL_USER_DISPATCH); + + return 0; +} diff --git a/kernel/fork.c b/kernel/fork.c index 02b689a23457..4a5ecb41f440 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -906,6 +906,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); + clear_syscall_work_syscall_user_dispatch(tsk); #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); diff --git a/kernel/sys.c b/kernel/sys.c index a730c03ee607..51f00fe20e4d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -2530,6 +2531,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; + case PR_SET_SYSCALL_USER_DISPATCH: + error = set_syscall_user_dispatch(arg2, arg3, arg4, + (char __user *) arg5); + break; default: error = -EINVAL; break; -- cgit v1.2.3-70-g09d2 From bcfe06bf2622f7c4899468e427683aec49070687 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:27 -0800 Subject: mm: memcontrol: Use helpers to read page's memcg data Patch series "mm: allow mapping accounted kernel pages to userspace", v6. Currently a non-slab kernel page which has been charged to a memory cgroup can't be mapped to userspace. The underlying reason is simple: PageKmemcg flag is defined as a page type (like buddy, offline, etc), so it takes a bit from a page->mapped counter. Pages with a type set can't be mapped to userspace. But in general the kmemcg flag has nothing to do with mapping to userspace. It only means that the page has been accounted by the page allocator, so it has to be properly uncharged on release. Some bpf maps are mapping the vmalloc-based memory to userspace, and their memory can't be accounted because of this implementation detail. This patchset removes this limitation by moving the PageKmemcg flag into one of the free bits of the page->mem_cgroup pointer. Also it formalizes accesses to the page->mem_cgroup and page->obj_cgroups using new helpers, adds several checks and removes a couple of obsolete functions. As the result the code became more robust with fewer open-coded bit tricks. This patch (of 4): Currently there are many open-coded reads of the page->mem_cgroup pointer, as well as a couple of read helpers, which are barely used. It creates an obstacle on a way to reuse some bits of the pointer for storing additional bits of information. In fact, we already do this for slab pages, where the last bit indicates that a pointer has an attached vector of objcg pointers instead of a regular memcg pointer. This commits uses 2 existing helpers and introduces a new helper to converts all read sides to calls of these helpers: struct mem_cgroup *page_memcg(struct page *page); struct mem_cgroup *page_memcg_rcu(struct page *page); struct mem_cgroup *page_memcg_check(struct page *page); page_memcg_check() is intended to be used in cases when the page can be a slab page and have a memcg pointer pointing at objcg vector. It does check the lowest bit, and if set, returns NULL. page_memcg() contains a VM_BUG_ON_PAGE() check for the page not being a slab page. To make sure nobody uses a direct access, struct page's mem_cgroup/obj_cgroups is converted to unsigned long memcg_data. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com --- fs/buffer.c | 2 +- fs/iomap/buffered-io.c | 2 +- include/linux/memcontrol.h | 114 +++++++++++++++++++++++++++++++++--- include/linux/mm.h | 22 ------- include/linux/mm_types.h | 5 +- include/trace/events/writeback.h | 2 +- kernel/fork.c | 7 ++- mm/debug.c | 4 +- mm/huge_memory.c | 4 +- mm/memcontrol.c | 121 ++++++++++++++++++--------------------- mm/page_alloc.c | 4 +- mm/page_io.c | 6 +- mm/slab.h | 9 ++- mm/workingset.c | 2 +- 14 files changed, 184 insertions(+), 120 deletions(-) (limited to 'kernel/fork.c') diff --git a/fs/buffer.c b/fs/buffer.c index 23f645657488..b56f99f82b5b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -657,7 +657,7 @@ int __set_page_dirty_buffers(struct page *page) } while (bh != head); } /* - * Lock out page->mem_cgroup migration to keep PageDirty + * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ lock_page_memcg(page); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 10cc7979ce38..16a1e82e3aeb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -650,7 +650,7 @@ iomap_set_page_dirty(struct page *page) return !TestSetPageDirty(page); /* - * Lock out page->mem_cgroup migration to keep PageDirty + * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ lock_page_memcg(page); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e391e3c56de5..f95c1433461c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -343,6 +343,79 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +/* + * page_memcg - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + return (struct mem_cgroup *)page->memcg_data; +} + +/* + * page_memcg_rcu - locklessly get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + */ +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + WARN_ON_ONCE(!rcu_read_lock_held()); + + return (struct mem_cgroup *)READ_ONCE(page->memcg_data); +} + +/* + * page_memcg_check - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function unlike page_memcg() can take any page + * as an argument. It has to be used in cases when it's not known if a page + * has an associated memory cgroup pointer or an object cgroups vector. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + /* + * Because page->memcg_data might be changed asynchronously + * for slab pages, READ_ONCE() should be used here. + */ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + /* + * The lowest bit set means that memcg isn't a valid + * memcg pointer, but a obj_cgroups pointer. + * In this case the page is shared and doesn't belong + * to any specific memory cgroup. + */ + if (memcg_data & 0x1UL) + return NULL; + + return (struct mem_cgroup *)memcg_data; +} + static __always_inline bool memcg_stat_item_in_bytes(int idx) { if (idx == MEMCG_PERCPU_B) @@ -743,15 +816,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, static inline void __mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - __mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + __mod_memcg_state(memcg, idx, val); } static inline void mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + mod_memcg_state(memcg, idx, val); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, @@ -834,16 +911,17 @@ static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { struct page *head = compound_head(page); /* rmap on tail pages */ + struct mem_cgroup *memcg = page_memcg(head); pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; /* Untracked pages have no memcg, no lruvec. Update only the node */ - if (!head->mem_cgroup) { + if (!memcg) { __mod_node_page_state(pgdat, idx, val); return; } - lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); + lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -878,8 +956,10 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, static inline void count_memcg_page_event(struct page *page, enum vm_event_item idx) { - if (page->mem_cgroup) - count_memcg_events(page->mem_cgroup, idx, 1); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + count_memcg_events(memcg, idx, 1); } static inline void count_memcg_event_mm(struct mm_struct *mm, @@ -941,6 +1021,22 @@ void mem_cgroup_split_huge_fixup(struct page *head); struct mem_cgroup; +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return NULL; +} + +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return NULL; +} + +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + return NULL; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; @@ -1430,7 +1526,7 @@ static inline void mem_cgroup_track_foreign_dirty(struct page *page, if (mem_cgroup_disabled()) return; - if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) + if (unlikely(&page_memcg(page)->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..6b0c9d2c1d10 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1484,28 +1484,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } -#ifdef CONFIG_MEMCG -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return page->mem_cgroup; -} -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return READ_ONCE(page->mem_cgroup); -} -#else -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return NULL; -} -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return NULL; -} -#endif - /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5a9238f6caad..80f5d755c037 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -199,10 +199,7 @@ struct page { atomic_t _refcount; #ifdef CONFIG_MEMCG - union { - struct mem_cgroup *mem_cgroup; - struct obj_cgroup **obj_cgroups; - }; + unsigned long memcg_data; #endif /* diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index e7cbccc7c14c..39a40dfb578a 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -257,7 +257,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); + __entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..cbd4f6f58409 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -404,9 +404,10 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk) for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { /* - * If memcg_kmem_charge_page() fails, page->mem_cgroup - * pointer is NULL, and memcg_kmem_uncharge_page() in - * free_thread_stack() will ignore this page. + * If memcg_kmem_charge_page() fails, page's + * memory cgroup pointer is NULL, and + * memcg_kmem_uncharge_page() in free_thread_stack() + * will ignore this page. */ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); diff --git a/mm/debug.c b/mm/debug.c index ccca576b2899..8a40b3fefbeb 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -182,8 +182,8 @@ hex_only: pr_warn("page dumped because: %s\n", reason); #ifdef CONFIG_MEMCG - if (!page_poisoned && page->mem_cgroup) - pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup); + if (!page_poisoned && page->memcg_data) + pr_warn("pages's memcg:%lx\n", page->memcg_data); #endif } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9474dbc150ed..cedfb3503411 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -470,7 +470,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) #ifdef CONFIG_MEMCG static inline struct deferred_split *get_deferred_split_queue(struct page *page) { - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); if (memcg) @@ -2765,7 +2765,7 @@ void deferred_split_huge_page(struct page *page) { struct deferred_split *ds_queue = get_deferred_split_queue(page); #ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); #endif unsigned long flags; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dcbf24d2227..3968d68503cb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -533,7 +533,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { struct mem_cgroup *memcg; - memcg = page->mem_cgroup; + memcg = page_memcg(page); if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; @@ -560,16 +560,7 @@ ino_t page_cgroup_ino(struct page *page) unsigned long ino = 0; rcu_read_lock(); - memcg = page->mem_cgroup; - - /* - * The lowest bit set means that memcg isn't a valid - * memcg pointer, but a obj_cgroups pointer. - * In this case the page is shared and doesn't belong - * to any specific memory cgroup. - */ - if ((unsigned long) memcg & 0x1UL) - memcg = NULL; + memcg = page_memcg_check(page); while (memcg && !(memcg->css.flags & CSS_ONLINE)) memcg = parent_mem_cgroup(memcg); @@ -1050,7 +1041,7 @@ EXPORT_SYMBOL(get_mem_cgroup_from_mm); */ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); if (mem_cgroup_disabled()) return NULL; @@ -1349,7 +1340,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd goto out; } - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* * Swapcache readahead pages are added to the LRU - and * possibly migrated - before they are charged. @@ -2109,7 +2100,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } /** - * lock_page_memcg - lock a page->mem_cgroup binding + * lock_page_memcg - lock a page and memcg binding * @page: the page * * This function protects unlocked LRU pages from being moved to @@ -2141,7 +2132,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return NULL; again: - memcg = head->mem_cgroup; + memcg = page_memcg(head); if (unlikely(!memcg)) return NULL; @@ -2149,7 +2140,7 @@ again: return memcg; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != head->mem_cgroup) { + if (memcg != page_memcg(head)) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2187,14 +2178,14 @@ void __unlock_page_memcg(struct mem_cgroup *memcg) } /** - * unlock_page_memcg - unlock a page->mem_cgroup binding + * unlock_page_memcg - unlock a page and memcg binding * @page: the page */ void unlock_page_memcg(struct page *page) { struct page *head = compound_head(page); - __unlock_page_memcg(head->mem_cgroup); + __unlock_page_memcg(page_memcg(head)); } EXPORT_SYMBOL(unlock_page_memcg); @@ -2884,7 +2875,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) static void commit_charge(struct page *page, struct mem_cgroup *memcg) { - VM_BUG_ON_PAGE(page->mem_cgroup, page); + VM_BUG_ON_PAGE(page_memcg(page), page); /* * Any of the following ensures page->mem_cgroup stability: * @@ -2893,7 +2884,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) * - lock_page_memcg() * - exclusive reference */ - page->mem_cgroup = memcg; + page->memcg_data = (unsigned long)memcg; } #ifdef CONFIG_MEMCG_KMEM @@ -2908,8 +2899,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, if (!vec) return -ENOMEM; - if (cmpxchg(&page->obj_cgroups, NULL, - (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) + if (cmpxchg(&page->memcg_data, 0, (unsigned long)vec | 0x1UL)) kfree(vec); else kmemleak_not_leak(vec); @@ -2920,6 +2910,12 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, /* * Returns a pointer to the memory cgroup to which the kernel object is charged. * + * A passed kernel object can be a slab object or a generic kernel page, so + * different mechanisms for getting the memory cgroup pointer should be used. + * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller + * can not know for sure how the kernel object is implemented. + * mem_cgroup_from_obj() can be safely used in such cases. + * * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), * cgroup_mutex, etc. */ @@ -2932,17 +2928,6 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) page = virt_to_head_page(p); - /* - * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer - * or a pointer to obj_cgroup vector. In the latter case the lowest - * bit of the pointer is set. - * The page->mem_cgroup pointer can be asynchronously changed - * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed - * from a valid memcg pointer to objcg vector or back. - */ - if (!page->mem_cgroup) - return NULL; - /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in @@ -2960,8 +2945,14 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) return NULL; } - /* All other pages use page->mem_cgroup */ - return page->mem_cgroup; + /* + * page_memcg_check() is used here, because page_has_obj_cgroups() + * check above could fail because the object cgroups vector wasn't set + * at that moment, but it can be set concurrently. + * page_memcg_check(page) will guarantee that a proper memory + * cgroup pointer or NULL will be returned. + */ + return page_memcg_check(page); } __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) @@ -3099,7 +3090,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) if (memcg && !mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge(memcg, gfp, 1 << order); if (!ret) { - page->mem_cgroup = memcg; + page->memcg_data = (unsigned long)memcg; __SetPageKmemcg(page); return 0; } @@ -3115,7 +3106,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); unsigned int nr_pages = 1 << order; if (!memcg) @@ -3123,7 +3114,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); __memcg_kmem_uncharge(memcg, nr_pages); - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&memcg->css); /* slab pages do not have PageKmemcg flag set */ @@ -3274,7 +3265,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) */ void mem_cgroup_split_huge_fixup(struct page *head) { - struct mem_cgroup *memcg = head->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(head); int i; if (mem_cgroup_disabled()) @@ -3282,7 +3273,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { css_get(&memcg->css); - head[i].mem_cgroup = memcg; + head[i].memcg_data = (unsigned long)memcg; } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -4664,7 +4655,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, struct bdi_writeback *wb) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); struct memcg_cgwb_frn *frn; u64 now = get_jiffies_64(); u64 oldest_at = now; @@ -5641,14 +5632,14 @@ static int mem_cgroup_move_account(struct page *page, /* * Prevent mem_cgroup_migrate() from looking at - * page->mem_cgroup of its source page while we change it. + * page's memory cgroup of its source page while we change it. */ ret = -EBUSY; if (!trylock_page(page)) goto out; ret = -EINVAL; - if (page->mem_cgroup != from) + if (page_memcg(page) != from) goto out_unlock; pgdat = page_pgdat(page); @@ -5703,13 +5694,13 @@ static int mem_cgroup_move_account(struct page *page, /* * All state has been migrated, let's switch to the new memcg. * - * It is safe to change page->mem_cgroup here because the page + * It is safe to change page's memcg here because the page * is referenced, charged, isolated, and locked: we can't race * with (un)charging, migration, LRU putback, or anything else - * that would rely on a stable page->mem_cgroup. + * that would rely on a stable page's memory cgroup. * * Note that lock_page_memcg is a memcg lock, not a page lock, - * to save space. As soon as we switch page->mem_cgroup to a + * to save space. As soon as we switch page's memory cgroup to a * new memcg that isn't locked, the above state can change * concurrently again. Make sure we're truly done with it. */ @@ -5718,7 +5709,7 @@ static int mem_cgroup_move_account(struct page *page, css_get(&to->css); css_put(&from->css); - page->mem_cgroup = to; + page->memcg_data = (unsigned long)to; __unlock_page_memcg(from); @@ -5784,7 +5775,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (is_device_private_page(page)) ret = MC_TARGET_DEVICE; @@ -5828,7 +5819,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) return ret; - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (target) { get_page(page); @@ -6774,12 +6765,12 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. page->mem_cgroup is protected - * by the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. + * already charged pages, too. page and memcg binding is + * protected by the page lock, which serializes swap cache + * removal, which in turn serializes uncharging. */ VM_BUG_ON_PAGE(!PageLocked(page), page); - if (compound_head(page)->mem_cgroup) + if (page_memcg(compound_head(page))) goto out; id = lookup_swap_cgroup_id(ent); @@ -6863,21 +6854,21 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) VM_BUG_ON_PAGE(PageLRU(page), page); - if (!page->mem_cgroup) + if (!page_memcg(page)) return; /* * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point, we have fully + * page_memcg(page) at this point, we have fully * exclusive access to the page. */ - if (ug->memcg != page->mem_cgroup) { + if (ug->memcg != page_memcg(page)) { if (ug->memcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = page->mem_cgroup; + ug->memcg = page_memcg(page); /* pairs with css_put in uncharge_batch */ css_get(&ug->memcg->css); @@ -6894,7 +6885,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } ug->dummy_page = page; - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&ug->memcg->css); } @@ -6937,7 +6928,7 @@ void mem_cgroup_uncharge(struct page *page) return; /* Don't touch page->lru of any random page, pre-check: */ - if (!page->mem_cgroup) + if (!page_memcg(page)) return; uncharge_gather_clear(&ug); @@ -6987,11 +6978,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) return; /* Page cache replacement: new page already charged? */ - if (newpage->mem_cgroup) + if (page_memcg(newpage)) return; /* Swapcache readahead pages can get replaced before being charged */ - memcg = oldpage->mem_cgroup; + memcg = page_memcg(oldpage); if (!memcg) return; @@ -7186,7 +7177,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* Readahead page, never charged */ if (!memcg) @@ -7207,7 +7198,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(oldid, page); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - page->mem_cgroup = NULL; + page->memcg_data = 0; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); @@ -7250,7 +7241,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* Readahead page, never charged */ if (!memcg) @@ -7331,7 +7322,7 @@ bool mem_cgroup_swap_full(struct page *page) if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; - memcg = page->mem_cgroup; + memcg = page_memcg(page); if (!memcg) return false; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23f5066bd4a5..271133b8243b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1092,7 +1092,7 @@ static inline bool page_expected_state(struct page *page, if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page->mem_cgroup | + (unsigned long)page_memcg(page) | #endif (page->flags & check_flags))) return false; @@ -1117,7 +1117,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) + if (unlikely(page_memcg(page))) bad_reason = "page still charged to cgroup"; #endif return bad_reason; diff --git a/mm/page_io.c b/mm/page_io.c index 433df1263349..9bca17ecc4df 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -291,12 +291,14 @@ static inline void count_swpout_vm_event(struct page *page) static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) { struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; - if (!page->mem_cgroup) + memcg = page_memcg(page); + if (!memcg) return; rcu_read_lock(); - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } diff --git a/mm/slab.h b/mm/slab.h index 6d7c6a5056ba..e2535cee0d33 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -242,18 +242,17 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla static inline struct obj_cgroup **page_obj_cgroups(struct page *page) { /* - * page->mem_cgroup and page->obj_cgroups are sharing the same + * Page's memory cgroup and obj_cgroups vector are sharing the same * space. To distinguish between them in case we don't know for sure * that the page is a slab page (e.g. page_cgroup_ino()), let's * always set the lowest bit of obj_cgroups. */ - return (struct obj_cgroup **) - ((unsigned long)page->obj_cgroups & ~0x1UL); + return (struct obj_cgroup **)(page->memcg_data & ~0x1UL); } static inline bool page_has_obj_cgroups(struct page *page) { - return ((unsigned long)page->obj_cgroups & 0x1UL); + return page->memcg_data & 0x1UL; } int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, @@ -262,7 +261,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, static inline void memcg_free_page_obj_cgroups(struct page *page) { kfree(page_obj_cgroups(page)); - page->obj_cgroups = NULL; + page->memcg_data = 0; } static inline size_t obj_full_size(struct kmem_cache *s) diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02e..130348cbf40a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -257,7 +257,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) struct lruvec *lruvec; int memcgid; - /* Page is fully exclusive and pins page->mem_cgroup */ + /* Page is fully exclusive and pins page's memory cgroup pointer */ VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); -- cgit v1.2.3-70-g09d2 From 1f702603e7125a390b5cdf5ce00539781cfcc86a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:19 -0600 Subject: exec: Simplify unshare_files Now that exec no longer needs to return the unshared files to their previous value there is no reason to return displaced. Instead when unshare_fd creates a copy of the file table, call put_files_struct before returning from unshare_files. Acked-by: Christian Brauner v1: https://lkml.kernel.org/r/20200817220425.9389-2-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-2-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- fs/coredump.c | 5 +---- fs/exec.c | 5 +---- include/linux/fdtable.h | 2 +- kernel/fork.c | 12 ++++++------ 4 files changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel/fork.c') diff --git a/fs/coredump.c b/fs/coredump.c index 0cd9056d79cc..abf807235262 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -585,7 +585,6 @@ void do_coredump(const kernel_siginfo_t *siginfo) int ispipe; size_t *argv = NULL; int argc = 0; - struct files_struct *displaced; /* require nonrelative corefile path and be extra careful */ bool need_suid_safe = false; bool core_dumped = false; @@ -791,11 +790,9 @@ void do_coredump(const kernel_siginfo_t *siginfo) } /* get us an unshared descriptor table; almost always a no-op */ - retval = unshare_files(&displaced); + retval = unshare_files(); if (retval) goto close_fail; - if (displaced) - put_files_struct(displaced); if (!dump_interrupted()) { /* * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would diff --git a/fs/exec.c b/fs/exec.c index 0d6533ab1c97..48fa4fc1b116 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1238,7 +1238,6 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; - struct files_struct *displaced; int retval; /* Once we are committed compute the creds */ @@ -1259,11 +1258,9 @@ int begin_new_exec(struct linux_binprm * bprm) goto out; /* Ensure the files table is not shared. */ - retval = unshare_files(&displaced); + retval = unshare_files(); if (retval) goto out; - if (displaced) - put_files_struct(displaced); /* * Must be called _before_ exec_mmap() as bprm->mm is diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index a32bf47c593e..f46a084b60b2 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -109,7 +109,7 @@ struct task_struct; struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); -int unshare_files(struct files_struct **); +int unshare_files(void); struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..837b546528c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -3023,21 +3023,21 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) * the exec layer of the kernel. */ -int unshare_files(struct files_struct **displaced) +int unshare_files(void) { struct task_struct *task = current; - struct files_struct *copy = NULL; + struct files_struct *old, *copy = NULL; int error; error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); - if (error || !copy) { - *displaced = NULL; + if (error || !copy) return error; - } - *displaced = task->files; + + old = task->files; task_lock(task); task->files = copy; task_unlock(task); + put_files_struct(old); return 0; } -- cgit v1.2.3-70-g09d2 From f7cfd871ae0c5008d94b6f66834e7845caa93c15 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2020 14:12:00 -0600 Subject: exec: Transform exec_update_mutex into a rw_semaphore Recently syzbot reported[0] that there is a deadlock amongst the users of exec_update_mutex. The problematic lock ordering found by lockdep was: perf_event_open (exec_update_mutex -> ovl_i_mutex) chown (ovl_i_mutex -> sb_writes) sendfile (sb_writes -> p->lock) by reading from a proc file and writing to overlayfs proc_pid_syscall (p->lock -> exec_update_mutex) While looking at possible solutions it occured to me that all of the users and possible users involved only wanted to state of the given process to remain the same. They are all readers. The only writer is exec. There is no reason for readers to block on each other. So fix this deadlock by transforming exec_update_mutex into a rw_semaphore named exec_update_lock that only exec takes for writing. Cc: Jann Horn Cc: Vasiliy Kulikov Cc: Al Viro Cc: Bernd Edlinger Cc: Oleg Nesterov Cc: Christopher Yeoh Cc: Cyrill Gorcunov Cc: Sargun Dhillon Cc: Christian Brauner Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Fixes: eea9673250db ("exec: Add exec_update_mutex to replace cred_guard_mutex") [0] https://lkml.kernel.org/r/00000000000063640c05ade8e3de@google.com Reported-by: syzbot+db9cdf3dd1f64252c6ef@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/87ft4mbqen.fsf@x220.int.ebiederm.org Signed-off-by: Eric W. Biederman --- fs/exec.c | 12 ++++++------ fs/proc/base.c | 10 +++++----- include/linux/sched/signal.h | 11 ++++++----- init/init_task.c | 2 +- kernel/events/core.c | 12 ++++++------ kernel/fork.c | 6 +++--- kernel/kcmp.c | 30 +++++++++++++++--------------- kernel/pid.c | 4 ++-- 8 files changed, 44 insertions(+), 43 deletions(-) (limited to 'kernel/fork.c') diff --git a/fs/exec.c b/fs/exec.c index 547a2390baf5..ca89e0e3ef10 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -965,8 +965,8 @@ EXPORT_SYMBOL(read_code); /* * Maps the mm_struct mm into the current task struct. - * On success, this function returns with the mutex - * exec_update_mutex locked. + * On success, this function returns with exec_update_lock + * held for writing. */ static int exec_mmap(struct mm_struct *mm) { @@ -981,7 +981,7 @@ static int exec_mmap(struct mm_struct *mm) if (old_mm) sync_mm_rss(old_mm); - ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); + ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; @@ -995,7 +995,7 @@ static int exec_mmap(struct mm_struct *mm) mmap_read_lock(old_mm); if (unlikely(old_mm->core_state)) { mmap_read_unlock(old_mm); - mutex_unlock(&tsk->signal->exec_update_mutex); + up_write(&tsk->signal->exec_update_lock); return -EINTR; } } @@ -1382,7 +1382,7 @@ int begin_new_exec(struct linux_binprm * bprm) return 0; out_unlock: - mutex_unlock(&me->signal->exec_update_mutex); + up_write(&me->signal->exec_update_lock); out: return retval; } @@ -1423,7 +1423,7 @@ void setup_new_exec(struct linux_binprm * bprm) * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; - mutex_unlock(&me->signal->exec_update_mutex); + up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); diff --git a/fs/proc/base.c b/fs/proc/base.c index b362523a9829..55ce0ee9c5c7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -405,11 +405,11 @@ print0: static int lock_trace(struct task_struct *task) { - int err = mutex_lock_killable(&task->signal->exec_update_mutex); + int err = down_read_killable(&task->signal->exec_update_lock); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return -EPERM; } return 0; @@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task) static void unlock_trace(struct task_struct *task) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); } #ifdef CONFIG_STACKTRACE @@ -2930,7 +2930,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unsigned long flags; int result; - result = mutex_lock_killable(&task->signal->exec_update_mutex); + result = down_read_killable(&task->signal->exec_update_lock); if (result) return result; @@ -2966,7 +2966,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh result = 0; out_unlock: - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return result; } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1bad18a1d8ba..4b6a8234d7fc 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -228,12 +228,13 @@ struct signal_struct { * credential calculations * (notably. ptrace) * Deprecated do not use in new code. - * Use exec_update_mutex instead. - */ - struct mutex exec_update_mutex; /* Held while task_struct is being - * updated during exec, and may have - * inconsistent permissions. + * Use exec_update_lock instead. */ + struct rw_semaphore exec_update_lock; /* Held while task_struct is + * being updated during exec, + * and may have inconsistent + * permissions. + */ } __randomize_layout; /* diff --git a/init/init_task.c b/init/init_task.c index a56f0abb63e9..15f6eb93a04f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -26,7 +26,7 @@ static struct signal_struct init_signals = { .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), - .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex), + .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), .cputimer = { diff --git a/kernel/events/core.c b/kernel/events/core.c index dc568ca295bd..55b2330b556c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1325,7 +1325,7 @@ static void put_ctx(struct perf_event_context *ctx) * function. * * Lock order: - * exec_update_mutex + * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; @@ -11721,14 +11721,14 @@ SYSCALL_DEFINE5(perf_event_open, } if (task) { - err = mutex_lock_interruptible(&task->signal->exec_update_mutex); + err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_task; /* * Preserve ptrace permission check for backwards compatibility. * - * We must hold exec_update_mutex across this and any potential + * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). @@ -12017,7 +12017,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&ctx->mutex); if (task) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); put_task_struct(task); } @@ -12053,7 +12053,7 @@ err_alloc: free_event(event); err_cred: if (task) - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); err_task: if (task) put_task_struct(task); @@ -12358,7 +12358,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) /* * When a child task exits, feed back event values to parent events. * - * Can be called with exec_update_mutex held when called from + * Can be called with exec_update_lock held when called from * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *child) diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..e8cb80b266d2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1221,7 +1221,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) struct mm_struct *mm; int err; - err = mutex_lock_killable(&task->signal->exec_update_mutex); + err = down_read_killable(&task->signal->exec_update_lock); if (err) return ERR_PTR(err); @@ -1231,7 +1231,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mmput(mm); mm = ERR_PTR(-EACCES); } - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return mm; } @@ -1591,7 +1591,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); - mutex_init(&sig->exec_update_mutex); + init_rwsem(&sig->exec_update_lock); return 0; } diff --git a/kernel/kcmp.c b/kernel/kcmp.c index b3ff9288c6cc..c0d2ad9b4705 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -75,25 +75,25 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) return file; } -static void kcmp_unlock(struct mutex *m1, struct mutex *m2) +static void kcmp_unlock(struct rw_semaphore *l1, struct rw_semaphore *l2) { - if (likely(m2 != m1)) - mutex_unlock(m2); - mutex_unlock(m1); + if (likely(l2 != l1)) + up_read(l2); + up_read(l1); } -static int kcmp_lock(struct mutex *m1, struct mutex *m2) +static int kcmp_lock(struct rw_semaphore *l1, struct rw_semaphore *l2) { int err; - if (m2 > m1) - swap(m1, m2); + if (l2 > l1) + swap(l1, l2); - err = mutex_lock_killable(m1); - if (!err && likely(m1 != m2)) { - err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); + err = down_read_killable(l1); + if (!err && likely(l1 != l2)) { + err = down_read_killable_nested(l2, SINGLE_DEPTH_NESTING); if (err) - mutex_unlock(m1); + up_read(l1); } return err; @@ -173,8 +173,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, /* * One should have enough rights to inspect task details. */ - ret = kcmp_lock(&task1->signal->exec_update_mutex, - &task2->signal->exec_update_mutex); + ret = kcmp_lock(&task1->signal->exec_update_lock, + &task2->signal->exec_update_lock); if (ret) goto err; if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || @@ -229,8 +229,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, } err_unlock: - kcmp_unlock(&task1->signal->exec_update_mutex, - &task2->signal->exec_update_mutex); + kcmp_unlock(&task1->signal->exec_update_lock, + &task2->signal->exec_update_lock); err: put_task_struct(task1); put_task_struct(task2); diff --git a/kernel/pid.c b/kernel/pid.c index a96bc4bf4f86..4856818c9de1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -628,7 +628,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) struct file *file; int ret; - ret = mutex_lock_killable(&task->signal->exec_update_mutex); + ret = down_read_killable(&task->signal->exec_update_lock); if (ret) return ERR_PTR(ret); @@ -637,7 +637,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) else file = ERR_PTR(-EPERM); - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return file ?: ERR_PTR(-EBADF); } -- cgit v1.2.3-70-g09d2 From 57efa1fe5957694fa541c9062de0a127f0b9acb0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Dec 2020 19:05:44 -0800 Subject: mm/gup: prevent gup_fast from racing with COW during fork Since commit 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") pages under a FOLL_PIN will not be write protected during COW for fork. This means that pages returned from pin_user_pages(FOLL_WRITE) should not become write protected while the pin is active. However, there is a small race where get_user_pages_fast(FOLL_PIN) can establish a FOLL_PIN at the same time copy_present_page() is write protecting it: CPU 0 CPU 1 get_user_pages_fast() internal_get_user_pages_fast() copy_page_range() pte_alloc_map_lock() copy_present_page() atomic_read(has_pinned) == 0 page_maybe_dma_pinned() == false atomic_set(has_pinned, 1); gup_pgd_range() gup_pte_range() pte_t pte = gup_get_pte(ptep) pte_access_permitted(pte) try_grab_compound_head() pte = pte_wrprotect(pte) set_pte_at(); pte_unmap_unlock() // GUP now returns with a write protected page The first attempt to resolve this by using the write protect caused problems (and was missing a barrrier), see commit f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Instead wrap copy_p4d_range() with the write side of a seqcount and check the read side around gup_pgd_range(). If there is a collision then get_user_pages_fast() fails and falls back to slow GUP. Slow GUP is safe against this race because copy_page_range() is only called while holding the exclusive side of the mmap_lock on the src mm_struct. [akpm@linux-foundation.org: coding style fixes] Link: https://lore.kernel.org/r/CAHk-=wi=iCnYCARbPGjkVJu9eyYeZ13N64tZYLdOB8CP5Q_PLw@mail.gmail.com Link: https://lkml.kernel.org/r/2-v4-908497cf359a+4782-gup_fork_jgg@nvidia.com Fixes: f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Signed-off-by: Jason Gunthorpe Suggested-by: Linus Torvalds Reviewed-by: John Hubbard Reviewed-by: Jan Kara Reviewed-by: Peter Xu Acked-by: "Ahmed S. Darwish" [seqcount_t parts] Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Jann Horn Cc: Kirill Shutemov Cc: Kirill Tkhai Cc: Leon Romanovsky Cc: Michal Hocko Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/tboot.c | 1 + drivers/firmware/efi/efi.c | 1 + include/linux/mm_types.h | 8 ++++++++ kernel/fork.c | 1 + mm/gup.c | 18 ++++++++++++++++++ mm/init-mm.c | 1 + mm/memory.c | 13 ++++++++++++- 7 files changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index ae64f98ec2ab..4c09ba110204 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -93,6 +93,7 @@ static struct mm_struct tboot_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 6c6eec044a97..df3f9bcab581 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -57,6 +57,7 @@ struct mm_struct efi_mm = { .mm_rb = RB_ROOT, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(efi_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5a9238f6caad..915f4f100383 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -446,6 +447,13 @@ struct mm_struct { */ atomic_t has_pinned; + /** + * @write_protect_seq: Locked when any thread is write + * protecting pages mapped by this mm to enforce a later COW, + * for instance during page table copying for fork(). + */ + seqcount_t write_protect_seq; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..dc55f68a6ee3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1007,6 +1007,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); + seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; diff --git a/mm/gup.c b/mm/gup.c index c7e24301860a..9c6a2f5001c5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2684,11 +2684,18 @@ static unsigned long lockless_pages_from_mm(unsigned long start, { unsigned long flags; int nr_pinned = 0; + unsigned seq; if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || !gup_fast_permitted(start, end)) return 0; + if (gup_flags & FOLL_PIN) { + seq = raw_read_seqcount(¤t->mm->write_protect_seq); + if (seq & 1) + return 0; + } + /* * Disable interrupts. The nested form is used, in order to allow full, * general purpose use of this routine. @@ -2703,6 +2710,17 @@ static unsigned long lockless_pages_from_mm(unsigned long start, local_irq_save(flags); gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); local_irq_restore(flags); + + /* + * When pinning pages for DMA there could be a concurrent write protect + * from fork() via copy_page_range(), in this case always fail fast GUP. + */ + if (gup_flags & FOLL_PIN) { + if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) { + unpin_user_pages(pages, nr_pinned); + return 0; + } + } return nr_pinned; } diff --git a/mm/init-mm.c b/mm/init-mm.c index 3a613c85f9ed..153162669f80 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -31,6 +31,7 @@ struct mm_struct init_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), diff --git a/mm/memory.c b/mm/memory.c index c48f8df6e502..50632c4366b8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1171,6 +1171,15 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); + /* + * Disabling preemption is not needed for the write side, as + * the read side doesn't spin, but goes to the mmap_lock. + * + * Use the raw variant of the seqcount_t write API to avoid + * lockdep complaining about preemptibility. + */ + mmap_assert_write_locked(src_mm); + raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; @@ -1187,8 +1196,10 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - if (is_cow) + if (is_cow) { + raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); + } return ret; } -- cgit v1.2.3-70-g09d2 From da3ceeff923e3bc750a8423c840462760c463926 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Dec 2020 19:07:04 -0800 Subject: mm: memcg/slab: rename *_lruvec_slab_state to *_lruvec_kmem_state The *_lruvec_slab_state is also suitable for pages allocated from buddy, not just for the slab objects. But the function name seems to tell us that only slab object is applicable. So we can rename the keyword of slab to kmem. Link: https://lkml.kernel.org/r/20201117085249.24319-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 18 +++++++++--------- kernel/fork.c | 2 +- mm/memcontrol.c | 2 +- mm/workingset.c | 8 ++++---- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dd992b81bcb7..71c961452d9a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -788,15 +788,15 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); -static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); - __mod_lruvec_slab_state(p, idx, val); + __mod_lruvec_kmem_state(p, idx, val); local_irq_restore(flags); } @@ -1229,7 +1229,7 @@ static inline void mod_lruvec_page_state(struct page *page, mod_node_page_state(page_pgdat(page), idx, val); } -static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); @@ -1237,7 +1237,7 @@ static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, __mod_node_page_state(page_pgdat(page), idx, val); } -static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); @@ -1332,14 +1332,14 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } -static inline void __inc_lruvec_slab_state(void *p, enum node_stat_item idx) +static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_slab_state(p, idx, 1); + __mod_lruvec_kmem_state(p, idx, 1); } -static inline void __dec_lruvec_slab_state(void *p, enum node_stat_item idx) +static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_slab_state(p, idx, -1); + __mod_lruvec_kmem_state(p, idx, -1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ diff --git a/kernel/fork.c b/kernel/fork.c index dc55f68a6ee3..5e7cc88eadb5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -385,7 +385,7 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); else - mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, + mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ce19e8484c89..b50f7f336b16 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -853,7 +853,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_lruvec_state(lruvec, idx, val); } -void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { pg_data_t *pgdat = page_pgdat(virt_to_page(p)); struct mem_cgroup *memcg; diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02e..25f75bbe80e0 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -445,12 +445,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); - __inc_lruvec_slab_state(node, WORKINGSET_NODES); + __inc_lruvec_kmem_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); } } } @@ -544,7 +544,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); spin_unlock(lru_lock); @@ -559,7 +559,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; mapping->nrexceptional -= node->nr_values; xa_delete_node(node, workingset_update_node); - __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM); + __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); -- cgit v1.2.3-70-g09d2 From cebd0eb29acdfc2f5e44e5f356ffcd0c44f16b4a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:00:21 -0800 Subject: kasan: rename (un)poison_shadow to (un)poison_range This is a preparatory commit for the upcoming addition of a new hardware tag-based (MTE-based) KASAN mode. The new mode won't be using shadow memory. Rename external annotation kasan_unpoison_shadow() to kasan_unpoison_range(), and introduce internal functions (un)poison_range() (without kasan_ prefix). Co-developed-by: Marco Elver Link: https://lkml.kernel.org/r/fccdcaa13dc6b2211bf363d6c6d499279a54fe3a.1606161801.git.andreyknvl@google.com Signed-off-by: Marco Elver Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 +++--- kernel/fork.c | 4 ++-- mm/kasan/common.c | 49 +++++++++++++++++++++++++++---------------------- mm/kasan/generic.c | 23 +++++++++++------------ mm/kasan/kasan.h | 3 ++- mm/kasan/tags.c | 2 +- mm/slab_common.c | 2 +- 7 files changed, 47 insertions(+), 42 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 7828436a3a99..9740c06a04a1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -72,7 +72,7 @@ extern void kasan_enable_current(void); /* Disable reporting bugs for current task */ extern void kasan_disable_current(void); -void kasan_unpoison_shadow(const void *address, size_t size); +void kasan_unpoison_range(const void *address, size_t size); void kasan_unpoison_task_stack(struct task_struct *task); @@ -109,7 +109,7 @@ struct kasan_cache { size_t __ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { - kasan_unpoison_shadow(ptr, __ksize(ptr)); + kasan_unpoison_range(ptr, __ksize(ptr)); } size_t kasan_metadata_size(struct kmem_cache *cache); @@ -118,7 +118,7 @@ void kasan_restore_multi_shot(bool enabled); #else /* CONFIG_KASAN */ -static inline void kasan_unpoison_shadow(const void *address, size_t size) {} +static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_unpoison_task_stack(struct task_struct *task) {} diff --git a/kernel/fork.c b/kernel/fork.c index 41906a52a764..37720a6d04ea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -225,8 +225,8 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; - /* Clear the KASAN shadow of the stack. */ - kasan_unpoison_shadow(s->addr, THREAD_SIZE); + /* Mark stack accessible for KASAN. */ + kasan_unpoison_range(s->addr, THREAD_SIZE); /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 89e5ef9417a7..73e79a34671b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -108,7 +108,7 @@ void *memcpy(void *dest, const void *src, size_t len) * Poisons the shadow memory for 'size' bytes starting from 'addr'. * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. */ -void kasan_poison_shadow(const void *address, size_t size, u8 value) +void poison_range(const void *address, size_t size, u8 value) { void *shadow_start, *shadow_end; @@ -125,7 +125,7 @@ void kasan_poison_shadow(const void *address, size_t size, u8 value) __memset(shadow_start, value, shadow_end - shadow_start); } -void kasan_unpoison_shadow(const void *address, size_t size) +void unpoison_range(const void *address, size_t size) { u8 tag = get_tag(address); @@ -136,7 +136,7 @@ void kasan_unpoison_shadow(const void *address, size_t size) */ address = reset_tag(address); - kasan_poison_shadow(address, size, tag); + poison_range(address, size, tag); if (size & KASAN_SHADOW_MASK) { u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); @@ -148,12 +148,17 @@ void kasan_unpoison_shadow(const void *address, size_t size) } } +void kasan_unpoison_range(const void *address, size_t size) +{ + unpoison_range(address, size); +} + static void __kasan_unpoison_stack(struct task_struct *task, const void *sp) { void *base = task_stack_page(task); size_t size = sp - base; - kasan_unpoison_shadow(base, size); + unpoison_range(base, size); } /* Unpoison the entire stack for a task. */ @@ -172,7 +177,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) */ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); - kasan_unpoison_shadow(base, watermark - base); + unpoison_range(base, watermark - base); } void kasan_alloc_pages(struct page *page, unsigned int order) @@ -186,13 +191,13 @@ void kasan_alloc_pages(struct page *page, unsigned int order) tag = random_tag(); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); + unpoison_range(page_address(page), PAGE_SIZE << order); } void kasan_free_pages(struct page *page, unsigned int order) { if (likely(!PageHighMem(page))) - kasan_poison_shadow(page_address(page), + poison_range(page_address(page), PAGE_SIZE << order, KASAN_FREE_PAGE); } @@ -284,18 +289,18 @@ void kasan_poison_slab(struct page *page) for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); - kasan_poison_shadow(page_address(page), page_size(page), - KASAN_KMALLOC_REDZONE); + poison_range(page_address(page), page_size(page), + KASAN_KMALLOC_REDZONE); } void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { - kasan_unpoison_shadow(object, cache->object_size); + unpoison_range(object, cache->object_size); } void kasan_poison_object_data(struct kmem_cache *cache, void *object) { - kasan_poison_shadow(object, + poison_range(object, round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), KASAN_KMALLOC_REDZONE); } @@ -408,7 +413,7 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, } rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); + poison_range(object, rounded_up_size, KASAN_KMALLOC_FREE); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) || unlikely(!(cache->flags & SLAB_KASAN))) @@ -448,9 +453,9 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, tag = assign_tag(cache, object, false, keep_tag); /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ - kasan_unpoison_shadow(set_tag(object, tag), size); - kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE); + unpoison_range(set_tag(object, tag), size); + poison_range((void *)redzone_start, redzone_end - redzone_start, + KASAN_KMALLOC_REDZONE); if (cache->flags & SLAB_KASAN) kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags); @@ -489,9 +494,9 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, KASAN_SHADOW_SCALE_SIZE); redzone_end = (unsigned long)ptr + page_size(page); - kasan_unpoison_shadow(ptr, size); - kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, - KASAN_PAGE_REDZONE); + unpoison_range(ptr, size); + poison_range((void *)redzone_start, redzone_end - redzone_start, + KASAN_PAGE_REDZONE); return (void *)ptr; } @@ -523,7 +528,7 @@ void kasan_poison_kfree(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); return; } - kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE); + poison_range(ptr, page_size(page), KASAN_FREE_PAGE); } else { __kasan_slab_free(page->slab_cache, ptr, ip, false); } @@ -709,7 +714,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) * // vmalloc() allocates memory * // let a = area->addr * // we reach kasan_populate_vmalloc - * // and call kasan_unpoison_shadow: + * // and call unpoison_range: * STORE shadow(a), unpoison_val * ... * STORE shadow(a+99), unpoison_val x = LOAD p @@ -744,7 +749,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size) return; size = round_up(size, KASAN_SHADOW_SCALE_SIZE); - kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID); + poison_range(start, size, KASAN_VMALLOC_INVALID); } void kasan_unpoison_vmalloc(const void *start, unsigned long size) @@ -752,7 +757,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) if (!is_vmalloc_or_module_addr(start)) return; - kasan_unpoison_shadow(start, size); + unpoison_range(start, size); } static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index d341859a1b95..9fe44f9b3b30 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -202,11 +202,11 @@ static void register_global(struct kasan_global *global) { size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); - kasan_unpoison_shadow(global->beg, global->size); + unpoison_range(global->beg, global->size); - kasan_poison_shadow(global->beg + aligned_size, - global->size_with_redzone - aligned_size, - KASAN_GLOBAL_REDZONE); + poison_range(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE); } void __asan_register_globals(struct kasan_global *globals, size_t size) @@ -285,13 +285,12 @@ void __asan_alloca_poison(unsigned long addr, size_t size) WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); - kasan_unpoison_shadow((const void *)(addr + rounded_down_size), - size - rounded_down_size); - kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_LEFT); - kasan_poison_shadow(right_redzone, - padding_size + KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_RIGHT); + unpoison_range((const void *)(addr + rounded_down_size), + size - rounded_down_size); + poison_range(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_LEFT); + poison_range(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_RIGHT); } EXPORT_SYMBOL(__asan_alloca_poison); @@ -301,7 +300,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) if (unlikely(!stack_top || stack_top > stack_bottom)) return; - kasan_unpoison_shadow(stack_top, stack_bottom - stack_top); + unpoison_range(stack_top, stack_bottom - stack_top); } EXPORT_SYMBOL(__asan_allocas_unpoison); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index ac499456740f..42ab02c61331 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -150,7 +150,8 @@ static inline bool addr_has_shadow(const void *addr) return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); } -void kasan_poison_shadow(const void *address, size_t size, u8 value); +void poison_range(const void *address, size_t size, u8 value); +void unpoison_range(const void *address, size_t size); /** * check_memory_region - Check memory region, and report if invalid access. diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 5c8b08a25715..c0b3f327812b 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) { - kasan_poison_shadow((void *)addr, size, tag); + poison_range((void *)addr, size, tag); } EXPORT_SYMBOL(__hwasan_tag_memory); diff --git a/mm/slab_common.c b/mm/slab_common.c index 2f2b55c2798e..573fbacd9ef5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1176,7 +1176,7 @@ size_t ksize(const void *objp) * We assume that ksize callers could use whole allocated area, * so we need to unpoison this area. */ - kasan_unpoison_shadow(objp, size); + kasan_unpoison_range(objp, size); return size; } EXPORT_SYMBOL(ksize); -- cgit v1.2.3-70-g09d2