From e82a118f57b89bbb437ce70780fc2678d5c281e5 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Sun, 12 Apr 2020 22:25:33 +0200 Subject: clone3: fix cgroup argument sanity check Checking that cgroup field value of struct clone_args is less than 0 is useless, as it is defined as unsigned 64-bit integer. Moreover, it doesn't catch the situations where its higher bits are lost during the assignment to the cgroup field of the cgroup field of the internal struct kernel_clone_args (where it is declared as signed 32-bit integer), so it is still possible to pass garbage there. A check against INT_MAX solves both these issues. Fixes: ef2c41cf38a7559b ("clone3: allow spawning processes into cgroups") Signed-off-by: Eugene Syromiatnikov Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200412202533.GA29554@asgard.redhat.com Signed-off-by: Christian Brauner --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 4385f3d639f2..b4f7775623c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2631,7 +2631,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, !valid_signal(args.exit_signal))) return -EINVAL; - if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0) + if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup > INT_MAX) return -EINVAL; *kargs = (struct kernel_clone_args){ -- cgit v1.2.3-70-g09d2 From 62173872ca65767c586217dec0a32485da8a2f07 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Sun, 12 Apr 2020 22:31:23 +0200 Subject: clone3: add a check for the user struct size if CLONE_INTO_CGROUP is set Passing CLONE_INTO_CGROUP with an under-sized structure (that doesn't properly contain cgroup field) seems like garbage input, especially considering the fact that fd 0 is a valid descriptor. Signed-off-by: Eugene Syromiatnikov Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200412203123.GA5869@asgard.redhat.com Signed-off-by: Christian Brauner --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index b4f7775623c8..3ab7cf88e455 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2631,7 +2631,8 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, !valid_signal(args.exit_signal))) return -EINVAL; - if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup > INT_MAX) + if ((args.flags & CLONE_INTO_CGROUP) && + (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) return -EINVAL; *kargs = (struct kernel_clone_args){ -- cgit v1.2.3-70-g09d2 From a966dcfe153ab0a3d8d79cd971a079411a489be7 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Sun, 12 Apr 2020 22:26:58 +0200 Subject: clone3: add build-time CLONE_ARGS_SIZE_VER* validity checks CLONE_ARGS_SIZE_VER* macros are defined explicitly and not via the offsets of the relevant struct clone_args fields, which makes it rather error-prone, so it probably makes sense to add some compile-time checks for them (including the one that breaks on struct clone_args extension as a reminder to add a relevant size macro and a similar check). Function copy_clone_args_from_user seems to be a good place for such checks. Signed-off-by: Eugene Syromiatnikov Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200412202658.GA31499@asgard.redhat.com Signed-off-by: Christian Brauner --- kernel/fork.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 3ab7cf88e455..8c700f881d92 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2605,6 +2605,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args args; pid_t *kset_tid = kargs->set_tid; + BUILD_BUG_ON(offsetofend(struct clone_args, tls) != + CLONE_ARGS_SIZE_VER0); + BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != + CLONE_ARGS_SIZE_VER1); + BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != + CLONE_ARGS_SIZE_VER2); + BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); + if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) -- cgit v1.2.3-70-g09d2 From d5f177d35c24429c87db2567d20563fc16f7e8f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Mar 2020 19:56:53 -0700 Subject: rcu-tasks: Add an RCU Tasks Trace to simplify protection of tracing hooks Because RCU does not watch exception early-entry/late-exit, idle-loop, or CPU-hotplug execution, protection of tracing and BPF operations is needlessly complicated. This commit therefore adds a variant of Tasks RCU that: o Has explicit read-side markers to allow finite grace periods in the face of in-kernel loops for PREEMPT=n builds. These markers are rcu_read_lock_trace() and rcu_read_unlock_trace(). o Protects code in the idle loop, exception entry/exit, and CPU-hotplug code paths. In this respect, RCU-tasks trace is similar to SRCU, but with lighter-weight readers. o Avoids expensive read-side instruction, having overhead similar to that of Preemptible RCU. There are of course downsides: o The grace-period code can send IPIs to CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. This is mitigated by later commits. o It is necessary to scan the full tasklist, much as for Tasks RCU. o There is a single callback queue guarded by a single lock, again, much as for Tasks RCU. However, those early use cases that request multiple grace periods in quick succession are expected to do so from a single task, which makes the single lock almost irrelevant. If needed, multiple callback queues can be provided using any number of schemes. Perhaps most important, this variant of RCU does not affect the vanilla flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace readers can operate from idle, offline, and exception entry/exit in no way enables rcu_preempt and rcu_sched readers to do so. The memory ordering was outlined here: https://lore.kernel.org/lkml/20200319034030.GX3199@paulmck-ThinkPad-P72/ This effort benefited greatly from off-list discussions of BPF requirements with Alexei Starovoitov and Andrii Nakryiko. At least some of the on-list discussions are captured in the Link: tags below. In addition, KCSAN was quite helpful in finding some early bugs. Link: https://lore.kernel.org/lkml/20200219150744.428764577@infradead.org/ Link: https://lore.kernel.org/lkml/87mu8p797b.fsf@nanos.tec.linutronix.de/ Link: https://lore.kernel.org/lkml/20200225221305.605144982@linutronix.de/ Cc: Alexei Starovoitov Cc: Andrii Nakryiko [ paulmck: Apply feedback from Steve Rostedt and Joel Fernandes. ] [ paulmck: Decrement trc_n_readers_need_end upon IPI failure. ] [ paulmck: Fix locking issue reported by rcutorture. ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 84 ++++++++++ include/linux/sched.h | 8 + init/init_task.c | 4 + kernel/fork.c | 4 + kernel/rcu/Kconfig | 11 +- kernel/rcu/tasks.h | 361 ++++++++++++++++++++++++++++++++++++++++- 6 files changed, 467 insertions(+), 5 deletions(-) create mode 100644 include/linux/rcupdate_trace.h (limited to 'kernel/fork.c') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h new file mode 100644 index 000000000000..ed97e10817bd --- /dev/null +++ b/include/linux/rcupdate_trace.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Read-Copy Update mechanism for mutual exclusion, adapted for tracing. + * + * Copyright (C) 2020 Paul E. McKenney. + */ + +#ifndef __LINUX_RCUPDATE_TRACE_H +#define __LINUX_RCUPDATE_TRACE_H + +#include +#include + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +extern struct lockdep_map rcu_trace_lock_map; + +static inline int rcu_read_lock_trace_held(void) +{ + return lock_is_held(&rcu_trace_lock_map); +} + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +static inline int rcu_read_lock_trace_held(void) +{ + return 1; +} + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +void rcu_read_unlock_trace_special(struct task_struct *t); + +/** + * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section + * + * When synchronize_rcu_trace() is invoked by one task, then that task + * is guaranteed to block until all other tasks exit their read-side + * critical sections. Similarly, if call_rcu_trace() is invoked on one + * task while other tasks are within RCU read-side critical sections, + * invocation of the corresponding RCU callback is deferred until after + * the all the other tasks exit their critical sections. + * + * For more details, please see the documentation for rcu_read_lock(). + */ +static inline void rcu_read_lock_trace(void) +{ + struct task_struct *t = current; + + WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); + rcu_lock_acquire(&rcu_trace_lock_map); +} + +/** + * rcu_read_unlock_trace - mark end of RCU-trace read-side critical section + * + * Pairs with a preceding call to rcu_read_lock_trace(), and nesting is + * allowed. Invoking a rcu_read_unlock_trace() when there is no matching + * rcu_read_lock_trace() is verboten, and will result in lockdep complaints. + * + * For more details, please see the documentation for rcu_read_unlock(). + */ +static inline void rcu_read_unlock_trace(void) +{ + int nesting; + struct task_struct *t = current; + + rcu_lock_release(&rcu_trace_lock_map); + nesting = READ_ONCE(t->trc_reader_nesting) - 1; + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (likely(!READ_ONCE(t->trc_reader_need_end)) || nesting) + return; // We assume shallow reader nesting. + rcu_read_unlock_trace_special(t); +} + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +void synchronize_rcu_tasks_trace(void); +void rcu_barrier_tasks_trace(void); + +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ + +#endif /* __LINUX_RCUPDATE_TRACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a4b727f57095..864f60e51c41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -724,6 +724,14 @@ struct task_struct { struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + int trc_reader_nesting; + int trc_ipi_to_cpu; + bool trc_reader_need_end; + bool trc_reader_checked; + struct list_head trc_holdout_list; +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ + struct sched_info sched_info; struct list_head tasks; diff --git a/init/init_task.c b/init/init_task.c index bd403ed3e418..e8b3740ee598 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -141,6 +141,10 @@ struct task_struct init_task .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), .rcu_tasks_idle_cpu = -1, #endif +#ifdef CONFIG_TASKS_TRACE_RCU + .trc_reader_nesting = 0, + .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), +#endif #ifdef CONFIG_CPUSETS .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), #endif diff --git a/kernel/fork.c b/kernel/fork.c index 8c700f881d92..72e9396235b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1683,6 +1683,10 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + p->trc_reader_nesting = 0; + INIT_LIST_HEAD(&p->trc_holdout_list); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } struct pid *pidfd_pid(const struct file *file) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 6ee6372a4459..cb1d18ef343c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -71,7 +71,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU_GENERIC - def_bool TASKS_RCU || TASKS_RUDE_RCU + def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU select SRCU help This option enables generic infrastructure code supporting @@ -93,6 +93,15 @@ config TASKS_RUDE_RCU switches on all online CPUs, including idle ones, so use with caution. +config TASKS_TRACE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the CPU + hotplug code paths. It can force IPIs on online CPUs, including + idle ones, so use with caution. + config RCU_STALL_COMMON def_bool TREE_RCU help diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d8b09d5d1db1..fd34fd673a8c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -181,12 +181,17 @@ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) preempt_enable(); } +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + /* Do the srcu_read_unlock() for the above synchronize_srcu(). */ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) { + struct task_struct *t = current; + preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); preempt_enable(); + exit_tasks_rcu_finish_trace(t); } #ifndef CONFIG_TINY_RCU @@ -196,15 +201,19 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) */ static void __init rcu_tasks_bootup_oddness(void) { -#ifdef CONFIG_TASKS_RCU +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RCU + pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU pr_info("\tRude variant of Tasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + pr_info("\tTracing variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -569,3 +578,347 @@ static int __init rcu_spawn_tasks_rude_kthread(void) core_initcall(rcu_spawn_tasks_rude_kthread); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ + +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// therefore: +// +// 1. Has explicit read-side markers to allow finite grace periods +// in the face of in-kernel loops for PREEMPT=n builds. +// +// 2. Protects code in the idle loop, exception entry/exit, and +// CPU-hotplug code paths, similar to the capabilities of SRCU. +// +// 3. Avoids expensive read-side instruction, having overhead similar +// to that of Preemptible RCU. +// +// There are of course downsides. The grace-period code can send IPIs to +// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. +// It is necessary to scan the full tasklist, much as for Tasks RCU. There +// is a single callback queue guarded by a single lock, again, much as for +// Tasks RCU. If needed, these downsides can be at least partially remedied. +// +// Perhaps most important, this variant of RCU does not affect the vanilla +// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace +// readers can operate from idle, offline, and exception entry/exit in no +// way allows rcu_preempt and rcu_sched readers to also do so. + +// The lockdep state must be outside of #ifdef to be useful. +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_trace_key; +struct lockdep_map rcu_trace_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); +EXPORT_SYMBOL_GPL(rcu_trace_lock_map); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +atomic_t trc_n_readers_need_end; // Number of waited-for readers. +DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. + +// Record outstanding IPIs to each CPU. No point in sending two... +static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); + +/* If we are the last reader, wake up the grace-period kthread. */ +void rcu_read_unlock_trace_special(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_need_end, false); + if (atomic_dec_and_test(&trc_n_readers_need_end)) + wake_up(&trc_wait); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); + +/* Add a task to the holdout list, if it is not already on the list. */ +static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) +{ + if (list_empty(&t->trc_holdout_list)) { + get_task_struct(t); + list_add(&t->trc_holdout_list, bhp); + } +} + +/* Remove a task from the holdout list, if it is in fact present. */ +static void trc_del_holdout(struct task_struct *t) +{ + if (!list_empty(&t->trc_holdout_list)) { + list_del_init(&t->trc_holdout_list); + put_task_struct(t); + } +} + +/* IPI handler to check task state. */ +static void trc_read_check_handler(void *t_in) +{ + struct task_struct *t = current; + struct task_struct *texp = t_in; + + // If the task is no longer running on this CPU, leave. + if (unlikely(texp != t)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; // Already on holdout list, so will check later. + } + + // If the task is not in a read-side critical section, and + // if this is the last reader, awaken the grace-period kthread. + if (likely(!t->trc_reader_nesting)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + // Mark as checked after decrement to avoid false + // positives on the above WARN_ON_ONCE(). + WRITE_ONCE(t->trc_reader_checked, true); + goto reset_ipi; + } + WRITE_ONCE(t->trc_reader_checked, true); + + // Get here if the task is in a read-side critical section. Set + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + WARN_ON_ONCE(t->trc_reader_need_end); + WRITE_ONCE(t->trc_reader_need_end, true); + +reset_ipi: + // Allow future IPIs to be sent on CPU and for task. + // Also order this IPI handler against any later manipulations of + // the intended task. + smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ +} + +/* Callback function for scheduler to check locked-down task. */ +static bool trc_inspect_reader(struct task_struct *t, void *arg) +{ + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + + // Mark as checked. Because this is called from the grace-period + // kthread, also remove the task from the holdout list. + t->trc_reader_checked = true; + trc_del_holdout(t); + + // If the task is in a read-side critical section, set up its + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + if (unlikely(t->trc_reader_nesting)) { + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + WARN_ON_ONCE(t->trc_reader_need_end); + WRITE_ONCE(t->trc_reader_need_end, true); + } + return true; +} + +/* Attempt to extract the state for the specified task. */ +static void trc_wait_for_one_reader(struct task_struct *t, + struct list_head *bhp) +{ + int cpu; + + // If a previous IPI is still in flight, let it complete. + if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI + return; + + // The current task had better be in a quiescent state. + if (t == current) { + t->trc_reader_checked = true; + trc_del_holdout(t); + WARN_ON_ONCE(t->trc_reader_nesting); + return; + } + + // Attempt to nail down the task for inspection. + get_task_struct(t); + if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) { + put_task_struct(t); + return; + } + put_task_struct(t); + + // If currently running, send an IPI, either way, add to list. + trc_add_holdout(t, bhp); + if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + // The task is currently running, so try IPIing it. + cpu = task_cpu(t); + + // If there is already an IPI outstanding, let it happen. + if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) + return; + + atomic_inc(&trc_n_readers_need_end); + per_cpu(trc_ipi_to_cpu, cpu) = true; + t->trc_ipi_to_cpu = cpu; + if (smp_call_function_single(cpu, + trc_read_check_handler, t, 0)) { + // Just in case there is some other reason for + // failure than the target CPU being offline. + per_cpu(trc_ipi_to_cpu, cpu) = false; + t->trc_ipi_to_cpu = cpu; + if (atomic_dec_and_test(&trc_n_readers_need_end)) { + WARN_ON_ONCE(1); + wake_up(&trc_wait); + } + } + } +} + +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(void) +{ + int cpu; + + // Wait for CPU-hotplug paths to complete. + cpus_read_lock(); + cpus_read_unlock(); + + // Allow for fast-acting IPIs. + atomic_set(&trc_n_readers_need_end, 1); + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, + struct list_head *hop) +{ + WRITE_ONCE(t->trc_reader_need_end, false); + t->trc_reader_checked = false; + t->trc_ipi_to_cpu = -1; + trc_wait_for_one_reader(t, hop); +} + +/* Do intermediate processing between task and holdout scans. */ +static void rcu_tasks_trace_postscan(void) +{ + // Wait for late-stage exiting tasks to finish exiting. + // These might have passed the call to exit_tasks_rcu_finish(). + synchronize_rcu(); + // Any tasks that exit after this point will set ->trc_reader_checked. +} + +/* Do one scan of the holdout list. */ +static void check_all_holdout_tasks_trace(struct list_head *hop, + bool ndrpt, bool *frptp) +{ + struct task_struct *g, *t; + + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { + // If safe and needed, try to check the current task. + if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && + !READ_ONCE(t->trc_reader_checked)) + trc_wait_for_one_reader(t, hop); + + // If check succeeded, remove this task from the list. + if (READ_ONCE(t->trc_reader_checked)) + trc_del_holdout(t); + } +} + +/* Wait for grace period to complete and provide ordering. */ +static void rcu_tasks_trace_postgp(void) +{ + // Remove the safety count. + smp_mb__before_atomic(); // Order vs. earlier atomics + atomic_dec(&trc_n_readers_need_end); + smp_mb__after_atomic(); // Order vs. later atomics + + // Wait for readers. + wait_event_idle_exclusive(trc_wait, + atomic_read(&trc_n_readers_need_end) == 0); + + smp_mb(); // Caller's code must be ordered after wakeup. +} + +/* Report any needed quiescent state for this exiting task. */ +void exit_tasks_rcu_finish_trace(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_checked, true); + WARN_ON_ONCE(t->trc_reader_nesting); + WRITE_ONCE(t->trc_reader_nesting, 0); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_need_end))) + rcu_read_unlock_trace_special(t); +} + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + +/** + * call_rcu_tasks_trace() - Queue a callback trace task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_trace() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); + +/** + * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period + * + * Control will return to the caller some time after a trace rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_trace() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_trace(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section"); + synchronize_rcu_tasks_generic(&rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); + +/** + * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_trace(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_trace(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); + +static int __init rcu_spawn_tasks_trace_kthread(void) +{ + rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; + rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; + rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; + rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; + rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); + return 0; +} +core_initcall(rcu_spawn_tasks_trace_kthread); + +#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ +void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ -- cgit v1.2.3-70-g09d2 From 276c410448dbca357a2bc3539acfe04862e5f172 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Mar 2020 16:02:06 -0700 Subject: rcu-tasks: Split ->trc_reader_need_end This commit splits ->trc_reader_need_end by using the rcu_special union. This change permits readers to check to see if a memory barrier is required without any added overhead in the common case where no such barrier is required. This commit also adds the read-side checking. Later commits will add the machinery to properly set the new ->trc_reader_special.b.need_mb field. This commit also makes rcu_read_unlock_trace_special() tolerate nested read-side critical sections within interrupt and NMI handlers. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 11 +++++++---- include/linux/sched.h | 4 ++-- init/init_task.c | 1 + kernel/fork.c | 1 + kernel/rcu/tasks.h | 33 ++++++++++++++++++++------------- 5 files changed, 31 insertions(+), 19 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index ed97e10817bd..c42b365ca176 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -31,7 +31,7 @@ static inline int rcu_read_lock_trace_held(void) #ifdef CONFIG_TASKS_TRACE_RCU -void rcu_read_unlock_trace_special(struct task_struct *t); +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting); /** * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section @@ -50,6 +50,8 @@ static inline void rcu_read_lock_trace(void) struct task_struct *t = current; WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); + if (t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers rcu_lock_acquire(&rcu_trace_lock_map); } @@ -69,10 +71,11 @@ static inline void rcu_read_unlock_trace(void) rcu_lock_release(&rcu_trace_lock_map); nesting = READ_ONCE(t->trc_reader_nesting) - 1; - WRITE_ONCE(t->trc_reader_nesting, nesting); - if (likely(!READ_ONCE(t->trc_reader_need_end)) || nesting) + if (likely(!READ_ONCE(t->trc_reader_special.s)) || nesting) { + WRITE_ONCE(t->trc_reader_nesting, nesting); return; // We assume shallow reader nesting. - rcu_read_unlock_trace_special(t); + } + rcu_read_unlock_trace_special(t, nesting); } void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); diff --git a/include/linux/sched.h b/include/linux/sched.h index 864f60e51c41..9437b53cc603 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -613,7 +613,7 @@ union rcu_special { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ - u8 pad; /* No garbage from compiler! */ + u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; @@ -727,7 +727,7 @@ struct task_struct { #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; - bool trc_reader_need_end; + union rcu_special trc_reader_special; bool trc_reader_checked; struct list_head trc_holdout_list; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ diff --git a/init/init_task.c b/init/init_task.c index e8b3740ee598..825972daec32 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -143,6 +143,7 @@ struct task_struct init_task #endif #ifdef CONFIG_TASKS_TRACE_RCU .trc_reader_nesting = 0, + .trc_reader_special.s = 0, .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), #endif #ifdef CONFIG_CPUSETS diff --git a/kernel/fork.c b/kernel/fork.c index 72e9396235b4..96eb4b535ced 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1685,6 +1685,7 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; + p->trc_reader_special.s = 0; INIT_LIST_HEAD(&p->trc_holdout_list); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index eeac4a122234..17b1b9a31071 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -723,10 +723,17 @@ DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); /* If we are the last reader, wake up the grace-period kthread. */ -void rcu_read_unlock_trace_special(struct task_struct *t) +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { - WRITE_ONCE(t->trc_reader_need_end, false); - if (atomic_dec_and_test(&trc_n_readers_need_end)) + int nq = t->trc_reader_special.b.need_qs; + + if (t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers. + // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. + if (nq) + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) wake_up(&trc_wait); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); @@ -777,8 +784,8 @@ static void trc_read_check_handler(void *t_in) // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - WARN_ON_ONCE(t->trc_reader_need_end); - WRITE_ONCE(t->trc_reader_need_end, true); + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); reset_ipi: // Allow future IPIs to be sent on CPU and for task. @@ -804,8 +811,8 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // exit from that critical section. if (unlikely(t->trc_reader_nesting)) { atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(t->trc_reader_need_end); - WRITE_ONCE(t->trc_reader_need_end, true); + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); } return true; } @@ -884,7 +891,7 @@ static void rcu_tasks_trace_pregp_step(void) static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { - WRITE_ONCE(t->trc_reader_need_end, false); + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; trc_wait_for_one_reader(t, hop); @@ -916,7 +923,7 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".i"[is_idle_task(t)], ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], t->trc_reader_nesting, - " N"[!!t->trc_reader_need_end], + " N"[!!t->trc_reader_special.b.need_qs], cpu); sched_show_task(t); } @@ -980,11 +987,11 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) break; // Count reached zero. // Stall warning time, so make a list of the offenders. for_each_process_thread(g, t) - if (READ_ONCE(t->trc_reader_need_end)) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) trc_add_holdout(t, &holdouts); firstreport = true; list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) - if (READ_ONCE(t->trc_reader_need_end)) { + if (READ_ONCE(t->trc_reader_special.b.need_qs)) { show_stalled_task_trace(t, &firstreport); trc_del_holdout(t); } @@ -1003,8 +1010,8 @@ void exit_tasks_rcu_finish_trace(struct task_struct *t) WRITE_ONCE(t->trc_reader_checked, true); WARN_ON_ONCE(t->trc_reader_nesting); WRITE_ONCE(t->trc_reader_nesting, 0); - if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_need_end))) - rcu_read_unlock_trace_special(t); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + rcu_read_unlock_trace_special(t, 0); } /** -- cgit v1.2.3-70-g09d2 From 3f2c788a13143620c5471ac96ac4f033fc9ac3f3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 7 May 2020 12:32:14 +0200 Subject: fork: prevent accidental access to clone3 features Jan reported an issue where an interaction between sign-extending clone's flag argument on ppc64le and the new CLONE_INTO_CGROUP feature causes clone() to consistently fail with EBADF. The whole story is a little longer. The legacy clone() syscall is odd in a bunch of ways and here two things interact. First, legacy clone's flag argument is word-size dependent, i.e. it's an unsigned long whereas most system calls with flag arguments use int or unsigned int. Second, legacy clone() ignores unknown and deprecated flags. The two of them taken together means that users on 64bit systems can pass garbage for the upper 32bit of the clone() syscall since forever and things would just work fine. Just try this on a 64bit kernel prior to v5.7-rc1 where this will succeed and on v5.7-rc1 where this will fail with EBADF: int main(int argc, char *argv[]) { pid_t pid; /* Note that legacy clone() has different argument ordering on * different architectures so this won't work everywhere. * * Only set the upper 32 bits. */ pid = syscall(__NR_clone, 0xffffffff00000000 | SIGCHLD, NULL, NULL, NULL, NULL); if (pid < 0) exit(EXIT_FAILURE); if (pid == 0) exit(EXIT_SUCCESS); if (wait(NULL) != pid) exit(EXIT_FAILURE); exit(EXIT_SUCCESS); } Since legacy clone() couldn't be extended this was not a problem so far and nobody really noticed or cared since nothing in the kernel ever bothered to look at the upper 32 bits. But once we introduced clone3() and expanded the flag argument in struct clone_args to 64 bit we opened this can of worms. With the first flag-based extension to clone3() making use of the upper 32 bits of the flag argument we've effectively made it possible for the legacy clone() syscall to reach clone3() only flags. The sign extension scenario is just the odd corner-case that we needed to figure this out. The reason we just realized this now and not already when we introduced CLONE_CLEAR_SIGHAND was that CLONE_INTO_CGROUP assumes that a valid cgroup file descriptor has been given. So the sign extension (or the user accidently passing garbage for the upper 32 bits) caused the CLONE_INTO_CGROUP bit to be raised and the kernel to error out when it didn't find a valid cgroup file descriptor. Let's fix this by always capping the upper 32 bits for all codepaths that are not aware of clone3() features. This ensures that we can't reach clone3() only features by accident via legacy clone as with the sign extension case and also that legacy clone() works exactly like before, i.e. ignoring any unknown flags. This solution risks no regressions and is also pretty clean. Fixes: 7f192e3cd316 ("fork: add clone3") Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups") Reported-by: Jan Stancek Signed-off-by: Christian Brauner Cc: Arnd Bergmann Cc: Dmitry V. Levin Cc: Andreas Schwab Cc: Florian Weimer Cc: libc-alpha@sourceware.org Cc: stable@vger.kernel.org # 5.3+ Link: https://sourceware.org/pipermail/libc-alpha/2020-May/113596.html Link: https://lore.kernel.org/r/20200507103214.77218-1-christian.brauner@ubuntu.com --- kernel/fork.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 8c700f881d92..48ed22774efa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2486,11 +2486,11 @@ long do_fork(unsigned long clone_flags, int __user *child_tidptr) { struct kernel_clone_args args = { - .flags = (clone_flags & ~CSIGNAL), + .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), .pidfd = parent_tidptr, .child_tid = child_tidptr, .parent_tid = parent_tidptr, - .exit_signal = (clone_flags & CSIGNAL), + .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), .stack = stack_start, .stack_size = stack_size, }; @@ -2508,8 +2508,9 @@ long do_fork(unsigned long clone_flags, pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (flags & CSIGNAL), + .flags = ((lower_32_bits(flags) | CLONE_VM | + CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (lower_32_bits(flags) & CSIGNAL), .stack = (unsigned long)fn, .stack_size = (unsigned long)arg, }; @@ -2570,11 +2571,11 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #endif { struct kernel_clone_args args = { - .flags = (clone_flags & ~CSIGNAL), + .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), .pidfd = parent_tidptr, .child_tid = child_tidptr, .parent_tid = parent_tidptr, - .exit_signal = (clone_flags & CSIGNAL), + .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), .stack = newsp, .tls = tls, }; -- cgit v1.2.3-70-g09d2 From d08b9f0ca6605e13dcb48f04e55a30545b3c71eb Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 27 Apr 2020 09:00:07 -0700 Subject: scs: Add support for Clang's Shadow Call Stack (SCS) This change adds generic support for Clang's Shadow Call Stack, which uses a shadow stack to protect return addresses from being overwritten by an attacker. Details are available here: https://clang.llvm.org/docs/ShadowCallStack.html Note that security guarantees in the kernel differ from the ones documented for user space. The kernel must store addresses of shadow stacks in memory, which means an attacker capable reading and writing arbitrary memory may be able to locate them and hijack control flow by modifying the stacks. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Reviewed-by: Miguel Ojeda [will: Numerous cosmetic changes] Signed-off-by: Will Deacon --- Makefile | 6 ++++ arch/Kconfig | 24 +++++++++++++++ include/linux/compiler-clang.h | 4 +++ include/linux/compiler_types.h | 4 +++ include/linux/scs.h | 68 ++++++++++++++++++++++++++++++++++++++++++ init/init_task.c | 8 +++++ kernel/Makefile | 1 + kernel/fork.c | 9 ++++++ kernel/sched/core.c | 2 ++ kernel/scs.c | 65 ++++++++++++++++++++++++++++++++++++++++ 10 files changed, 191 insertions(+) create mode 100644 include/linux/scs.h create mode 100644 kernel/scs.c (limited to 'kernel/fork.c') diff --git a/Makefile b/Makefile index 679f302a8b8b..33dc0d0cdd08 100644 --- a/Makefile +++ b/Makefile @@ -866,6 +866,12 @@ ifdef CONFIG_LIVEPATCH KBUILD_CFLAGS += $(call cc-option, -flive-patching=inline-clone) endif +ifdef CONFIG_SHADOW_CALL_STACK +CC_FLAGS_SCS := -fsanitize=shadow-call-stack +KBUILD_CFLAGS += $(CC_FLAGS_SCS) +export CC_FLAGS_SCS +endif + # arch Makefile may override CC so keep this after arch Makefile is included NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include) diff --git a/arch/Kconfig b/arch/Kconfig index 786a85d4ad40..334a3d9b19df 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -533,6 +533,30 @@ config STACKPROTECTOR_STRONG about 20% of all kernel functions, which increases the kernel code size by about 2%. +config ARCH_SUPPORTS_SHADOW_CALL_STACK + bool + help + An architecture should select this if it supports Clang's Shadow + Call Stack, has asm/scs.h, and implements runtime support for shadow + stack switching. + +config SHADOW_CALL_STACK + bool "Clang Shadow Call Stack" + depends on CC_IS_CLANG && ARCH_SUPPORTS_SHADOW_CALL_STACK + help + This option enables Clang's Shadow Call Stack, which uses a + shadow stack to protect function return addresses from being + overwritten by an attacker. More information can be found in + Clang's documentation: + + https://clang.llvm.org/docs/ShadowCallStack.html + + Note that security guarantees in the kernel differ from the + ones documented for user space. The kernel must store addresses + of shadow stacks in memory, which means an attacker capable of + reading and writing arbitrary memory may be able to locate them + and hijack control flow by modifying the stacks. + config HAVE_ARCH_WITHIN_STACK_FRAMES bool help diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 333a6695a918..790c0c6b8552 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -42,3 +42,7 @@ * compilers, like ICC. */ #define barrier() __asm__ __volatile__("" : : : "memory") + +#if __has_feature(shadow_call_stack) +# define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) +#endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index e970f97a7fcb..97b62f47a80d 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -193,6 +193,10 @@ struct ftrace_likely_data { # define randomized_struct_fields_end #endif +#ifndef __noscs +# define __noscs +#endif + #ifndef asm_volatile_goto #define asm_volatile_goto(x...) asm goto(x) #endif diff --git a/include/linux/scs.h b/include/linux/scs.h new file mode 100644 index 000000000000..3f3662621a27 --- /dev/null +++ b/include/linux/scs.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#ifndef _LINUX_SCS_H +#define _LINUX_SCS_H + +#include +#include +#include +#include + +#ifdef CONFIG_SHADOW_CALL_STACK + +/* + * In testing, 1 KiB shadow stack size (i.e. 128 stack frames on a 64-bit + * architecture) provided ~40% safety margin on stack usage while keeping + * memory allocation overhead reasonable. + */ +#define SCS_SIZE SZ_1K +#define GFP_SCS (GFP_KERNEL | __GFP_ZERO) + +/* An illegal pointer value to mark the end of the shadow stack. */ +#define SCS_END_MAGIC (0x5f6UL + POISON_POINTER_DELTA) + +#define task_scs(tsk) (task_thread_info(tsk)->scs_base) +#define task_scs_offset(tsk) (task_thread_info(tsk)->scs_offset) + +void scs_init(void); +int scs_prepare(struct task_struct *tsk, int node); +void scs_release(struct task_struct *tsk); + +static inline void scs_task_reset(struct task_struct *tsk) +{ + /* + * Reset the shadow stack to the base address in case the task + * is reused. + */ + task_scs_offset(tsk) = 0; +} + +static inline unsigned long *__scs_magic(void *s) +{ + return (unsigned long *)(s + SCS_SIZE) - 1; +} + +static inline bool scs_corrupted(struct task_struct *tsk) +{ + unsigned long *magic = __scs_magic(task_scs(tsk)); + + return (task_scs_offset(tsk) >= SCS_SIZE - 1 || + READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC); +} + +#else /* CONFIG_SHADOW_CALL_STACK */ + +static inline void scs_init(void) {} +static inline void scs_task_reset(struct task_struct *tsk) {} +static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; } +static inline bool scs_corrupted(struct task_struct *tsk) { return false; } +static inline void scs_release(struct task_struct *tsk) {} + +#endif /* CONFIG_SHADOW_CALL_STACK */ + +#endif /* _LINUX_SCS_H */ diff --git a/init/init_task.c b/init/init_task.c index bd403ed3e418..169e34066d35 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,13 @@ static struct sighand_struct init_sighand = { .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), }; +#ifdef CONFIG_SHADOW_CALL_STACK +unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] + __init_task_data = { + [(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC +}; +#endif + /* * Set up the first task table, touch at your own risk!. Base=0, * limit=0x1fffff (=2MB) diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..c332eb9d4841 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ +obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/fork.c b/kernel/fork.c index 8c700f881d92..f6339f9d232d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include @@ -456,6 +457,8 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { + scs_release(tsk); + #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, @@ -840,6 +843,8 @@ void __init fork_init(void) NULL, free_vm_stack_cache); #endif + scs_init(); + lockdep_init_task(&init_task); uprobes_init(); } @@ -899,6 +904,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (err) goto free_stack; + err = scs_prepare(tsk, node); + if (err) + goto free_stack; + #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a2fbf98fd6f..934e03cfaec7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -6040,6 +6041,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; + scs_task_reset(idle); kasan_unpoison_task_stack(idle); #ifdef CONFIG_SMP diff --git a/kernel/scs.c b/kernel/scs.c new file mode 100644 index 000000000000..38f8f31c9451 --- /dev/null +++ b/kernel/scs.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#include +#include +#include +#include + +static struct kmem_cache *scs_cache; + +static void *scs_alloc(int node) +{ + void *s; + + s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + if (s) { + *__scs_magic(s) = SCS_END_MAGIC; + /* + * Poison the allocation to catch unintentional accesses to + * the shadow stack when KASAN is enabled. + */ + kasan_poison_object_data(scs_cache, s); + } + + return s; +} + +static void scs_free(void *s) +{ + kasan_unpoison_object_data(scs_cache, s); + kmem_cache_free(scs_cache, s); +} + +void __init scs_init(void) +{ + scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); +} + +int scs_prepare(struct task_struct *tsk, int node) +{ + void *s = scs_alloc(node); + + if (!s) + return -ENOMEM; + + task_scs(tsk) = s; + task_scs_offset(tsk) = 0; + + return 0; +} + +void scs_release(struct task_struct *tsk) +{ + void *s = task_scs(tsk); + + if (!s) + return; + + WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + scs_free(s); +} -- cgit v1.2.3-70-g09d2 From 9d78edeaec759f997c303f286ecd39daee166f2a Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 18 May 2020 20:07:38 +0200 Subject: proc: proc_pid_ns takes super_block as an argument syzbot found that touch /proc/testfile causes NULL pointer dereference at tomoyo_get_local_path() because inode of the dentry is NULL. Before c59f415a7cb6, Tomoyo received pid_ns from proc's s_fs_info directly. Since proc_pid_ns() can only work with inode, using it in the tomoyo_get_local_path() was wrong. To avoid creating more functions for getting proc_ns, change the argument type of the proc_pid_ns() function. Then, Tomoyo can use the existing super_block to get pid_ns. Link: https://lkml.kernel.org/r/0000000000002f0c7505a5b0e04c@google.com Link: https://lkml.kernel.org/r/20200518180738.2939611-1-gladkov.alexey@gmail.com Reported-by: syzbot+c1af344512918c61362c@syzkaller.appspotmail.com Fixes: c59f415a7cb6 ("Use proc_pid_ns() to get pid_namespace from the proc superblock") Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman --- fs/locks.c | 4 ++-- fs/proc/array.c | 2 +- fs/proc/base.c | 10 +++++----- fs/proc/self.c | 2 +- fs/proc/thread_self.c | 2 +- include/linux/proc_fs.h | 4 ++-- kernel/fork.c | 2 +- net/ipv6/ip6_flowlabel.c | 2 +- security/tomoyo/realpath.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel/fork.c') diff --git a/fs/locks.c b/fs/locks.c index 399c5dbb72c4..ab702d6efb55 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2823,7 +2823,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, { struct inode *inode = NULL; unsigned int fl_pid; - struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)); + struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); fl_pid = locks_translate_pid(fl, proc_pidns); /* @@ -2901,7 +2901,7 @@ static int locks_show(struct seq_file *f, void *v) { struct locks_iterator *iter = f->private; struct file_lock *fl, *bfl; - struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)); + struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); fl = hlist_entry(v, struct file_lock, fl_link); diff --git a/fs/proc/array.c b/fs/proc/array.c index 8e16f14bb05a..043311014db2 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -728,7 +728,7 @@ static int children_seq_show(struct seq_file *seq, void *v) { struct inode *inode = file_inode(seq->file); - seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode))); + seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode->i_sb))); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 5a307b3bb2d1..30c9fceca0b7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -754,7 +754,7 @@ static const struct inode_operations proc_def_inode_operations = { static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; @@ -1423,7 +1423,7 @@ static const struct file_operations proc_fail_nth_operations = { static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct task_struct *p; p = get_proc_task(inode); @@ -2466,7 +2466,7 @@ static int proc_timers_open(struct inode *inode, struct file *file) return -ENOMEM; tp->pid = proc_pid(inode); - tp->ns = proc_pid_ns(inode); + tp->ns = proc_pid_ns(inode->i_sb); return 0; } @@ -3377,7 +3377,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); - struct pid_namespace *ns = proc_pid_ns(file_inode(file)); + struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) @@ -3730,7 +3730,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - ns = proc_pid_ns(inode); + ns = proc_pid_ns(inode->i_sb); tid = (int)file->f_version; file->f_version = 0; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); diff --git a/fs/proc/self.c b/fs/proc/self.c index 309301ac0136..ca5158fa561c 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -12,7 +12,7 @@ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 2493cbbdfa6f..ac284f409568 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -12,7 +12,7 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); char *name; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 2cb424e6f36a..6ec524d8842c 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -202,9 +202,9 @@ int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)); /* get the associated pid namespace for a file in procfs */ -static inline struct pid_namespace *proc_pid_ns(const struct inode *inode) +static inline struct pid_namespace *proc_pid_ns(struct super_block *sb) { - return proc_sb_info(inode->i_sb)->pid_ns; + return proc_sb_info(sb)->pid_ns; } #endif /* _LINUX_PROC_FS_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 4385f3d639f2..e7bdaccad942 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1745,7 +1745,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) pid_t nr = -1; if (likely(pid_has_task(pid, PIDTYPE_PID))) { - ns = proc_pid_ns(file_inode(m->file)); + ns = proc_pid_ns(file_inode(m->file)->i_sb); nr = pid_nr_ns(pid, ns); } diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index d64b83e85642..ce4fbba4acce 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -779,7 +779,7 @@ static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); - state->pid_ns = proc_pid_ns(file_inode(seq->file)); + state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb); rcu_read_lock_bh(); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index 08b096e2f7e3..df4798980416 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -162,7 +162,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') { char *ep; const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10); - struct pid_namespace *proc_pidns = proc_pid_ns(d_inode(dentry)); + struct pid_namespace *proc_pidns = proc_pid_ns(sb); if (*ep == '/' && pid && pid == task_tgid_nr_ns(current, proc_pidns)) { -- cgit v1.2.3-70-g09d2 From e31cf2f4ca422ac9b14ecc4a1295b8977a20f812 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jun 2020 21:32:33 -0700 Subject: mm: don't include asm/pgtable.h if linux/mm.h is already included Patch series "mm: consolidate definitions of page table accessors", v2. The low level page table accessors (pXY_index(), pXY_offset()) are duplicated across all architectures and sometimes more than once. For instance, we have 31 definition of pgd_offset() for 25 supported architectures. Most of these definitions are actually identical and typically it boils down to, e.g. static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } These definitions can be shared among 90% of the arches provided XYZ_SHIFT, PTRS_PER_XYZ and xyz_page_vaddr() are defined. For architectures that really need a custom version there is always possibility to override the generic version with the usual ifdefs magic. These patches introduce include/linux/pgtable.h that replaces include/asm-generic/pgtable.h and add the definitions of the page table accessors to the new header. This patch (of 12): The linux/mm.h header includes to allow inlining of the functions involving page table manipulations, e.g. pte_alloc() and pmd_alloc(). So, there is no point to explicitly include in the files that include . The include statements in such cases are remove with a simple loop: for f in $(git grep -l "include ") ; do sed -i -e '/include / d' $f done Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ley Foon Tan Cc: Mark Salter Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Rapoport Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200514170327.31389-1-rppt@kernel.org Link: http://lkml.kernel.org/r/20200514170327.31389-2-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/alpha/boot/bootp.c | 1 - arch/alpha/boot/bootpz.c | 1 - arch/alpha/boot/main.c | 1 - arch/alpha/include/asm/io.h | 1 - arch/alpha/kernel/process.c | 1 - arch/alpha/kernel/ptrace.c | 1 - arch/alpha/kernel/setup.c | 1 - arch/alpha/kernel/smp.c | 1 - arch/alpha/kernel/sys_alcor.c | 1 - arch/alpha/kernel/sys_cabriolet.c | 1 - arch/alpha/kernel/sys_dp264.c | 1 - arch/alpha/kernel/sys_eb64p.c | 1 - arch/alpha/kernel/sys_eiger.c | 1 - arch/alpha/kernel/sys_jensen.c | 1 - arch/alpha/kernel/sys_marvel.c | 1 - arch/alpha/kernel/sys_miata.c | 1 - arch/alpha/kernel/sys_mikasa.c | 1 - arch/alpha/kernel/sys_nautilus.c | 1 - arch/alpha/kernel/sys_noritake.c | 1 - arch/alpha/kernel/sys_rawhide.c | 1 - arch/alpha/kernel/sys_ruffian.c | 1 - arch/alpha/kernel/sys_rx164.c | 1 - arch/alpha/kernel/sys_sable.c | 1 - arch/alpha/kernel/sys_sio.c | 1 - arch/alpha/kernel/sys_sx164.c | 1 - arch/alpha/kernel/sys_takara.c | 1 - arch/alpha/kernel/sys_titan.c | 1 - arch/alpha/kernel/sys_wildfire.c | 1 - arch/alpha/mm/init.c | 1 - arch/arm/kernel/machine_kexec.c | 1 - arch/arm/kernel/module.c | 1 - arch/arm/kernel/ptrace.c | 1 - arch/arm/kernel/smp.c | 1 - arch/arm/mach-ebsa110/core.c | 1 - arch/arm/mach-footbridge/common.c | 1 - arch/arm/mach-imx/mm-imx21.c | 1 - arch/arm/mach-imx/mm-imx27.c | 1 - arch/arm/mach-imx/mm-imx3.c | 1 - arch/arm/mach-iop32x/i2c.c | 1 - arch/arm/mach-iop32x/iq31244.c | 1 - arch/arm/mach-iop32x/iq80321.c | 1 - arch/arm/mach-iop32x/n2100.c | 1 - arch/arm/mach-ixp4xx/common.c | 1 - arch/arm/mach-sa1100/assabet.c | 1 - arch/arm/mm/copypage-v4mc.c | 1 - arch/arm/mm/copypage-v6.c | 1 - arch/arm/mm/copypage-xscale.c | 1 - arch/arm/mm/dump.c | 1 - arch/arm/mm/fault-armv.c | 1 - arch/arm/mm/fault.c | 1 - arch/arm/mm/pageattr.c | 1 - arch/arm64/kernel/hibernate.c | 1 - arch/arm64/kernel/ptrace.c | 1 - arch/arm64/kernel/smp.c | 1 - arch/arm64/mm/dump.c | 1 - arch/arm64/mm/fault.c | 1 - arch/arm64/mm/kasan_init.c | 1 - arch/arm64/mm/pageattr.c | 1 - arch/csky/kernel/module.c | 1 - arch/csky/kernel/ptrace.c | 1 - arch/csky/mm/init.c | 1 - arch/csky/mm/tlb.c | 1 - arch/h8300/kernel/process.c | 1 - arch/h8300/kernel/setup.c | 1 - arch/h8300/kernel/signal.c | 1 - arch/h8300/mm/fault.c | 1 - arch/h8300/mm/init.c | 1 - arch/h8300/mm/memory.c | 1 - arch/hexagon/mm/vm_fault.c | 1 - arch/ia64/kernel/efi.c | 1 - arch/ia64/kernel/ptrace.c | 1 - arch/ia64/kernel/smp.c | 1 - arch/ia64/kernel/smpboot.c | 1 - arch/ia64/mm/contig.c | 1 - arch/ia64/mm/fault.c | 1 - arch/m68k/68000/timers.c | 1 - arch/m68k/amiga/config.c | 1 - arch/m68k/apollo/config.c | 1 - arch/m68k/atari/atasound.c | 1 - arch/m68k/atari/stram.c | 1 - arch/m68k/bvme6000/config.c | 1 - arch/m68k/kernel/process.c | 1 - arch/m68k/kernel/ptrace.c | 1 - arch/m68k/kernel/setup_no.c | 1 - arch/m68k/kernel/signal.c | 1 - arch/m68k/kernel/uboot.c | 1 - arch/m68k/mac/config.c | 1 - arch/m68k/mm/mcfmmu.c | 1 - arch/m68k/mm/sun3kmap.c | 1 - arch/m68k/mm/sun3mmu.c | 1 - arch/m68k/mvme147/config.c | 1 - arch/m68k/mvme16x/config.c | 1 - arch/m68k/q40/config.c | 1 - arch/m68k/sun3/config.c | 1 - arch/m68k/sun3/dvma.c | 1 - arch/m68k/sun3/mmu_emu.c | 1 - arch/m68k/sun3/sun3dvma.c | 1 - arch/m68k/sun3x/dvma.c | 1 - arch/m68k/sun3x/prom.c | 1 - arch/microblaze/kernel/signal.c | 1 - arch/microblaze/mm/fault.c | 1 - arch/mips/fw/arc/memory.c | 1 - arch/mips/include/asm/mach-generic/floppy.h | 1 - arch/mips/include/asm/mach-jazz/floppy.h | 1 - arch/mips/jazz/jazzdma.c | 1 - arch/mips/kernel/module.c | 1 - arch/mips/kernel/process.c | 1 - arch/mips/kernel/ptrace.c | 1 - arch/mips/kernel/ptrace32.c | 1 - arch/mips/kernel/smp-bmips.c | 1 - arch/mips/kernel/traps.c | 1 - arch/mips/kvm/tlb.c | 1 - arch/mips/lib/dump_tlb.c | 1 - arch/mips/lib/r3k_dump_tlb.c | 1 - arch/mips/mm/c-octeon.c | 1 - arch/mips/mm/c-r3k.c | 1 - arch/mips/mm/c-r4k.c | 1 - arch/mips/mm/c-tx39.c | 1 - arch/mips/mm/init.c | 1 - arch/mips/mm/page.c | 1 - arch/mips/mm/pgtable-32.c | 1 - arch/mips/mm/pgtable-64.c | 1 - arch/mips/mm/sc-ip22.c | 1 - arch/mips/mm/sc-mips.c | 1 - arch/mips/mm/sc-r5k.c | 1 - arch/mips/mm/tlb-r3k.c | 1 - arch/mips/mm/tlb-r4k.c | 1 - arch/mips/sgi-ip27/ip27-init.c | 1 - arch/mips/sgi-ip27/ip27-timer.c | 1 - arch/mips/sgi-ip32/ip32-memory.c | 1 - arch/nds32/mm/fault.c | 1 - arch/nds32/mm/proc.c | 1 - arch/nios2/kernel/module.c | 1 - arch/nios2/mm/init.c | 1 - arch/nios2/mm/pgtable.c | 1 - arch/nios2/mm/tlb.c | 1 - arch/openrisc/include/asm/tlbflush.h | 1 - arch/openrisc/kernel/asm-offsets.c | 1 - arch/openrisc/kernel/process.c | 1 - arch/openrisc/kernel/ptrace.c | 1 - arch/openrisc/kernel/setup.c | 1 - arch/openrisc/kernel/traps.c | 1 - arch/openrisc/mm/init.c | 1 - arch/openrisc/mm/tlb.c | 1 - arch/parisc/include/asm/mmu_context.h | 1 - arch/parisc/kernel/module.c | 1 - arch/parisc/kernel/ptrace.c | 1 - arch/parisc/kernel/smp.c | 1 - arch/parisc/mm/init.c | 1 - arch/powerpc/include/asm/io.h | 1 - arch/powerpc/kernel/asm-offsets.c | 1 - arch/powerpc/kernel/process.c | 1 - arch/powerpc/kernel/signal_32.c | 1 - arch/powerpc/kernel/signal_64.c | 1 - arch/powerpc/kernel/traps.c | 1 - arch/powerpc/kernel/vdso.c | 1 - arch/powerpc/lib/code-patching.c | 1 - arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 1 - arch/powerpc/mm/book3s64/hash_pgtable.c | 1 - arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 1 - arch/powerpc/mm/book3s64/radix_pgtable.c | 1 - arch/powerpc/mm/fault.c | 1 - arch/powerpc/mm/hugetlbpage.c | 1 - arch/powerpc/mm/init_32.c | 1 - arch/powerpc/mm/init_64.c | 1 - arch/powerpc/mm/mem.c | 1 - arch/powerpc/mm/nohash/40x.c | 1 - arch/powerpc/mm/nohash/fsl_booke.c | 1 - arch/powerpc/mm/pgtable_32.c | 1 - arch/powerpc/mm/pgtable_64.c | 1 - arch/powerpc/mm/ptdump/hashpagetable.c | 1 - arch/powerpc/mm/ptdump/ptdump.c | 1 - arch/powerpc/perf/callchain.c | 1 - arch/powerpc/perf/callchain_32.c | 1 - arch/powerpc/perf/callchain_64.c | 1 - arch/powerpc/platforms/8xx/cpm1.c | 1 - arch/powerpc/platforms/8xx/micropatch.c | 1 - arch/powerpc/platforms/cell/setup.c | 1 - arch/powerpc/platforms/chrp/setup.c | 1 - arch/powerpc/platforms/maple/setup.c | 1 - arch/powerpc/platforms/maple/time.c | 1 - arch/powerpc/platforms/powermac/setup.c | 1 - arch/powerpc/platforms/powermac/time.c | 1 - arch/powerpc/platforms/pseries/setup.c | 1 - arch/powerpc/sysdev/cpm2.c | 1 - arch/powerpc/xmon/xmon.c | 1 - arch/riscv/kernel/setup.c | 1 - arch/riscv/mm/init.c | 1 - arch/s390/include/asm/tlbflush.h | 1 - arch/s390/kernel/machine_kexec.c | 1 - arch/s390/kernel/ptrace.c | 1 - arch/s390/kernel/vdso.c | 1 - arch/s390/mm/dump_pagetables.c | 1 - arch/s390/mm/fault.c | 1 - arch/s390/mm/init.c | 1 - arch/s390/mm/pageattr.c | 1 - arch/s390/mm/pgtable.c | 1 - arch/s390/mm/vmem.c | 1 - arch/sh/kernel/machine_kexec.c | 1 - arch/sh/kernel/ptrace_32.c | 1 - arch/sh/kernel/signal_32.c | 1 - arch/sh/mm/cache-sh3.c | 1 - arch/sh/mm/cache-sh4.c | 1 - arch/sh/mm/cache-sh7705.c | 1 - arch/sh/mm/nommu.c | 1 - arch/sparc/kernel/leon_smp.c | 1 - arch/sparc/kernel/process_32.c | 1 - arch/sparc/kernel/process_64.c | 1 - arch/sparc/kernel/ptrace_32.c | 1 - arch/sparc/kernel/ptrace_64.c | 1 - arch/sparc/kernel/setup_32.c | 1 - arch/sparc/kernel/setup_64.c | 1 - arch/sparc/kernel/signal32.c | 1 - arch/sparc/kernel/signal_32.c | 1 - arch/sparc/kernel/signal_64.c | 1 - arch/sparc/kernel/smp_32.c | 1 - arch/sparc/kernel/smp_64.c | 1 - arch/sparc/kernel/traps_64.c | 1 - arch/sparc/mm/fault_32.c | 1 - arch/sparc/mm/fault_64.c | 1 - arch/sparc/mm/hugetlbpage.c | 1 - arch/sparc/mm/init_32.c | 1 - arch/sparc/mm/init_64.c | 1 - arch/sparc/mm/io-unit.c | 1 - arch/sparc/mm/iommu.c | 1 - arch/sparc/mm/tlb.c | 1 - arch/um/kernel/process.c | 1 - arch/um/kernel/skas/mmu.c | 1 - arch/um/kernel/skas/uaccess.c | 1 - arch/um/kernel/tlb.c | 1 - arch/um/kernel/trap.c | 1 - arch/um/kernel/um_arch.c | 1 - arch/unicore32/kernel/module.c | 1 - arch/unicore32/mm/fault.c | 1 - arch/x86/include/asm/iomap.h | 1 - arch/x86/include/asm/xen/page.h | 1 - arch/x86/kernel/alternative.c | 1 - arch/x86/kernel/amd_gart_64.c | 1 - arch/x86/kernel/doublefault_32.c | 1 - arch/x86/kernel/machine_kexec_32.c | 1 - arch/x86/kernel/machine_kexec_64.c | 1 - arch/x86/kernel/module.c | 1 - arch/x86/kernel/process_32.c | 1 - arch/x86/kernel/process_64.c | 1 - arch/x86/kernel/ptrace.c | 1 - arch/x86/kernel/tboot.c | 1 - arch/x86/mm/dump_pagetables.c | 1 - arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 1 - arch/x86/mm/kasan_init_64.c | 1 - arch/x86/mm/pat/cpa-test.c | 1 - arch/x86/mm/pat/memtype.c | 1 - arch/x86/mm/pgtable.c | 1 - arch/x86/mm/pgtable_32.c | 1 - arch/x86/mm/pti.c | 1 - arch/x86/platform/efi/efi_64.c | 1 - arch/x86/xen/enlighten_pv.c | 1 - arch/x86/xen/grant-table.c | 1 - arch/xtensa/kernel/process.c | 1 - arch/xtensa/kernel/ptrace.c | 1 - arch/xtensa/kernel/setup.c | 1 - drivers/char/agp/frontend.c | 1 - drivers/char/agp/generic.c | 1 - drivers/char/bsr.c | 1 - drivers/char/mspec.c | 1 - drivers/gpu/drm/i915/i915_mm.c | 1 - drivers/infiniband/sw/rdmavt/mmap.c | 1 - drivers/infiniband/sw/rxe/rxe_mmap.c | 1 - drivers/media/platform/davinci/vpbe_display.c | 1 - drivers/media/v4l2-core/v4l2-common.c | 1 - drivers/misc/sgi-gru/grufault.c | 1 - drivers/net/ethernet/sun/sunhme.c | 1 - drivers/sbus/char/flash.c | 1 - drivers/sbus/char/uctrl.c | 1 - drivers/scsi/a2091.c | 1 - drivers/scsi/a3000.c | 1 - drivers/scsi/gvp11.c | 1 - drivers/scsi/lasi700.c | 1 - drivers/scsi/mvme147.c | 1 - drivers/scsi/sni_53c710.c | 1 - drivers/video/console/newport_con.c | 1 - drivers/video/fbdev/acornfb.c | 1 - drivers/video/fbdev/atafb.c | 1 - drivers/video/fbdev/cirrusfb.c | 1 - drivers/video/fbdev/cyber2000fb.c | 1 - drivers/video/fbdev/fb-puv3.c | 1 - drivers/video/fbdev/hitfb.c | 1 - drivers/video/fbdev/neofb.c | 1 - drivers/video/fbdev/q40fb.c | 1 - drivers/video/fbdev/savage/savagefb_driver.c | 1 - drivers/xen/balloon.c | 1 - drivers/xen/grant-table.c | 1 - drivers/xen/privcmd.c | 1 - drivers/xen/xenbus/xenbus_probe.c | 1 - drivers/xen/xenbus/xenbus_probe_backend.c | 1 - drivers/xen/xenbus/xenbus_probe_frontend.c | 1 - fs/proc/array.c | 1 - fs/proc/meminfo.c | 1 - fs/proc/nommu.c | 1 - fs/proc/vmcore.c | 1 - include/linux/dax.h | 1 - init/init_task.c | 1 - kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/power/snapshot.c | 1 - lib/ioremap.c | 1 - mm/debug_vm_pgtable.c | 1 - mm/gup.c | 1 - mm/hugetlb.c | 1 - mm/memory.c | 1 - mm/page_io.c | 1 - mm/shmem.c | 1 - mm/sparse-vmemmap.c | 1 - mm/sparse.c | 1 - mm/swap_state.c | 1 - mm/swapfile.c | 1 - mm/vmacache.c | 1 - sound/core/sgbuf.c | 1 - virt/kvm/kvm_main.c | 1 - 319 files changed, 319 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/alpha/boot/bootp.c b/arch/alpha/boot/bootp.c index 95c0359f4858..00266e6e1b71 100644 --- a/arch/alpha/boot/bootp.c +++ b/arch/alpha/boot/bootp.c @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/bootpz.c b/arch/alpha/boot/bootpz.c index 99b8d7dc344b..43af71835adf 100644 --- a/arch/alpha/boot/bootpz.c +++ b/arch/alpha/boot/bootpz.c @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c index 8f5ed8610970..e5347a080008 100644 --- a/arch/alpha/boot/main.c +++ b/arch/alpha/boot/main.c @@ -14,7 +14,6 @@ #include #include -#include #include diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index d1ed5a8133c5..13bea465f1c0 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 48b81d015d8a..b45f0b0d6511 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index cb8d599e72d6..8c43212ae38e 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -19,7 +19,6 @@ #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 6fa802c495b4..f5c42a8fcf9c 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -55,7 +55,6 @@ static struct notifier_block alpha_panic_block = { }; #include -#include #include #include #include diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 52995bf413fe..631cc17410d1 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -36,7 +36,6 @@ #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_alcor.c b/arch/alpha/kernel/sys_alcor.c index ce5430056f65..e063b3857b3d 100644 --- a/arch/alpha/kernel/sys_alcor.c +++ b/arch/alpha/kernel/sys_alcor.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_cabriolet.c b/arch/alpha/kernel/sys_cabriolet.c index 0aa6a27d0e2f..47459b73cdb7 100644 --- a/arch/alpha/kernel/sys_cabriolet.c +++ b/arch/alpha/kernel/sys_cabriolet.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index d33508621820..9fb445d7dca5 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eb64p.c b/arch/alpha/kernel/sys_eb64p.c index 1cdfe55fb987..3c43fd347526 100644 --- a/arch/alpha/kernel/sys_eb64p.c +++ b/arch/alpha/kernel/sys_eb64p.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c index 016f79251141..bf99dcfd40c4 100644 --- a/arch/alpha/kernel/sys_eiger.c +++ b/arch/alpha/kernel/sys_eiger.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index d0d44f543d77..0a2ab6cb18db 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c index 533899a4a1a1..83d6c53d6d4d 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_miata.c b/arch/alpha/kernel/sys_miata.c index 702292af2225..e1bee8f84c58 100644 --- a/arch/alpha/kernel/sys_miata.c +++ b/arch/alpha/kernel/sys_miata.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_mikasa.c b/arch/alpha/kernel/sys_mikasa.c index 3af4f94113e1..7690dfd57cb6 100644 --- a/arch/alpha/kernel/sys_mikasa.c +++ b/arch/alpha/kernel/sys_mikasa.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c index 32850e45834b..53adf43dcd44 100644 --- a/arch/alpha/kernel/sys_nautilus.c +++ b/arch/alpha/kernel/sys_nautilus.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_noritake.c b/arch/alpha/kernel/sys_noritake.c index b106f327f765..47f3ce4f719a 100644 --- a/arch/alpha/kernel/sys_noritake.c +++ b/arch/alpha/kernel/sys_noritake.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_rawhide.c b/arch/alpha/kernel/sys_rawhide.c index b76f65d0e8b5..b5846ffdadce 100644 --- a/arch/alpha/kernel/sys_rawhide.c +++ b/arch/alpha/kernel/sys_rawhide.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_ruffian.c b/arch/alpha/kernel/sys_ruffian.c index d33074011960..4b1c8d85c4f0 100644 --- a/arch/alpha/kernel/sys_ruffian.c +++ b/arch/alpha/kernel/sys_ruffian.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_rx164.c b/arch/alpha/kernel/sys_rx164.c index 4d85eaeb44aa..94046f9aea08 100644 --- a/arch/alpha/kernel/sys_rx164.c +++ b/arch/alpha/kernel/sys_rx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c index 3cf0d32da5d8..930005b2f630 100644 --- a/arch/alpha/kernel/sys_sable.c +++ b/arch/alpha/kernel/sys_sable.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_sio.c b/arch/alpha/kernel/sys_sio.c index a6bdc1da47ad..7c420d8dac53 100644 --- a/arch/alpha/kernel/sys_sio.c +++ b/arch/alpha/kernel/sys_sio.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_sx164.c b/arch/alpha/kernel/sys_sx164.c index 17cc203176c8..dd9de84b630c 100644 --- a/arch/alpha/kernel/sys_sx164.c +++ b/arch/alpha/kernel/sys_sx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_takara.c b/arch/alpha/kernel/sys_takara.c index e230c6864088..9e2adb69bc74 100644 --- a/arch/alpha/kernel/sys_takara.c +++ b/arch/alpha/kernel/sys_takara.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index c8390d8de140..b1f3b4fcf99b 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_wildfire.c b/arch/alpha/kernel/sys_wildfire.c index 2191bde161fd..2c54d707142a 100644 --- a/arch/alpha/kernel/sys_wildfire.c +++ b/arch/alpha/kernel/sys_wildfire.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 667cd21393b5..3c42b3147fd6 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 76300f3813e8..974b6c64d3e6 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index af0a8500a24e..e15444b25ca0 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 4cc6a7eff635..d0f7c8896c96 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -25,7 +25,6 @@ #include #include -#include #include #define CREATE_TRACE_POINTS diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 46e1be9e57a8..9a6432557871 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c index 575b2e2b6759..5960e3dfd2bf 100644 --- a/arch/arm/mach-ebsa110/core.c +++ b/arch/arm/mach-ebsa110/core.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c index 015f75d1c98d..eee095f0e2f6 100644 --- a/arch/arm/mach-footbridge/common.c +++ b/arch/arm/mach-footbridge/common.c @@ -14,7 +14,6 @@ #include #include