diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/fork.c | 141 | ||||
-rw-r--r-- | kernel/gcov/base.c | 5 | ||||
-rw-r--r-- | kernel/pid.c | 15 | ||||
-rw-r--r-- | kernel/ptrace.c | 39 | ||||
-rw-r--r-- | kernel/signal.c | 14 | ||||
-rw-r--r-- | kernel/sys.c | 47 | ||||
-rw-r--r-- | kernel/sysctl.c | 16 |
7 files changed, 192 insertions, 85 deletions
diff --git a/kernel/fork.c b/kernel/fork.c index f2c1e7352298..03c1eaaa6ef5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -74,6 +74,7 @@ #include <linux/uprobes.h> #include <linux/aio.h> #include <linux/compiler.h> +#include <linux/sysctl.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -88,6 +89,16 @@ #include <trace/events/task.h> /* + * Minimum number of threads to boot the kernel + */ +#define MIN_THREADS 20 + +/* + * Maximum number of threads + */ +#define MAX_THREADS FUTEX_TID_MASK + +/* * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ @@ -253,7 +264,30 @@ EXPORT_SYMBOL_GPL(__put_task_struct); void __init __weak arch_task_cache_init(void) { } -void __init fork_init(unsigned long mempages) +/* + * set_max_threads + */ +static void set_max_threads(unsigned int max_threads_suggested) +{ + u64 threads; + + /* + * The number of threads shall be limited such that the thread + * structures may only consume a small part of the available memory. + */ + if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) + threads = MAX_THREADS; + else + threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + (u64) THREAD_SIZE * 8UL); + + if (threads > max_threads_suggested) + threads = max_threads_suggested; + + max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); +} + +void __init fork_init(void) { #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN @@ -268,18 +302,7 @@ void __init fork_init(unsigned long mempages) /* do the arch specific task caches init */ arch_task_cache_init(); - /* - * The default maximum number of threads is set to a safe - * value: the thread structures can take up at most half - * of memory. 
- */ - max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); - - /* - * we need to allow at least 20 threads to boot a system - */ - if (max_threads < 20) - max_threads = 20; + set_max_threads(MAX_THREADS); init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; @@ -380,6 +403,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + /* No ordering required: file already has been exposed. */ + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + mm->total_vm = oldmm->total_vm; mm->shared_vm = oldmm->shared_vm; mm->exec_vm = oldmm->exec_vm; @@ -505,7 +531,13 @@ static inline void mm_free_pgd(struct mm_struct *mm) pgd_free(mm, mm->pgd); } #else -#define dup_mmap(mm, oldmm) (0) +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + down_write(&oldmm->mmap_sem); + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + up_write(&oldmm->mmap_sem); + return 0; +} #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ @@ -674,34 +706,53 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +/** + * set_mm_exe_file - change a reference to the mm's executable file + * + * This changes mm's executable file (shown as symlink /proc/[pid]/exe). + * + * Main users are mmput() and sys_execve(). Callers prevent concurrent + * invocations: in mmput() nobody alive left, in execve task is single + * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the + * mm->exe_file, but does so without using set_mm_exe_file() in order + * to avoid the need for any locks. + */ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { + struct file *old_exe_file; + + /* + * It is safe to dereference the exe_file without RCU as + * this function is only called if nobody else can access + * this mm -- see comment above for justification. 
+ */ + old_exe_file = rcu_dereference_raw(mm->exe_file); + if (new_exe_file) get_file(new_exe_file); - if (mm->exe_file) - fput(mm->exe_file); - mm->exe_file = new_exe_file; + rcu_assign_pointer(mm->exe_file, new_exe_file); + if (old_exe_file) + fput(old_exe_file); } +/** + * get_mm_exe_file - acquire a reference to the mm's executable file + * + * Returns %NULL if mm has no associated executable file. + * User must release file via fput(). + */ struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; - /* We need mmap_sem to protect against races with removal of exe_file */ - down_read(&mm->mmap_sem); - exe_file = mm->exe_file; - if (exe_file) - get_file(exe_file); - up_read(&mm->mmap_sem); + rcu_read_lock(); + exe_file = rcu_dereference(mm->exe_file); + if (exe_file && !get_file_rcu(exe_file)) + exe_file = NULL; + rcu_read_unlock(); return exe_file; } - -static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) -{ - /* It's safe to write the exe_file pointer without exe_file_lock because - * this is called during fork when the task is not yet in /proc */ - newmm->exe_file = get_mm_exe_file(oldmm); -} +EXPORT_SYMBOL(get_mm_exe_file); /** * get_task_mm - acquire a reference to the task's mm @@ -864,8 +915,6 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) if (!mm_init(mm, tsk)) goto fail_nomem; - dup_mm_exe_file(oldmm, mm); - err = dup_mmap(mm, oldmm); if (err) goto free_pt; @@ -1403,10 +1452,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { - retval = -ENOMEM; pid = alloc_pid(p->nsproxy->pid_ns_for_children); - if (!pid) + if (IS_ERR(pid)) { + retval = PTR_ERR(pid); goto bad_fork_cleanup_io; + } } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? 
child_tidptr : NULL; @@ -2000,3 +2050,26 @@ int unshare_files(struct files_struct **displaced) task_unlock(task); return 0; } + +int sysctl_max_threads(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int ret; + int threads = max_threads; + int min = MIN_THREADS; + int max = MAX_THREADS; + + t = *table; + t.data = &threads; + t.extra1 = &min; + t.extra2 = &max; + + ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + set_max_threads(threads); + + return 0; +} diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index b358a802fd18..a744098e4eb7 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -18,6 +18,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/sched.h> #include "gcov.h" static int gcov_events_enabled; @@ -107,8 +108,10 @@ void gcov_enable_events(void) gcov_events_enabled = 1; /* Perform event callback for previously registered entries. 
*/ - while ((info = gcov_info_next(info))) + while ((info = gcov_info_next(info))) { gcov_event(GCOV_ADD, info); + cond_resched(); + } mutex_unlock(&gcov_lock); } diff --git a/kernel/pid.c b/kernel/pid.c index cd36a5e0d173..4fd07d5b7baf 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -182,7 +182,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) spin_unlock_irq(&pidmap_lock); kfree(page); if (unlikely(!map->page)) - break; + return -ENOMEM; } if (likely(atomic_read(&map->nr_free))) { for ( ; ; ) { @@ -210,7 +210,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) } pid = mk_pid(pid_ns, map, offset); } - return -1; + return -EAGAIN; } int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) @@ -301,17 +301,20 @@ struct pid *alloc_pid(struct pid_namespace *ns) int i, nr; struct pid_namespace *tmp; struct upid *upid; + int retval = -ENOMEM; pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) - goto out; + return ERR_PTR(retval); tmp = ns; pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); - if (nr < 0) + if (IS_ERR_VALUE(nr)) { + retval = nr; goto out_free; + } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; @@ -339,7 +342,6 @@ struct pid *alloc_pid(struct pid_namespace *ns) } spin_unlock_irq(&pidmap_lock); -out: return pid; out_unlock: @@ -351,8 +353,7 @@ out_free: free_pidmap(pid->numbers + i); kmem_cache_free(ns->pid_cachep, pid); - pid = NULL; - goto out; + return ERR_PTR(retval); } void disable_pid_allocation(struct pid_namespace *ns) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 227fec36b12a..c8e0e050a36a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -456,8 +456,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) static int ptrace_detach(struct task_struct *child, unsigned int data) { - bool dead = false; - if (!valid_signal(data)) return -EIO; @@ -467,18 +465,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) 
write_lock_irq(&tasklist_lock); /* - * This child can be already killed. Make sure de_thread() or - * our sub-thread doing do_wait() didn't do release_task() yet. + * We rely on ptrace_freeze_traced(). It can't be killed and + * untraced by another thread, it can't be a zombie. */ - if (child->ptrace) { - child->exit_code = data; - dead = __ptrace_detach(current, child); - } + WARN_ON(!child->ptrace || child->exit_state); + /* + * tasklist_lock avoids the race with wait_task_stopped(), see + * the comment in ptrace_resume(). + */ + child->exit_code = data; + __ptrace_detach(current, child); write_unlock_irq(&tasklist_lock); proc_ptrace_connector(child, PTRACE_DETACH); - if (unlikely(dead)) - release_task(child); return 0; } @@ -697,6 +696,8 @@ static int ptrace_peek_siginfo(struct task_struct *child, static int ptrace_resume(struct task_struct *child, long request, unsigned long data) { + bool need_siglock; + if (!valid_signal(data)) return -EIO; @@ -724,8 +725,26 @@ static int ptrace_resume(struct task_struct *child, long request, user_disable_single_step(child); } + /* + * Change ->exit_code and ->state under siglock to avoid the race + * with wait_task_stopped() in between; a non-zero ->exit_code will + * wrongly look like another report from tracee. + * + * Note that we need siglock even if ->exit_code == data and/or this + * status was not reported yet, the new status must not be cleared by + * wait_task_stopped() after resume. + * + * If data == 0 we do not care if wait_task_stopped() reports the old + * status and clears the code too; this can't race with the tracee, it + * takes siglock after resume. 
+ */ + need_siglock = data && !thread_group_empty(current); + if (need_siglock) + spin_lock_irq(&child->sighand->siglock); child->exit_code = data; wake_up_state(child, __TASK_TRACED); + if (need_siglock) + spin_unlock_irq(&child->sighand->siglock); return 0; } diff --git a/kernel/signal.c b/kernel/signal.c index a390499943e4..d51c5ddd855c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2992,11 +2992,9 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) * Nor can they impersonate a kill()/tgkill(), which adds source info. */ if ((info->si_code >= 0 || info->si_code == SI_TKILL) && - (task_pid_vnr(current) != pid)) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info->si_code < 0); + (task_pid_vnr(current) != pid)) return -EPERM; - } + info->si_signo = sig; /* POSIX.1b doesn't mention process groups. */ @@ -3041,12 +3039,10 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. 
*/ - if (((info->si_code >= 0 || info->si_code == SI_TKILL)) && - (task_pid_vnr(current) != pid)) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info->si_code < 0); + if ((info->si_code >= 0 || info->si_code == SI_TKILL) && + (task_pid_vnr(current) != pid)) return -EPERM; - } + info->si_signo = sig; return do_send_specific(tgid, pid, sig, info); diff --git a/kernel/sys.c b/kernel/sys.c index 3be344902316..a4e372b798a5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1649,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } -static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; + struct file *old_exe, *exe_file; struct inode *inode; int err; - VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - exe = fdget(fd); if (!exe.file) return -EBADF; @@ -1680,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) /* * Forbid mm->exe_file change if old file still mapped. 
*/ + exe_file = get_mm_exe_file(mm); err = -EBUSY; - if (mm->exe_file) { + if (exe_file) { struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) - if (vma->vm_file && - path_equal(&vma->vm_file->f_path, - &mm->exe_file->f_path)) - goto exit; + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (path_equal(&vma->vm_file->f_path, + &exe_file->f_path)) + goto exit_err; + } + + up_read(&mm->mmap_sem); + fput(exe_file); } /* @@ -1702,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) goto exit; err = 0; - set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ + /* set the new file, lockless */ + get_file(exe.file); + old_exe = xchg(&mm->exe_file, exe.file); + if (old_exe) + fput(old_exe); exit: fdput(exe); return err; +exit_err: + up_read(&mm->mmap_sem); + fput(exe_file); + goto exit; } #ifdef CONFIG_CHECKPOINT_RESTORE @@ -1840,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } - down_write(&mm->mmap_sem); if (prctl_map.exe_fd != (u32)-1) - error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); - downgrade_write(&mm->mmap_sem); + error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); + down_read(&mm->mmap_sem); if (error) goto out; @@ -1909,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr, if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (opt == PR_SET_MM_EXE_FILE) { - down_write(&mm->mmap_sem); - error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr); - up_write(&mm->mmap_sem); - return error; - } + if (opt == PR_SET_MM_EXE_FILE) + return prctl_set_mm_exe_file(mm, (unsigned int)addr); if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 42b7fc2860c1..2082b1a88fb9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -93,11 +93,9 @@ #include 
<linux/nmi.h> #endif - #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int max_threads; extern int suid_dumpable; #ifdef CONFIG_COREDUMP extern int core_uses_pid; @@ -710,10 +708,10 @@ static struct ctl_table kern_table[] = { #endif { .procname = "threads-max", - .data = &max_threads, + .data = NULL, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysctl_max_threads, }, { .procname = "random", @@ -1983,7 +1981,15 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - *valp = *negp ? -*lvalp : *lvalp; + if (*negp) { + if (*lvalp > (unsigned long) INT_MAX + 1) + return -EINVAL; + *valp = -*lvalp; + } else { + if (*lvalp > (unsigned long) INT_MAX) + return -EINVAL; + *valp = *lvalp; + } } else { int val = *valp; if (val < 0) { |