diff options
author | Christian Brauner <christian.brauner@ubuntu.com> | 2020-02-05 14:26:22 +0100 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2020-02-12 17:57:51 -0500 |
commit | ef2c41cf38a7559bbf91af42d5b6a4429db8fc68 (patch) | |
tree | ad5801b2782c5268dc0ad1b6f0f591b1da582eee /include | |
parent | f3553220d4cc458d69f7da6e71a3a6097778bd28 (diff) |
clone3: allow spawning processes into cgroups
This adds support for creating a process in a different cgroup than its
parent. Callers can limit and account processes and threads right from
the moment they are spawned:
- A service manager can directly spawn new services into dedicated
cgroups.
- A process can be directly created in a frozen cgroup and will be
frozen as well.
- The initial accounting jitter experienced by process supervisors and
daemons is eliminated with this.
- Threaded applications or even thread implementations can choose to
create a specific cgroup layout where each thread is spawned
directly into a dedicated cgroup.
This feature is limited to the unified hierarchy. Callers need to pass
a directory file descriptor for the target cgroup. The caller can
choose to pass an O_PATH file descriptor. All usual migration
restrictions apply, i.e. there can be no processes in inner nodes. In
general, creating a process directly in a target cgroup adheres to all
migration restrictions.
One of the biggest advantages of this feature is that CLONE_INTO_GROUP does
not need to grab the write side of the cgroup cgroup_threadgroup_rwsem.
This global lock makes moving tasks/threads around super expensive. With
clone3() this lock is avoided.
Cc: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: cgroups@vger.kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/cgroup-defs.h | 5 | ||||
-rw-r--r-- | include/linux/cgroup.h | 20 | ||||
-rw-r--r-- | include/linux/sched/task.h | 4 | ||||
-rw-r--r-- | include/uapi/linux/sched.h | 5 |
4 files changed, 26 insertions, 8 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 63097cb243cb..68c391f451d1 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -628,8 +628,9 @@ struct cgroup_subsys { void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); void (*post_attach)(void); - int (*can_fork)(struct task_struct *task); - void (*cancel_fork)(struct task_struct *task); + int (*can_fork)(struct task_struct *task, + struct css_set *cset); + void (*cancel_fork)(struct task_struct *task, struct css_set *cset); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); void (*release)(struct task_struct *task); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f1219b927817..4598e4da6b1b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,6 +27,8 @@ #include <linux/cgroup-defs.h> +struct kernel_clone_args; + #ifdef CONFIG_CGROUPS /* @@ -119,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); -extern int cgroup_can_fork(struct task_struct *p); -extern void cgroup_cancel_fork(struct task_struct *p); -extern void cgroup_post_fork(struct task_struct *p); +extern int cgroup_can_fork(struct task_struct *p, + struct kernel_clone_args *kargs); +extern void cgroup_cancel_fork(struct task_struct *p, + struct kernel_clone_args *kargs); +extern void cgroup_post_fork(struct task_struct *p, + struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); @@ -705,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} -static inline int cgroup_can_fork(struct task_struct *p) { return 0; } -static inline void cgroup_cancel_fork(struct task_struct *p) {} -static inline void cgroup_post_fork(struct task_struct *p) {} +static inline int cgroup_can_fork(struct task_struct *p, + struct kernel_clone_args *kargs) { return 0; } +static inline void cgroup_cancel_fork(struct task_struct *p, + struct kernel_clone_args *kargs) {} +static inline void cgroup_post_fork(struct task_struct *p, + struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index f1879884238e..38359071236a 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -13,6 +13,7 @@ struct task_struct; struct rusage; union thread_union; +struct css_set; /* All the bits taken by the old clone syscall. */ #define CLONE_LEGACY_FLAGS 0xffffffffULL @@ -29,6 +30,9 @@ struct kernel_clone_args { pid_t *set_tid; /* Number of elements in *set_tid */ size_t set_tid_size; + int cgroup; + struct cgroup *cgrp; + struct css_set *cset; }; /* diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 2e3bc22c6f20..3bac0a8ceab2 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -35,6 +35,7 @@ /* Flags for the clone3() syscall. */ #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ +#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 @@ -81,6 +82,8 @@ * @set_tid_size: This defines the size of the array referenced * in @set_tid. This cannot be larger than the * kernel's limit of nested PID namespaces. + * @cgroup: If CLONE_INTO_CGROUP is specified set this to + * a file descriptor for the cgroup. * * The structure is versioned by size and thus extensible. * New struct members must go at the end of the struct and @@ -97,11 +100,13 @@ struct clone_args { __aligned_u64 tls; __aligned_u64 set_tid; __aligned_u64 set_tid_size; + __aligned_u64 cgroup; }; #endif #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ /* * Scheduling policies |