From 0a268dbd7932c78896f5a45c8a492b31729db6c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:06 -0500 Subject: cgroup: move cgroup v1 specific code to kernel/cgroup/cgroup-v1.c cgroup.c is getting too unwieldy. Let's move out cgroup v1 specific code along with the debug controller into kernel/cgroup/cgroup-v1.c. v2: cgroup_mutex and css_set_lock made available in cgroup-internal.h regardless of CONFIG_PROVE_RCU. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-v1.c | 1027 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1027 insertions(+) create mode 100644 kernel/cgroup/cgroup-v1.c (limited to 'kernel/cgroup/cgroup-v1.c') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c new file mode 100644 index 000000000000..7af745a46f91 --- /dev/null +++ b/kernel/cgroup/cgroup-v1.c @@ -0,0 +1,1027 @@ +#include "cgroup-internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; + +/* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. + */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +DEFINE_SPINLOCK(release_agent_path_lock); + +bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroup_root *root; + int retval = 0; + + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + for_each_root(root) { + struct cgroup *from_cgrp; + + if (root == &cgrp_dfl_root) + continue; + + spin_lock_irq(&css_set_lock); + from_cgrp = task_cgroup_from_root(from, root); + spin_unlock_irq(&css_set_lock); + + retval = cgroup_attach_task(from_cgrp, tsk, false); + if (retval) + break; + } + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + +/** + * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside + * + * Locking rules between cgroup_post_fork() and the migration path + * guarantee that, if a task is forking while being migrated, the new child + * is guaranteed to be either visible in the source cgroup after the + * parent's migration is complete or put into the target cgroup. No task + * can slip out of migration through forking. + */ +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +{ + LIST_HEAD(preloaded_csets); + struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; + int ret; + + if (cgroup_on_dfl(to)) + return -EINVAL; + + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + + mutex_lock(&cgroup_mutex); + + percpu_down_write(&cgroup_threadgroup_rwsem); + + /* all tasks in @from are being moved, all csets are source */ + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &from->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + spin_unlock_irq(&css_set_lock); + + ret = cgroup_migrate_prepare_dst(&preloaded_csets); + if (ret) + goto out_err; + + /* + * Migrate tasks one-by-one until @from is empty. This fails iff + * ->can_attach() fails. + */ + do { + css_task_iter_start(&from->self, &it); + task = css_task_iter_next(&it); + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); + put_task_struct(task); + } + } while (task && !ret); +out_err: + cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + return ret; +} + +/* + * Stuff for reading the 'tasks'/'procs' files. + * + * Reading this file can return large amounts of data if a cgroup has + * *lots* of attached tasks. So it may need several calls to read(), + * but we cannot guarantee that the information we produce is correct + * unless we produce it entirely atomically. + * + */ + +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* for delayed destruction */ + struct delayed_work destroy_dwork; +}; + +/* + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} + +static void pidlist_free(void *p) +{ + kvfree(p); +} + +/* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. + */ +void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +{ + struct cgroup_pidlist *l, *tmp_l; + + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); + + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); +} + +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); + + /* + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. + */ + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } + + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * Returns the number of unique elements. + */ +static int pidlist_uniq(pid_t *list, int length) +{ + int src, dest = 1; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + return dest; +} + +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + */ +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = task_active_pid_ns(current); + + lockdep_assert_held(&cgrp->pidlist_mutex); + + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. + */ +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; + + /* entry not found; create a new one */ + l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) + return l; + + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); + l->key.type = type; + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + return l; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. + */ +static int cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += atomic_read(&link->cset->refcount); + spin_unlock_irq(&css_set_lock); + return count; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ + struct css_task_iter it; + struct task_struct *tsk; + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + if (unlikely(n == length)) + break; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; + } + css_task_iter_end(&it); + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(array, length); + + l = cgroup_pidlist_find_create(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + + /* store array, freeing old if necessary */ + pidlist_free(l->list); + l->list = array; + l->length = length; + *lp = l; + return 0; +} + +/* + * seq_file methods for the tasks/procs files. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->l->list array. + */ + +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) +{ + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). Use a binary-search to find the + * next pid to display, if any + */ + struct kernfs_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; + int index = 0, pid = *pos; + int *iter, ret; + + mutex_lock(&cgrp->pidlist_mutex); + + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); + + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; + + if (pid) { + int end = l->length; + + while (index < end) { + int mid = (index + end) / 2; + if (l->list[mid] == pid) { + index = mid; + break; + } else if (l->list[mid] <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= l->length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = l->list + index; + *pos = *iter; + return iter; +} + +static void cgroup_pidlist_stop(struct seq_file *s, void *v) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); +} + +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + pid_t *p = v; + pid_t *end = l->list + l->length; + /* + * Advance to the next pid in the array. If this goes off the + * end, we're done + */ + p++; + if (p >= end) { + return NULL; + } else { + *pos = *p; + return p; + } +} + +static int cgroup_pidlist_show(struct seq_file *s, void *v) +{ + seq_printf(s, "%d\n", *(int *)v); + + return 0; +} + +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, nbytes, off, false); +} + +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + spin_lock(&release_agent_path_lock); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); + spin_unlock(&release_agent_path_lock); + cgroup_kn_unlock(of->kn); + return nbytes; +} + +static int cgroup_release_agent_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + spin_lock(&release_agent_path_lock); + seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); + seq_putc(seq, '\n'); + return 0; +} + +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) +{ + seq_puts(seq, "0\n"); + return 0; +} + +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return notify_on_release(css->cgroup); +} + +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + return 0; +} + +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); +} + +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + else + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + return 0; +} + +/* cgroup core interface files for the legacy hierarchies */ +struct cftype cgroup_legacy_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, + { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, + }, + { + .name = "tasks", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, + .write = cgroup_tasks_write, + }, + { + .name = "notify_on_release", + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, + { + .name = "release_agent", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_release_agent_show, + .write = cgroup_release_agent_write, + .max_write_len = PATH_MAX - 1, + }, + { } /* terminate */ +}; + +/* Display information about each subsystem and each hierarchy */ +static int proc_cgroupstats_show(struct seq_file *m, void *v) +{ + struct cgroup_subsys *ss; + int i; + + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. + */ + mutex_lock(&cgroup_mutex); + + for_each_subsys(ss, i) + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->legacy_name, ss->root->hierarchy_id, + atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroupstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_cgroupstats_show, NULL); +} + +const struct file_operations proc_cgroupstats_operations = { + .open = cgroupstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * cgroupstats_build - build and fill cgroupstats + * @stats: cgroupstats to fill information into + * @dentry: A dentry entry belonging to the cgroup for which stats have + * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. + */ +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) +{ + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup *cgrp; + struct css_task_iter it; + struct task_struct *tsk; + + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + + mutex_lock(&cgroup_mutex); + + /* + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_online_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. + */ + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp || cgroup_is_dead(cgrp)) { + rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); + return -ENOENT; + } + rcu_read_unlock(); + + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + switch (tsk->state) { + case TASK_RUNNING: + stats->nr_running++; + break; + case TASK_INTERRUPTIBLE: + stats->nr_sleeping++; + break; + case TASK_UNINTERRUPTIBLE: + stats->nr_uninterruptible++; + break; + case TASK_STOPPED: + stats->nr_stopped++; + break; + default: + if (delayacct_is_task_waiting_on_io(tsk)) + stats->nr_io_wait++; + break; + } + } + css_task_iter_end(&it); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +void check_for_release(struct cgroup *cgrp) +{ + if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && + !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) + schedule_work(&cgrp->release_agent_work); +} + +/* + * Notify userspace when a cgroup is released, by running the + * configured release agent with the name of the cgroup (path + * relative to the root of cgroup file system) as the argument. + * + * Most likely, this user command will try to rmdir this cgroup. + * + * This races with the possibility that some other task will be + * attached to this cgroup before it is removed, or that some other + * user task will 'mkdir' a child cgroup of this cgroup. That's ok. + * The presumed 'rmdir' will fail quietly if this cgroup is no longer + * unused, and this cgroup will be reprieved from its death sentence, + * to continue to serve a useful existence. Next time it's released, + * we will get notified again, if it still has 'notify_on_release' set. + * + * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which + * means only wait until the task is successfully execve()'d. The + * separate release agent task is forked by call_usermodehelper(), + * then control in this thread returns here, without waiting for the + * release agent task. We don't bother to wait because the caller of + * this routine has no use for the exit status of the release agent + * task, so no sense holding our caller up for that. + */ +void cgroup_release_agent(struct work_struct *work) +{ + struct cgroup *cgrp = + container_of(work, struct cgroup, release_agent_work); + char *pathbuf = NULL, *agentbuf = NULL; + char *argv[3], *envp[3]; + int ret; + + mutex_lock(&cgroup_mutex); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out; + + spin_lock_irq(&css_set_lock); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_irq(&css_set_lock); + if (ret < 0 || ret >= PATH_MAX) + goto out; + + argv[0] = agentbuf; + argv[1] = pathbuf; + argv[2] = NULL; + + /* minimal command environment */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + goto out_free; +out: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(agentbuf); + kfree(pathbuf); +} + +/* + * cgroup_rename - Only allow simple rename of directories in place. + */ +int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) +{ + struct cgroup *cgrp = kn->priv; + int ret; + + if (kernfs_type(kn) != KERNFS_DIR) + return -ENOTDIR; + if (kn->parent != new_parent) + return -EIO; + + /* + * This isn't a proper migration and its usefulness is very + * limited. Disallow on the default hierarchy. + */ + if (cgroup_on_dfl(cgrp)) + return -EPERM; + + /* + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_mutex. + */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); + + mutex_lock(&cgroup_mutex); + + ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); + return ret; +} + +static int __init cgroup1_wq_init(void) +{ + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + return 0; +} +core_initcall(cgroup1_wq_init); + +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + + +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} + +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} + +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) +{ + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; + + seq_printf(seq, "css_set %p\n", cset); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + continue; + overflow: + seq_puts(seq, " ...\n"); + } + spin_unlock_irq(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return (!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); +} + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, +}; +#endif /* CONFIG_CGROUP_DEBUG */ -- cgit v1.2.3-70-g09d2 From fa069904dd38c2d8e121a3c7e37f8daaddb6dafa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:07 -0500 Subject: cgroup: separate out cgroup1_kf_syscall_ops Currently, cgroup_kf_syscall_ops is shared by v1 and v2 and the specific methods test the version and take different actions. Split out v1 functions and put them in cgroup1_kf_syscall_ops and remove the now unnecessary explicit branches in specific methods. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 4 ++-- kernel/cgroup/cgroup-v1.c | 11 ++--------- kernel/cgroup/cgroup.c | 40 ++++++++++++++++++++++++++-------------- 3 files changed, 30 insertions(+), 25 deletions(-) (limited to 'kernel/cgroup/cgroup-v1.c') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index dca3193bd9d2..5790e5ff9a0f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -95,8 +95,8 @@ extern const struct file_operations proc_cgroupstats_operations; bool cgroup_ssid_no_v1(int ssid); void cgroup_pidlist_destroy_all(struct cgroup *cgrp); -int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str); +int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str); void cgroup_release_agent(struct work_struct *work); void check_for_release(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7af745a46f91..0b2c24f0b310 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -800,8 +800,8 @@ out_free: /* * cgroup_rename - Only allow simple rename of directories in place. */ -int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) +int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; @@ -811,13 +811,6 @@ int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, if (kn->parent != new_parent) return -EIO; - /* - * This isn't a proper migration and its usefulness is very - * limited. Disallow on the default hierarchy. - */ - if (cgroup_on_dfl(cgrp)) - return -EPERM; - /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 228dd9ae708b..de6a2ac41d0b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1232,6 +1232,7 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ +static struct kernfs_syscall_ops cgroup1_kf_syscall_ops; static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, @@ -1566,16 +1567,15 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; - if (root != &cgrp_dfl_root) - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->legacy_name, NULL); + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1736,18 +1736,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return 0; } -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) +static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; u16 added_mask, removed_mask; - if (root == &cgrp_dfl_root) { - pr_err("remount is not allowed\n"); - return -EINVAL; - } - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ @@ -1798,6 +1793,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) return ret; } +static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) +{ + pr_err("remount is not allowed\n"); + return -EINVAL; +} + /* * To reduce the fork() overhead for systems that are not actually using * their cgroups capability, we don't maintain the lists running through @@ -1900,6 +1901,7 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; + struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; @@ -1931,7 +1933,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto cancel_ref; - root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + kf_sops = root == &cgrp_dfl_root ? + &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; + + root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED, root_cgrp); if (IS_ERR(root->kf_root)) { @@ -4813,12 +4818,19 @@ static int cgroup_rmdir(struct kernfs_node *kn) return ret; } +static struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { + .remount_fs = cgroup1_remount, + .show_options = cgroup1_show_options, + .rename = cgroup1_rename, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .show_path = cgroup_show_path, +}; + static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .remount_fs = cgroup_remount, - .show_options = cgroup_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, - .rename = cgroup_rename, .show_path = cgroup_show_path, }; -- cgit v1.2.3-70-g09d2 From 1592c9b223749d59b933ebbfe37f1a8833d7a6cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:08 -0500 Subject: cgroup: move v1 mount functions to kernel/cgroup/cgroup-v1.c Now that the v1 mount code is split into separate functions, move them to kernel/cgroup/cgroup-v1.c along with the mount option handling code. As this puts all v1-only kernfs_syscall_ops in cgroup-v1.c, move cgroup1_kf_syscall_ops to cgroup-v1.c too. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 28 ++- kernel/cgroup/cgroup-v1.c | 381 ++++++++++++++++++++++++++++++++++++- kernel/cgroup/cgroup.c | 410 +--------------------------------------- 3 files changed, 413 insertions(+), 406 deletions(-) (limited to 'kernel/cgroup/cgroup-v1.c') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5790e5ff9a0f..710edeeb1f9f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -26,6 +26,16 @@ struct cgrp_cset_link { struct list_head cgrp_link; }; +struct cgroup_sb_opts { + u16 subsys_mask; + unsigned int flags; + char *release_agent; + bool cpuset_clone_children; + char *name; + /* User explicitly requested empty subsystem */ + bool none; +}; + extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; @@ -66,7 +76,13 @@ void cgroup_kn_unlock(struct kernfs_node *kn); int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +void cgroup_free_root(struct cgroup_root *root); +void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, + struct cgroup_root *root, unsigned long magic, + struct cgroup_namespace *ns); bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct list_head *preloaded_csets); @@ -86,18 +102,24 @@ ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes void cgroup_lock_and_drain_offline(struct cgroup *cgrp); +int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode); +int cgroup_rmdir(struct kernfs_node *kn); +int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root); + /* * cgroup-v1.c */ -extern spinlock_t release_agent_path_lock; extern struct cftype cgroup_legacy_base_files[]; extern const struct file_operations proc_cgroupstats_operations; +extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; bool cgroup_ssid_no_v1(int ssid); void cgroup_pidlist_destroy_all(struct cgroup *cgrp); -int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str); void cgroup_release_agent(struct work_struct *work); void check_for_release(struct cgroup *cgrp); +struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, + void *data, unsigned long magic, + struct cgroup_namespace *ns); #endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 0b2c24f0b310..ae240c0d33cb 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1,7 +1,9 @@ #include "cgroup-internal.h" +#include #include #include +#include #include #include #include @@ -32,7 +34,7 @@ static struct workqueue_struct *cgroup_pidlist_destroy_wq; * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. */ -DEFINE_SPINLOCK(release_agent_path_lock); +static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup_ssid_no_v1(int ssid) { @@ -800,8 +802,8 @@ out_free: /* * cgroup_rename - Only allow simple rename of directories in place. */ -int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) +static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; @@ -832,6 +834,379 @@ int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, return ret; } +static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +{ + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_subsys *ss; + int ssid; + + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); + if (root->flags & CGRP_ROOT_NOPREFIX) + seq_puts(seq, ",noprefix"); + if (root->flags & CGRP_ROOT_XATTR) + seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); + if (strlen(root->release_agent_path)) + seq_show_option(seq, "release_agent", + root->release_agent_path); + spin_unlock(&release_agent_path_lock); + + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) + seq_puts(seq, ",clone_children"); + if (strlen(root->name)) + seq_show_option(seq, "name", root->name); + return 0; +} + +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) +{ + char *token, *o = data; + bool all_ss = false, one_ss = false; + u16 mask = U16_MAX; + struct cgroup_subsys *ss; + int nr_opts = 0; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif + + memset(opts, 0, sizeof(*opts)); + + while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + + if (!*token) + return -EINVAL; + if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { + opts->flags |= CGRP_ROOT_NOPREFIX; + continue; + } + if (!strcmp(token, "clone_children")) { + opts->cpuset_clone_children = true; + continue; + } + if (!strcmp(token, "xattr")) { + opts->flags |= CGRP_ROOT_XATTR; + continue; + } + if (!strncmp(token, "release_agent=", 14)) { + /* Specifying two release agents is forbidden */ + if (opts->release_agent) + return -EINVAL; + opts->release_agent = + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); + if (!opts->release_agent) + return -ENOMEM; + continue; + } + if (!strncmp(token, "name=", 5)) { + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN - 1, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; + + continue; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->legacy_name)) + continue; + if (!cgroup_ssid_enabled(i)) + continue; + if (cgroup_ssid_no_v1(i)) + continue; + + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + opts->subsys_mask |= (1 << i); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + + /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* + * Option noprefix was introduced just for backward compatibility + * with the old cpuset, so we allow noprefix only if mounting just + * the cpuset subsystem. + */ + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) + return -EINVAL; + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_mask && opts->none) + return -EINVAL; + + return 0; +} + +static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) +{ + int ret = 0; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_sb_opts opts; + u16 added_mask, removed_mask; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* See what subsystems are wanted */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); + + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; + + /* Don't allow flags or name to change at remount */ + if ((opts.flags ^ root->flags) || + (opts.name && strcmp(opts.name, root->name))) { + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", + opts.flags, opts.name ?: "", root->flags, root->name); + ret = -EINVAL; + goto out_unlock; + } + + /* remounting is not allowed for populated hierarchies */ + if (!list_empty(&root->cgrp.self.children)) { + ret = -EBUSY; + goto out_unlock; + } + + ret = rebind_subsystems(root, added_mask); + if (ret) + goto out_unlock; + + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); + + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); + strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } + + trace_cgroup_remount(root); + + out_unlock: + kfree(opts.release_agent); + kfree(opts.name); + mutex_unlock(&cgroup_mutex); + return ret; +} + +struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { + .rename = cgroup1_rename, + .show_options = cgroup1_show_options, + .remount_fs = cgroup1_remount, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .show_path = cgroup_show_path, +}; + +struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, + void *data, unsigned long magic, + struct cgroup_namespace *ns) +{ + struct super_block *pinned_sb = NULL; + struct cgroup_sb_opts opts; + struct cgroup_root *root; + struct cgroup_subsys *ss; + struct dentry *dentry; + int i, ret; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* First find the desired set of subsystems */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + /* + * Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough. + */ + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; + + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); + } + + for_each_root(root) { + bool name_match = false; + + if (root == &cgrp_dfl_root) + continue; + + /* + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. + */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } + + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. + */ + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } + + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); + + /* + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. + * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry. + */ + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + + ret = 0; + goto out_unlock; + } + + /* + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. + */ + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; + } + + /* Hierarchies may only be created in the initial cgroup namespace. */ + if (ns != &init_cgroup_ns) { + ret = -EPERM; + goto out_unlock; + } + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; + goto out_unlock; + } + + init_cgroup_root(root, &opts); + + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); + +out_unlock: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(opts.release_agent); + kfree(opts.name); + + if (ret) + return ERR_PTR(ret); + + dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, + CGROUP_SUPER_MAGIC, ns); + + /* + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. + */ + if (pinned_sb) + deactivate_super(pinned_sb); + + return dentry; +} + static int __init cgroup1_wq_init(void) { /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index de6a2ac41d0b..4be306510aff 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -31,7 +31,6 @@ #include "cgroup-internal.h" #include -#include #include #include #include @@ -49,7 +48,6 @@ #include #include #include -#include #include #include #include @@ -1078,7 +1076,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } -static void cgroup_free_root(struct cgroup_root *root) +void cgroup_free_root(struct cgroup_root *root) { if (root) { idr_destroy(&root->cgroup_idr); @@ -1232,7 +1230,6 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static struct kernfs_syscall_ops cgroup1_kf_syscall_ops; static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, @@ -1540,8 +1537,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return 0; } -static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, - struct kernfs_root *kf_root) +int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; @@ -1567,232 +1564,6 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) -{ - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_subsys *ss; - int ssid; - - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->legacy_name, NULL); - if (root->flags & CGRP_ROOT_NOPREFIX) - seq_puts(seq, ",noprefix"); - if (root->flags & CGRP_ROOT_XATTR) - seq_puts(seq, ",xattr"); - - spin_lock(&release_agent_path_lock); - if (strlen(root->release_agent_path)) - seq_show_option(seq, "release_agent", - root->release_agent_path); - spin_unlock(&release_agent_path_lock); - - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) - seq_puts(seq, ",clone_children"); - if (strlen(root->name)) - seq_show_option(seq, "name", root->name); - return 0; -} - -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); - - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; - - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; - - continue; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) - continue; - if (!cgroup_ssid_enabled(i)) - continue; - if (cgroup_ssid_no_v1(i)) - continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; - } - - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - - /* - * Option noprefix was introduced just for backward compatibility - * with the old cpuset, so we allow noprefix only if mounting just - * the cpuset subsystem. - */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; - - /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; - - return 0; -} - -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) -{ - int ret = 0; - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_sb_opts opts; - u16 added_mask, removed_mask; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); - - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; - - /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); - ret = -EINVAL; - goto out_unlock; - } - - /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.self.children)) { - ret = -EBUSY; - goto out_unlock; - } - - ret = rebind_subsystems(root, added_mask); - if (ret) - goto out_unlock; - - WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - - if (opts.release_agent) { - spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); - spin_unlock(&release_agent_path_lock); - } - - trace_cgroup_remount(root); - - out_unlock: - kfree(opts.release_agent); - kfree(opts.name); - mutex_unlock(&cgroup_mutex); - return ret; -} - static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { pr_err("remount is not allowed\n"); @@ -1877,8 +1648,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); } -static void init_cgroup_root(struct cgroup_root *root, - struct cgroup_sb_opts *opts) +void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) { struct cgroup *cgrp = &root->cgrp; @@ -1897,7 +1667,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1994,10 +1764,9 @@ out: return ret; } -static struct dentry *cgroup_do_mount(struct file_system_type *fs_type, - int flags, struct cgroup_root *root, - unsigned long magic, - struct cgroup_namespace *ns) +struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, + struct cgroup_root *root, unsigned long magic, + struct cgroup_namespace *ns) { struct dentry *dentry; bool new_sb; @@ -2031,155 +1800,6 @@ static struct dentry *cgroup_do_mount(struct file_system_type *fs_type, return dentry; } -static struct dentry *cgroup1_mount(struct file_system_type *fs_type, - int flags, void *data, - unsigned long magic, - struct cgroup_namespace *ns) -{ - struct super_block *pinned_sb = NULL; - struct cgroup_sb_opts opts; - struct cgroup_root *root; - struct cgroup_subsys *ss; - struct dentry *dentry; - int i, ret; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - /* - * Destruction of cgroup root is asynchronous, so subsystems may - * still be dying after the previous unmount. Let's drain the - * dying subsystems. We just need to ensure that the ones - * unmounted previously finish dying and don't care about new ones - * starting. Testing ref liveliness is good enough. - */ - for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || - ss->root == &cgrp_dfl_root) - continue; - - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - cgroup_put(&ss->root->cgrp); - } - - for_each_root(root) { - bool name_match = false; - - if (root == &cgrp_dfl_root) - continue; - - /* - * If we asked for a name then it must match. Also, if - * name matches but sybsys_mask doesn't, we should fail. - * Remember whether name matched. - */ - if (opts.name) { - if (strcmp(opts.name, root->name)) - continue; - name_match = true; - } - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match. - */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { - if (!name_match) - continue; - ret = -EBUSY; - goto out_unlock; - } - - if (root->flags ^ opts.flags) - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - - /* - * We want to reuse @root whose lifetime is governed by its - * ->cgrp. Let's check whether @root is alive and keep it - * that way. As cgroup_kill_sb() can happen anytime, we - * want to block it by pinning the sb so that @root doesn't - * get killed before mount is complete. - * - * With the sb pinned, tryget_live can reliably indicate - * whether @root can be reused. If it's being killed, - * drain it. We can use wait_queue for the wait but this - * path is super cold. Let's just sleep a bit and retry. - */ - pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || - !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - if (!IS_ERR_OR_NULL(pinned_sb)) - deactivate_super(pinned_sb); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - - ret = 0; - goto out_unlock; - } - - /* - * No such thing, create a new one. name= matching without subsys - * specification is allowed for already existing hierarchies but we - * can't create new one without subsys specification. - */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } - - /* Hierarchies may only be created in the initial cgroup namespace. */ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } - - init_cgroup_root(root, &opts); - - ret = cgroup_setup_root(root, opts.subsys_mask); - if (ret) - cgroup_free_root(root); - -out_unlock: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); - - if (ret) - return ERR_PTR(ret); - - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, - CGROUP_SUPER_MAGIC, ns); - - /* - * If @pinned_sb, we're reusing an existing root and holding an - * extra ref on its sb. Mount is complete. Put the extra ref. - */ - if (pinned_sb) - deactivate_super(pinned_sb); - - return dentry; -} - static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) @@ -4587,8 +4207,7 @@ out_destroy: return ERR_PTR(ret); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; struct kernfs_node *kn; @@ -4800,7 +4419,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return 0; }; -static int cgroup_rmdir(struct kernfs_node *kn) +int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; @@ -4818,15 +4437,6 @@ static int cgroup_rmdir(struct kernfs_node *kn) return ret; } -static struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { - .remount_fs = cgroup1_remount, - .show_options = cgroup1_show_options, - .rename = cgroup1_rename, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .show_path = cgroup_show_path, -}; - static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, -- cgit v1.2.3-70-g09d2 From d62beb7f3dc6b45f9b9d381897e05fe8ba286d8a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:08 -0500 Subject: cgroup: rename functions for consistency Now that v1 functions are separated out, rename some functions for consistency. cgroup_dfl_base_files -> cgroup_base_files cgroup_legacy_base_files -> cgroup1_base_files cgroup_ssid_no_v1() -> cgroup1_ssid_disabled() cgroup_pidlist_destroy_all -> cgroup1_pidlist_destroy_all() cgroup_release_agent() -> cgroup1_release_agent() check_for_release() -> cgroup1_check_for_release() Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 10 +++++----- kernel/cgroup/cgroup-v1.c | 14 +++++++------- kernel/cgroup/cgroup.c | 22 +++++++++++----------- 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel/cgroup/cgroup-v1.c') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 710edeeb1f9f..a890c92cb688 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -110,14 +110,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, /* * cgroup-v1.c */ -extern struct cftype cgroup_legacy_base_files[]; +extern struct cftype cgroup1_base_files[]; extern const struct file_operations proc_cgroupstats_operations; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; -bool cgroup_ssid_no_v1(int ssid); -void cgroup_pidlist_destroy_all(struct cgroup *cgrp); -void cgroup_release_agent(struct work_struct *work); -void check_for_release(struct cgroup *cgrp); +bool cgroup1_ssid_disabled(int ssid); +void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); +void cgroup1_release_agent(struct work_struct *work); +void cgroup1_check_for_release(struct cgroup *cgrp); struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, void *data, unsigned long magic, struct cgroup_namespace *ns); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index ae240c0d33cb..37be09e09740 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -36,7 +36,7 @@ static struct workqueue_struct *cgroup_pidlist_destroy_wq; */ static DEFINE_SPINLOCK(release_agent_path_lock); -bool cgroup_ssid_no_v1(int ssid) +bool cgroup1_ssid_disabled(int ssid) { return cgroup_no_v1_mask & (1 << ssid); } @@ -201,7 +201,7 @@ static void pidlist_free(void *p) * Used to destroy all pidlists lingering waiting for destroy timer. None * should be left afterwards. */ -void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) { struct cgroup_pidlist *l, *tmp_l; @@ -585,7 +585,7 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, } /* cgroup core interface files for the legacy hierarchies */ -struct cftype cgroup_legacy_base_files[] = { +struct cftype cgroup1_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, @@ -729,7 +729,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) return 0; } -void check_for_release(struct cgroup *cgrp) +void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) @@ -759,7 +759,7 @@ void check_for_release(struct cgroup *cgrp) * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. */ -void cgroup_release_agent(struct work_struct *work) +void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); @@ -946,7 +946,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (!cgroup_ssid_enabled(i)) continue; - if (cgroup_ssid_no_v1(i)) + if (cgroup1_ssid_disabled(i)) continue; /* Mutually exclusive option 'all' + subsystem name */ @@ -968,7 +968,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) opts->subsys_mask |= (1 << i); /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4be306510aff..a05a2dacf5dc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -199,7 +199,7 @@ struct cgroup_namespace init_cgroup_ns = { static u16 have_canfork_callback __read_mostly; static struct file_system_type cgroup2_fs_type; -static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_base_files[]; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); @@ -609,7 +609,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (!trigger) break; - check_for_release(cgrp); + cgroup1_check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); cgrp = cgroup_parent(cgrp); @@ -1440,9 +1440,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (!css->ss) { if (cgroup_on_dfl(cgrp)) - cfts = cgroup_dfl_base_files; + cfts = cgroup_base_files; else - cfts = cgroup_legacy_base_files; + cfts = cgroup1_base_files; return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); } @@ -1645,7 +1645,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); - INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); + INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) @@ -3836,7 +3836,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v) } /* cgroup core interface files for the default hierarchy */ -static struct cftype cgroup_dfl_base_files[] = { +static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", .file_offset = offsetof(struct cgroup, procs_file), @@ -3909,7 +3909,7 @@ static void css_free_work_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { @@ -4411,7 +4411,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - check_for_release(cgroup_parent(cgrp)); + cgroup1_check_for_release(cgroup_parent(cgrp)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -4548,8 +4548,8 @@ int __init cgroup_init(void) BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); /* * The latency of the synchronize_sched() is too high for cgroups, @@ -4599,7 +4599,7 @@ int __init cgroup_init(void) continue; } - if (cgroup_ssid_no_v1(ssid)) + if (cgroup1_ssid_disabled(ssid)) printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", ss->name); -- cgit v1.2.3-70-g09d2 From e0aed7c74f0bf6c643103f3a10ff988b7ee6545a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:09 -0500 Subject: cgroup: fix RCU related sparse warnings kn->priv which is a void * is used as a RCU pointer by cgroup. When dereferencing it, it was passing kn->priv to rcu_derefreence() without casting it into a RCU pointer triggering address space mismatch warning from sparse. Fix them. Signed-off-by: Tejun Heo Reported-by: Fengguang Wu Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-v1.c | 2 +- kernel/cgroup/cgroup.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup/cgroup-v1.c') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 37be09e09740..465b10111eaf 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -694,7 +694,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); - cgrp = rcu_dereference(kn->priv); + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || cgroup_is_dead(cgrp)) { rcu_read_unlock(); mutex_unlock(&cgroup_mutex); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b6b9068ef468..d9d82e96d67f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4932,7 +4932,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. */ - cgrp = rcu_dereference(kn->priv); + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); -- cgit v1.2.3-70-g09d2 From e595cd706982bff0211e6fafe5a108421e747fbc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 15 Jan 2017 19:03:41 -0500 Subject: cgroup: track migration context in cgroup_mgctx cgroup migration is performed in four steps - css_set preloading, addition of target tasks, actual migration, and clean up. A list named preloaded_csets is used to track the preloading. This is a bit too restricted and the code is already depending on the subtlety that all source css_sets appear before destination ones. Let's create struct cgroup_mgctx which keeps track of everything during migration. Currently, it has separate preload lists for source and destination csets and also embeds cgroup_taskset which is used during the actual migration. This moves struct cgroup_taskset definition to cgroup-internal.h. This patch doesn't cause any functional changes. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 66 ++++++++++++++++-- kernel/cgroup/cgroup-v1.c | 10 +-- kernel/cgroup/cgroup.c | 144 ++++++++++++++++------------------------ 3 files changed, 122 insertions(+), 98 deletions(-) (limited to 'kernel/cgroup/cgroup-v1.c') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 589b0e7013ec..5f8c8ac5ab88 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -26,6 +26,61 @@ struct cgrp_cset_link { struct list_head cgrp_link; }; +/* used to track tasks and csets during migration */ +struct cgroup_taskset { + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* the subsys currently being processed */ + int ssid; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. + */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; +}; + +/* migration context also tracks preloading */ +struct cgroup_mgctx { + /* + * Preloaded source and destination csets. Used to guarantee + * atomic success or failure on actual migration. + */ + struct list_head preloaded_src_csets; + struct list_head preloaded_dst_csets; + + /* tasks and csets to migrate */ + struct cgroup_taskset tset; +}; + +#define CGROUP_TASKSET_INIT(tset) \ +{ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + +#define CGROUP_MGCTX_INIT(name) \ +{ \ + LIST_HEAD_INIT(name.preloaded_src_csets), \ + LIST_HEAD_INIT(name.preloaded_dst_csets), \ + CGROUP_TASKSET_INIT(name.tset), \ +} + +#define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + struct cgroup_sb_opts { u16 subsys_mask; unsigned int flags; @@ -112,13 +167,12 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_namespace *ns); bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); -void cgroup_migrate_finish(struct list_head *preloaded_csets); -void cgroup_migrate_add_src(struct css_set *src_cset, - struct cgroup *dst_cgrp, - struct list_head *preloaded_csets); -int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets); +void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); +void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, + struct cgroup_mgctx *mgctx); +int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_root *root); + struct cgroup_mgctx *mgctx, struct cgroup_root *root); int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 465b10111eaf..7a965f460faf 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -87,7 +87,7 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all); */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - LIST_HEAD(preloaded_csets); + DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; struct css_task_iter it; struct task_struct *task; @@ -106,10 +106,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + cgroup_migrate_add_src(link->cset, to, &mgctx); spin_unlock_irq(&css_set_lock); - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_err; @@ -125,14 +125,14 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, to->root); + ret = cgroup_migrate(task, false, &mgctx, to->root); if (!ret) trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); } } while (task && !ret); out_err: - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return ret; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index aed492e907c1..f90554d24e59 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1916,49 +1916,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); -/* used to track tasks and other necessary states during migration */ -struct cgroup_taskset { - /* the src and dst cset list running through cset->mg_node */ - struct list_head src_csets; - struct list_head dst_csets; - - /* the subsys currently being processed */ - int ssid; - - /* - * Fields for cgroup_taskset_*() iteration. - * - * Before migration is committed, the target migration tasks are on - * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of - * the csets on ->dst_csets. ->csets point to either ->src_csets - * or ->dst_csets depending on whether migration is committed. - * - * ->cur_csets and ->cur_task point to the current task position - * during iteration. - */ - struct list_head *csets; - struct css_set *cur_cset; - struct task_struct *cur_task; -}; - -#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ - .src_csets = LIST_HEAD_INIT(tset.src_csets), \ - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ - .csets = &tset.src_csets, \ -} - /** - * cgroup_taskset_add - try to add a migration target task to a taskset + * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task - * @tset: target taskset + * @mgctx: target migration context * - * Add @task, which is a migration target, to @tset. This function becomes - * noop if @task doesn't need to be migrated. @task's css_set should have - * been added as a migration source and @task->cg_list will be moved from - * the css_set's tasks list to mg_tasks one. + * Add @task, which is a migration target, to @mgctx->tset. This function + * becomes noop if @task doesn't need to be migrated. @task's css_set + * should have been added as a migration source and @task->cg_list will be + * moved from the css_set's tasks list to mg_tasks one. */ -static void cgroup_taskset_add(struct task_struct *task, - struct cgroup_taskset *tset) +static void cgroup_migrate_add_task(struct task_struct *task, + struct cgroup_mgctx *mgctx) { struct css_set *cset; @@ -1978,10 +1947,11 @@ static void cgroup_taskset_add(struct task_struct *task, list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, &tset->src_csets); + list_add_tail(&cset->mg_node, + &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) list_add_tail(&cset->mg_dst_cset->mg_node, - &tset->dst_csets); + &mgctx->tset.dst_csets); } /** @@ -2048,17 +2018,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /** * cgroup_taskset_migrate - migrate a taskset - * @tset: taget taskset + * @mgctx: migration context * @root: cgroup root the migration is taking place on * - * Migrate tasks in @tset as setup by migration preparation functions. + * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and - * guarantees that either all or none of the tasks in @tset are migrated. - * @tset is consumed regardless of success. + * guarantees that either all or none of the tasks in @mgctx are migrated. + * @mgctx is consumed regardless of success. */ -static int cgroup_taskset_migrate(struct cgroup_taskset *tset, +static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, struct cgroup_root *root) { + struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; @@ -2151,25 +2122,31 @@ bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) /** * cgroup_migrate_finish - cleanup after attach - * @preloaded_csets: list of preloaded css_sets + * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ -void cgroup_migrate_finish(struct list_head *preloaded_csets) +void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { + LIST_HEAD(preloaded); struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { + + list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); + list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); + + list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } + spin_unlock_irq(&css_set_lock); } @@ -2177,10 +2154,10 @@ void cgroup_migrate_finish(struct list_head *preloaded_csets) * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup - * @preloaded_csets: list of preloaded css_sets + * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin - * @src_cset and add it to @preloaded_csets, which should later be cleaned + * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem @@ -2191,7 +2168,7 @@ void cgroup_migrate_finish(struct list_head *preloaded_csets) */ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) + struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; @@ -2219,32 +2196,32 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add(&src_cset->mg_preload_node, preloaded_csets); + list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @preloaded_csets: list of preloaded source css_sets + * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been - * preloaded to @preloaded_csets. This function looks up and pins all - * destination css_sets, links each to its source, and append them to - * @preloaded_csets. + * preloaded to @mgctx->preloaded_src_csets. This function looks up and + * pins all destination css_sets, links each to its source, and append them + * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on - * @preloaded_csets. + * @mgctx. */ -int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) +int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { - LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, + mg_preload_node) { struct css_set *dst_cset; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); @@ -2270,15 +2247,15 @@ int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_preload_node)) - list_add(&dst_cset->mg_preload_node, &csets); + list_add_tail(&dst_cset->mg_preload_node, + &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); } - list_splice_tail(&csets, preloaded_csets); return 0; err: - cgroup_migrate_finish(&csets); + cgroup_migrate_finish(mgctx); return -ENOMEM; } @@ -2287,6 +2264,7 @@ err: * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task * @root: cgroup root migration is taking place on + * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also @@ -2301,9 +2279,8 @@ err: * actually starting migrating. */ int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_root *root) + struct cgroup_mgctx *mgctx, struct cgroup_root *root) { - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; /* @@ -2315,14 +2292,14 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_lock(); task = leader; do { - cgroup_taskset_add(task, &tset); + cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - return cgroup_taskset_migrate(&tset, root); + return cgroup_migrate_execute(mgctx, root); } /** @@ -2336,7 +2313,7 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) { - LIST_HEAD(preloaded_csets); + DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret; @@ -2348,8 +2325,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, rcu_read_lock(); task = leader; do { - cgroup_migrate_add_src(task_css_set(task), dst_cgrp, - &preloaded_csets); + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); @@ -2357,11 +2333,11 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); + ret = cgroup_migrate(leader, threadgroup, &mgctx, dst_cgrp->root); - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); if (!ret) trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); @@ -2528,8 +2504,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { - LIST_HEAD(preloaded_csets); - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); + DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; @@ -2545,33 +2520,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgrp_cset_link *link; list_for_each_entry(link, &dsct->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, dsct, - &preloaded_csets); + cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { struct task_struct *task, *ntask; - /* src_csets precede dst_csets, break on the first dst_cset */ - if (!src_cset->mg_src_cgrp) - break; - /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) - cgroup_taskset_add(task, &tset); + cgroup_migrate_add_task(task, &mgctx); } spin_unlock_irq(&css_set_lock); - ret = cgroup_taskset_migrate(&tset, cgrp->root); + ret = cgroup_migrate_execute(&mgctx, cgrp->root); out_finish: - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } -- cgit v1.2.3-70-g09d2 From bfc2cf6f61fceac42235345081eb713329baa2a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 15 Jan 2017 19:03:41 -0500 Subject: cgroup: call subsys->*attach() only for subsystems which are actually affected by migration Currently, subsys->*attach() callbacks are called for all subsystems which are attached to the hierarchy on which the migration is taking place. With cgroup_migrate_prepare_dst() filtering out identity migrations, v1 hierarchies can avoid spurious ->*attach() callback invocations where the source and destination csses are identical; however, this isn't enough on v2 as only a subset of the attached controllers can be affected on controller enable/disable. While spurious ->*attach() invocations aren't critically broken, they're unnecessary overhead and can lead to temporary overcharges on certain controllers. Fix it by tracking which subsystems are affected by a migration and invoking ->*attach() callbacks only on those subsystems. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 5 ++++- kernel/cgroup/cgroup-v1.c | 2 +- kernel/cgroup/cgroup.c | 25 ++++++++++++++----------- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel/cgroup/cgroup-v1.c') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5f8c8ac5ab88..9203bfb05603 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -62,6 +62,9 @@ struct cgroup_mgctx { /* tasks and csets to migrate */ struct cgroup_taskset tset; + + /* subsystems affected by migration */ + u16 ss_mask; }; #define CGROUP_TASKSET_INIT(tset) \ @@ -172,7 +175,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx); int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_mgctx *mgctx, struct cgroup_root *root); + struct cgroup_mgctx *mgctx); int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7a965f460faf..fc34bcf2329f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -125,7 +125,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, &mgctx, to->root); + ret = cgroup_migrate(task, false, &mgctx); if (!ret) trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f90554d24e59..69ad5b3de0c1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2019,15 +2019,13 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /** * cgroup_taskset_migrate - migrate a taskset * @mgctx: migration context - * @root: cgroup root the migration is taking place on * * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and * guarantees that either all or none of the tasks in @mgctx are migrated. * @mgctx is consumed regardless of success. */ -static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, - struct cgroup_root *root) +static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; @@ -2040,7 +2038,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, return 0; /* check that we can legitimately attach to the cgroup */ - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); @@ -2076,7 +2074,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, */ tset->csets = &tset->dst_csets; - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); @@ -2087,7 +2085,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, goto out_release_tset; out_cancel_attach: - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { @@ -2223,6 +2221,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, mg_preload_node) { struct css_set *dst_cset; + struct cgroup_subsys *ss; + int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) @@ -2251,6 +2251,10 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); + + for_each_subsys(ss, ssid) + if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) + mgctx->ss_mask |= 1 << ssid; } return 0; @@ -2263,7 +2267,6 @@ err: * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @root: cgroup root migration is taking place on * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, @@ -2279,7 +2282,7 @@ err: * actually starting migrating. */ int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_mgctx *mgctx, struct cgroup_root *root) + struct cgroup_mgctx *mgctx) { struct task_struct *task; @@ -2299,7 +2302,7 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - return cgroup_migrate_execute(mgctx, root); + return cgroup_migrate_execute(mgctx); } /** @@ -2335,7 +2338,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) - ret = cgroup_migrate(leader, threadgroup, &mgctx, dst_cgrp->root); + ret = cgroup_migrate(leader, threadgroup, &mgctx); cgroup_migrate_finish(&mgctx); @@ -2539,7 +2542,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) } spin_unlock_irq(&css_set_lock); - ret = cgroup_migrate_execute(&mgctx, cgrp->root); + ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); -- cgit v1.2.3-70-g09d2 From c3edc4010e9d102eb7b8f17d15c2ebc425fed63c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Feb 2017 08:35:14 +0100 Subject: sched/headers: Move task_struct::signal and task_struct::sighand types and accessors into MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit task_struct::signal and task_struct::sighand are pointers, which would normally make it straightforward to not define those types in sched.h. That is not so, because the types are accompanied by a myriad of APIs (macros and inline functions) that dereference them. Split the types and the APIs out of sched.h and move them into a new header, . With this change sched.h does not know about 'struct signal' and 'struct sighand' anymore, trying to put accessors into sched.h as a test fails the following way: ./include/linux/sched.h: In function ‘test_signal_types’: ./include/linux/sched.h:2461:18: error: dereferencing pointer to incomplete type ‘struct signal_struct’ ^ This reduces the size and complexity of sched.h significantly. Update all headers and .c code that relied on getting the signal handling functionality from to include . The list of affected files in the preparatory patch was partly generated by grepping for the APIs, and partly by doing coverage build testing, both all[yes|mod|def|no]config builds on 64-bit and 32-bit x86, and an array of cross-architecture builds. Nevertheless some (trivial) build breakage is still expected related to rare Kconfig combinations and in-flight patches to various kernel code, but most of it should be handled by this patch. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/nwfpe/fpmodule.c | 2 +- arch/sh/kernel/cpu/sh4/fpu.c | 3 +- drivers/net/tap.c | 2 +- include/linux/sched.h | 499 +----------------------------------------- include/linux/sched/signal.h | 502 +++++++++++++++++++++++++++++++++++++++++++ kernel/cgroup/cgroup-v1.c | 1 + mm/vmalloc.c | 2 +- net/smc/af_smc.c | 2 + net/smc/smc_clc.c | 2 + net/smc/smc_close.c | 2 + net/smc/smc_rx.c | 2 + net/smc/smc_tx.c | 2 + 12 files changed, 520 insertions(+), 501 deletions(-) (limited to 'kernel/cgroup/cgroup-v1.c') diff --git a/arch/arm/nwfpe/fpmodule.c b/arch/arm/nwfpe/fpmodule.c index ec717c190e2c..1365e8650843 100644 --- a/arch/arm/nwfpe/fpmodule.c +++ b/arch/arm/nwfpe/fpmodule.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/kernel/cpu/sh4/fpu.c b/arch/sh/kernel/cpu/sh4/fpu.c index 69ab4d3c8d41..95fd2dcb83da 100644 --- a/arch/sh/kernel/cpu/sh4/fpu.c +++ b/arch/sh/kernel/cpu/sh4/fpu.c @@ -10,8 +10,7 @@ * * FIXME! These routines have not been tested for big endian case. */ -#include -#include +#include #include #include #include diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 35b55a2fa1a1..4d4173d25dd0 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/sched.h b/include/linux/sched.h index 7934cd0acbc7..c1586104d4c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,6 +71,9 @@ struct blk_plug; struct filename; struct nameidata; +struct signal_struct; +struct sighand_struct; + extern unsigned long total_forks; extern int nr_threads; DECLARE_PER_CPU(unsigned long, process_counts); @@ -361,13 +364,6 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif -struct sighand_struct { - atomic_t count; - struct k_sigaction action[_NSIG]; - spinlock_t siglock; - wait_queue_head_t signalfd_wqh; -}; - struct pacct_struct { int ac_flag; long ac_exitcode; @@ -485,195 +481,6 @@ struct thread_group_cputimer { #include struct autogroup; -/* - * NOTE! "signal_struct" does not have its own - * locking, because a shared signal_struct always - * implies a shared sighand_struct, so locking - * sighand_struct is always a proper superset of - * the locking of signal_struct. - */ -struct signal_struct { - atomic_t sigcnt; - atomic_t live; - int nr_threads; - struct list_head thread_head; - - wait_queue_head_t wait_chldexit; /* for wait4() */ - - /* current thread group signal load-balancing target: */ - struct task_struct *curr_target; - - /* shared signal handling: */ - struct sigpending shared_pending; - - /* thread group exit support */ - int group_exit_code; - /* overloaded: - * - notify group_exit_task when ->count is equal to notify_count - * - everyone except group_exit_task is stopped during signal delivery - * of fatal signals, group_exit_task processes the signal. - */ - int notify_count; - struct task_struct *group_exit_task; - - /* thread group stop support, overloads group_exit_code too */ - int group_stop_count; - unsigned int flags; /* see SIGNAL_* flags below */ - - /* - * PR_SET_CHILD_SUBREAPER marks a process, like a service - * manager, to re-parent orphan (double-forking) child processes - * to this process instead of 'init'. The service manager is - * able to receive SIGCHLD signals and is able to investigate - * the process until it calls wait(). All children of this - * process will inherit a flag if they should look for a - * child_subreaper process at exit. - */ - unsigned int is_child_subreaper:1; - unsigned int has_child_subreaper:1; - -#ifdef CONFIG_POSIX_TIMERS - - /* POSIX.1b Interval Timers */ - int posix_timer_id; - struct list_head posix_timers; - - /* ITIMER_REAL timer for the process */ - struct hrtimer real_timer; - ktime_t it_real_incr; - - /* - * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use - * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these - * values are defined to 0 and 1 respectively - */ - struct cpu_itimer it[2]; - - /* - * Thread group totals for process CPU timers. - * See thread_group_cputimer(), et al, for details. - */ - struct thread_group_cputimer cputimer; - - /* Earliest-expiration cache. */ - struct task_cputime cputime_expires; - - struct list_head cpu_timers[3]; - -#endif - - struct pid *leader_pid; - -#ifdef CONFIG_NO_HZ_FULL - atomic_t tick_dep_mask; -#endif - - struct pid *tty_old_pgrp; - - /* boolean value for session group leader */ - int leader; - - struct tty_struct *tty; /* NULL if no tty */ - -#ifdef CONFIG_SCHED_AUTOGROUP - struct autogroup *autogroup; -#endif - /* - * Cumulative resource counters for dead threads in the group, - * and for reaped dead child processes forked by this group. - * Live threads maintain their own counters and add to these - * in __exit_signal, except for the group leader. - */ - seqlock_t stats_lock; - u64 utime, stime, cutime, cstime; - u64 gtime; - u64 cgtime; - struct prev_cputime prev_cputime; - unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; - unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; - unsigned long inblock, oublock, cinblock, coublock; - unsigned long maxrss, cmaxrss; - struct task_io_accounting ioac; - - /* - * Cumulative ns of schedule CPU time fo dead threads in the - * group, not including a zombie group leader, (This only differs - * from jiffies_to_ns(utime + stime) if sched_clock uses something - * other than jiffies.) - */ - unsigned long long sum_sched_runtime; - - /* - * We don't bother to synchronize most readers of this at all, - * because there is no reader checking a limit that actually needs - * to get both rlim_cur and rlim_max atomically, and either one - * alone is a single word that can safely be read normally. - * getrlimit/setrlimit use task_lock(current->group_leader) to - * protect this instead of the siglock, because they really - * have no need to disable irqs. - */ - struct rlimit rlim[RLIM_NLIMITS]; - -#ifdef CONFIG_BSD_PROCESS_ACCT - struct pacct_struct pacct; /* per-process accounting information */ -#endif -#ifdef CONFIG_TASKSTATS - struct taskstats *stats; -#endif -#ifdef CONFIG_AUDIT - unsigned audit_tty; - struct tty_audit_buf *tty_audit_buf; -#endif - - /* - * Thread is the potential origin of an oom condition; kill first on - * oom - */ - bool oom_flag_origin; - short oom_score_adj; /* OOM kill score adjustment */ - short oom_score_adj_min; /* OOM kill score adjustment min value. - * Only settable by CAP_SYS_RESOURCE. */ - struct mm_struct *oom_mm; /* recorded mm when the thread group got - * killed by the oom killer */ - - struct mutex cred_guard_mutex; /* guard against foreign influences on - * credential calculations - * (notably. ptrace) */ -}; - -/* - * Bits in flags field of signal_struct. - */ -#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ -#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ -#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ -#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */ -/* - * Pending notifications to parent. - */ -#define SIGNAL_CLD_STOPPED 0x00000010 -#define SIGNAL_CLD_CONTINUED 0x00000020 -#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) - -#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ - -#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ - SIGNAL_STOP_CONTINUED) - -static inline void signal_set_stop_flags(struct signal_struct *sig, - unsigned int flags) -{ - WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); - sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; -} - -/* If true, all threads except ->group_exit_task have pending SIGKILL */ -static inline int signal_group_exit(const struct signal_struct *sig) -{ - return (sig->flags & SIGNAL_GROUP_EXIT) || - (sig->group_exit_task != NULL); -} - /* * Some day this will be a full-fledged user tracking system.. */ @@ -2126,190 +1933,8 @@ extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); -extern void flush_signals(struct task_struct *); -extern void ignore_signals(struct task_struct *); -extern void flush_signal_handlers(struct task_struct *, int force_default); -extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); - -static inline int kernel_dequeue_signal(siginfo_t *info) -{ - struct task_struct *tsk = current; - siginfo_t __info; - int ret; - - spin_lock_irq(&tsk->sighand->siglock); - ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info); - spin_unlock_irq(&tsk->sighand->siglock); - - return ret; -} - -static inline void kernel_signal_stop(void) -{ - spin_lock_irq(¤t->sighand->siglock); - if (current->jobctl & JOBCTL_STOP_DEQUEUED) - __set_current_state(TASK_STOPPED); - spin_unlock_irq(¤t->sighand->siglock); - - schedule(); -} extern void release_task(struct task_struct * p); -extern int send_sig_info(int, struct siginfo *, struct task_struct *); -extern int force_sigsegv(int, struct task_struct *); -extern int force_sig_info(int, struct siginfo *, struct task_struct *); -extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); -extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); -extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, - const struct cred *, u32); -extern int kill_pgrp(struct pid *pid, int sig, int priv); -extern int kill_pid(struct pid *pid, int sig, int priv); -extern int kill_proc_info(int, struct siginfo *, pid_t); -extern __must_check bool do_notify_parent(struct task_struct *, int); -extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); -extern void force_sig(int, struct task_struct *); -extern int send_sig(int, struct task_struct *, int); -extern int zap_other_threads(struct task_struct *p); -extern struct sigqueue *sigqueue_alloc(void); -extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); -extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); - -#ifdef TIF_RESTORE_SIGMASK -/* - * Legacy restore_sigmask accessors. These are inefficient on - * SMP architectures because they require atomic operations. - */ - -/** - * set_restore_sigmask() - make sure saved_sigmask processing gets done - * - * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code - * will run before returning to user mode, to process the flag. For - * all callers, TIF_SIGPENDING is already set or it's no harm to set - * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the - * arch code will notice on return to user mode, in case those bits - * are scarce. We set TIF_SIGPENDING here to ensure that the arch - * signal code always gets run when TIF_RESTORE_SIGMASK is set. - */ -static inline void set_restore_sigmask(void) -{ - set_thread_flag(TIF_RESTORE_SIGMASK); - WARN_ON(!test_thread_flag(TIF_SIGPENDING)); -} -static inline void clear_restore_sigmask(void) -{ - clear_thread_flag(TIF_RESTORE_SIGMASK); -} -static inline bool test_restore_sigmask(void) -{ - return test_thread_flag(TIF_RESTORE_SIGMASK); -} -static inline bool test_and_clear_restore_sigmask(void) -{ - return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); -} - -#else /* TIF_RESTORE_SIGMASK */ - -/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ -static inline void set_restore_sigmask(void) -{ - current->restore_sigmask = true; - WARN_ON(!test_thread_flag(TIF_SIGPENDING)); -} -static inline void clear_restore_sigmask(void) -{ - current->restore_sigmask = false; -} -static inline bool test_restore_sigmask(void) -{ - return current->restore_sigmask; -} -static inline bool test_and_clear_restore_sigmask(void) -{ - if (!current->restore_sigmask) - return false; - current->restore_sigmask = false; - return true; -} -#endif - -static inline void restore_saved_sigmask(void) -{ - if (test_and_clear_restore_sigmask()) - __set_current_blocked(¤t->saved_sigmask); -} - -static inline sigset_t *sigmask_to_save(void) -{ - sigset_t *res = ¤t->blocked; - if (unlikely(test_restore_sigmask())) - res = ¤t->saved_sigmask; - return res; -} - -static inline int kill_cad_pid(int sig, int priv) -{ - return kill_pid(cad_pid, sig, priv); -} - -/* These can be the second arg to send_sig_info/send_group_sig_info. */ -#define SEND_SIG_NOINFO ((struct siginfo *) 0) -#define SEND_SIG_PRIV ((struct siginfo *) 1) -#define SEND_SIG_FORCED ((struct siginfo *) 2) - -/* - * True if we are on the alternate signal stack. - */ -static inline int on_sig_stack(unsigned long sp) -{ - /* - * If the signal stack is SS_AUTODISARM then, by construction, we - * can't be on the signal stack unless user code deliberately set - * SS_AUTODISARM when we were already on it. - * - * This improves reliability: if user state gets corrupted such that - * the stack pointer points very close to the end of the signal stack, - * then this check will enable the signal to be handled anyway. - */ - if (current->sas_ss_flags & SS_AUTODISARM) - return 0; - -#ifdef CONFIG_STACK_GROWSUP - return sp >= current->sas_ss_sp && - sp - current->sas_ss_sp < current->sas_ss_size; -#else - return sp > current->sas_ss_sp && - sp - current->sas_ss_sp <= current->sas_ss_size; -#endif -} - -static inline int sas_ss_flags(unsigned long sp) -{ - if (!current->sas_ss_size) - return SS_DISABLE; - - return on_sig_stack(sp) ? SS_ONSTACK : 0; -} - -static inline void sas_ss_reset(struct task_struct *p) -{ - p->sas_ss_sp = 0; - p->sas_ss_size = 0; - p->sas_ss_flags = SS_DISABLE; -} - -static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) -{ - if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) -#ifdef CONFIG_STACK_GROWSUP - return current->sas_ss_sp; -#else - return current->sas_ss_sp + current->sas_ss_size; -#endif - return sp; -} #ifdef CONFIG_HAVE_COPY_THREAD_TLS extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, @@ -2338,10 +1963,8 @@ static inline void exit_thread(struct task_struct *tsk) #endif extern void exit_files(struct task_struct *); -extern void __cleanup_sighand(struct sighand_struct *); extern void exit_itimers(struct signal_struct *); -extern void flush_itimer_signals(void); extern void do_group_exit(int); @@ -2376,81 +1999,6 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, } #endif -#define tasklist_empty() \ - list_empty(&init_task.tasks) - -#define next_task(p) \ - list_entry_rcu((p)->tasks.next, struct task_struct, tasks) - -#define for_each_process(p) \ - for (p = &init_task ; (p = next_task(p)) != &init_task ; ) - -extern bool current_is_single_threaded(void); - -/* - * Careful: do_each_thread/while_each_thread is a double loop so - * 'break' will not work as expected - use goto instead. - */ -#define do_each_thread(g, t) \ - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do - -#define while_each_thread(g, t) \ - while ((t = next_thread(t)) != g) - -#define __for_each_thread(signal, t) \ - list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) - -#define for_each_thread(p, t) \ - __for_each_thread((p)->signal, t) - -/* Careful: this is a double loop, 'break' won't work as expected. */ -#define for_each_process_thread(p, t) \ - for_each_process(p) for_each_thread(p, t) - -typedef int (*proc_visitor)(struct task_struct *p, void *data); -void walk_process_tree(struct task_struct *top, proc_visitor, void *); - -static inline int get_nr_threads(struct task_struct *tsk) -{ - return tsk->signal->nr_threads; -} - -static inline bool thread_group_leader(struct task_struct *p) -{ - return p->exit_signal >= 0; -} - -/* Do to the insanities of de_thread it is possible for a process - * to have the pid of the thread group leader without actually being - * the thread group leader. For iteration through the pids in proc - * all we care about is that we have a task with the appropriate - * pid, we don't actually care if we have the right task. - */ -static inline bool has_group_leader_pid(struct task_struct *p) -{ - return task_pid(p) == p->signal->leader_pid; -} - -static inline -bool same_thread_group(struct task_struct *p1, struct task_struct *p2) -{ - return p1->signal == p2->signal; -} - -static inline struct task_struct *next_thread(const struct task_struct *p) -{ - return list_entry_rcu(p->thread_group.next, - struct task_struct, thread_group); -} - -static inline int thread_group_empty(struct task_struct *p) -{ - return list_empty(&p->thread_group); -} - -#define delay_group_leader(p) \ - (thread_group_leader(p) && !thread_group_empty(p)) - /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also @@ -2471,25 +2019,6 @@ static inline void task_unlock(struct task_struct *p) spin_unlock(&p->alloc_lock); } -extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, - unsigned long *flags); - -static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, - unsigned long *flags) -{ - struct sighand_struct *ret; - - ret = __lock_task_sighand(tsk, flags); - (void)__cond_lock(&tsk->sighand->siglock, ret); - return ret; -} - -static inline void unlock_task_sighand(struct task_struct *tsk, - unsigned long *flags) -{ - spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); -} - #ifdef CONFIG_THREAD_INFO_IN_TASK static inline struct thread_info *task_thread_info(struct task_struct *task) @@ -2862,28 +2391,6 @@ static inline void mm_update_next_owner(struct mm_struct *mm) } #endif /* CONFIG_MEMCG */ -static inline unsigned long task_rlimit(const struct task_struct *tsk, - unsigned int limit) -{ - return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); -} - -static inline unsigned long task_rlimit_max(const struct task_struct *tsk, - unsigned int limit) -{ - return READ_ONCE(tsk->signal->rlim[limit].rlim_max); -} - -static inline unsigned long rlimit(unsigned int limit) -{ - return task_rlimit(current, limit); -} - -static inline unsigned long rlimit_max(unsigned int limit) -{ - return task_rlimit_max(current, limit); -} - #define SCHED_CPUFREQ_RT (1U << 0) #define SCHED_CPUFREQ_DL (1U << 1) #define SCHED_CPUFREQ_IOWAIT (1U << 2) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index c6958a53fef3..53fe5450f431 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -8,4 +8,506 @@ #include #include +/* + * Types defining task->signal and task->sighand and APIs using them: + */ + +struct sighand_struct { + atomic_t count; + struct k_sigaction action[_NSIG]; + spinlock_t siglock; + wait_queue_head_t signalfd_wqh; +}; + +/* + * NOTE! "signal_struct" does not have its own + * locking, because a shared signal_struct always + * implies a shared sighand_struct, so locking + * sighand_struct is always a proper superset of + * the locking of signal_struct. + */ +struct signal_struct { + atomic_t sigcnt; + atomic_t live; + int nr_threads; + struct list_head thread_head; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + + /* current thread group signal load-balancing target: */ + struct task_struct *curr_target; + + /* shared signal handling: */ + struct sigpending shared_pending; + + /* thread group exit support */ + int group_exit_code; + /* overloaded: + * - notify group_exit_task when ->count is equal to notify_count + * - everyone except group_exit_task is stopped during signal delivery + * of fatal signals, group_exit_task processes the signal. + */ + int notify_count; + struct task_struct *group_exit_task; + + /* thread group stop support, overloads group_exit_code too */ + int group_stop_count; + unsigned int flags; /* see SIGNAL_* flags below */ + + /* + * PR_SET_CHILD_SUBREAPER marks a process, like a service + * manager, to re-parent orphan (double-forking) child processes + * to this process instead of 'init'. The service manager is + * able to receive SIGCHLD signals and is able to investigate + * the process until it calls wait(). All children of this + * process will inherit a flag if they should look for a + * child_subreaper process at exit. + */ + unsigned int is_child_subreaper:1; + unsigned int has_child_subreaper:1; + +#ifdef CONFIG_POSIX_TIMERS + + /* POSIX.1b Interval Timers */ + int posix_timer_id; + struct list_head posix_timers; + + /* ITIMER_REAL timer for the process */ + struct hrtimer real_timer; + ktime_t it_real_incr; + + /* + * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use + * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these + * values are defined to 0 and 1 respectively + */ + struct cpu_itimer it[2]; + + /* + * Thread group totals for process CPU timers. + * See thread_group_cputimer(), et al, for details. + */ + struct thread_group_cputimer cputimer; + + /* Earliest-expiration cache. */ + struct task_cputime cputime_expires; + + struct list_head cpu_timers[3]; + +#endif + + struct pid *leader_pid; + +#ifdef CONFIG_NO_HZ_FULL + atomic_t tick_dep_mask; +#endif + + struct pid *tty_old_pgrp; + + /* boolean value for session group leader */ + int leader; + + struct tty_struct *tty; /* NULL if no tty */ + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + /* + * Cumulative resource counters for dead threads in the group, + * and for reaped dead child processes forked by this group. + * Live threads maintain their own counters and add to these + * in __exit_signal, except for the group leader. + */ + seqlock_t stats_lock; + u64 utime, stime, cutime, cstime; + u64 gtime; + u64 cgtime; + struct prev_cputime prev_cputime; + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + unsigned long inblock, oublock, cinblock, coublock; + unsigned long maxrss, cmaxrss; + struct task_io_accounting ioac; + + /* + * Cumulative ns of schedule CPU time fo dead threads in the + * group, not including a zombie group leader, (This only differs + * from jiffies_to_ns(utime + stime) if sched_clock uses something + * other than jiffies.) + */ + unsigned long long sum_sched_runtime; + + /* + * We don't bother to synchronize most readers of this at all, + * because there is no reader checking a limit that actually needs + * to get both rlim_cur and rlim_max atomically, and either one + * alone is a single word that can safely be read normally. + * getrlimit/setrlimit use task_lock(current->group_leader) to + * protect this instead of the siglock, because they really + * have no need to disable irqs. + */ + struct rlimit rlim[RLIM_NLIMITS]; + +#ifdef CONFIG_BSD_PROCESS_ACCT + struct pacct_struct pacct; /* per-process accounting information */ +#endif +#ifdef CONFIG_TASKSTATS + struct taskstats *stats; +#endif +#ifdef CONFIG_AUDIT + unsigned audit_tty; + struct tty_audit_buf *tty_audit_buf; +#endif + + /* + * Thread is the potential origin of an oom condition; kill first on + * oom + */ + bool oom_flag_origin; + short oom_score_adj; /* OOM kill score adjustment */ + short oom_score_adj_min; /* OOM kill score adjustment min value. + * Only settable by CAP_SYS_RESOURCE. */ + struct mm_struct *oom_mm; /* recorded mm when the thread group got + * killed by the oom killer */ + + struct mutex cred_guard_mutex; /* guard against foreign influences on + * credential calculations + * (notably. ptrace) */ +}; + +/* + * Bits in flags field of signal_struct. + */ +#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ +#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ +#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ +#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */ +/* + * Pending notifications to parent. + */ +#define SIGNAL_CLD_STOPPED 0x00000010 +#define SIGNAL_CLD_CONTINUED 0x00000020 +#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) + +#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ + +#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ + SIGNAL_STOP_CONTINUED) + +static inline void signal_set_stop_flags(struct signal_struct *sig, + unsigned int flags) +{ + WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); + sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; +} + +/* If true, all threads except ->group_exit_task have pending SIGKILL */ +static inline int signal_group_exit(const struct signal_struct *sig) +{ + return (sig->flags & SIGNAL_GROUP_EXIT) || + (sig->group_exit_task != NULL); +} + +extern void flush_signals(struct task_struct *); +extern void ignore_signals(struct task_struct *); +extern void flush_signal_handlers(struct task_struct *, int force_default); +extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); + +static inline int kernel_dequeue_signal(siginfo_t *info) +{ + struct task_struct *tsk = current; + siginfo_t __info; + int ret; + + spin_lock_irq(&tsk->sighand->siglock); + ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info); + spin_unlock_irq(&tsk->sighand->siglock); + + return ret; +} + +static inline void kernel_signal_stop(void) +{ + spin_lock_irq(¤t->sighand->siglock); + if (current->jobctl & JOBCTL_STOP_DEQUEUED) + __set_current_state(TASK_STOPPED); + spin_unlock_irq(¤t->sighand->siglock); + + schedule(); +} +extern int send_sig_info(int, struct siginfo *, struct task_struct *); +extern int force_sigsegv(int, struct task_struct *); +extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); +extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); +extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, + const struct cred *, u32); +extern int kill_pgrp(struct pid *pid, int sig, int priv); +extern int kill_pid(struct pid *pid, int sig, int priv); +extern int kill_proc_info(int, struct siginfo *, pid_t); +extern __must_check bool do_notify_parent(struct task_struct *, int); +extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); +extern void force_sig(int, struct task_struct *); +extern int send_sig(int, struct task_struct *, int); +extern int zap_other_threads(struct task_struct *p); +extern struct sigqueue *sigqueue_alloc(void); +extern void sigqueue_free(struct sigqueue *); +extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); +extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); + +#ifdef TIF_RESTORE_SIGMASK +/* + * Legacy restore_sigmask accessors. These are inefficient on + * SMP architectures because they require atomic operations. + */ + +/** + * set_restore_sigmask() - make sure saved_sigmask processing gets done + * + * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code + * will run before returning to user mode, to process the flag. For + * all callers, TIF_SIGPENDING is already set or it's no harm to set + * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the + * arch code will notice on return to user mode, in case those bits + * are scarce. We set TIF_SIGPENDING here to ensure that the arch + * signal code always gets run when TIF_RESTORE_SIGMASK is set. + */ +static inline void set_restore_sigmask(void) +{ + set_thread_flag(TIF_RESTORE_SIGMASK); + WARN_ON(!test_thread_flag(TIF_SIGPENDING)); +} +static inline void clear_restore_sigmask(void) +{ + clear_thread_flag(TIF_RESTORE_SIGMASK); +} +static inline bool test_restore_sigmask(void) +{ + return test_thread_flag(TIF_RESTORE_SIGMASK); +} +static inline bool test_and_clear_restore_sigmask(void) +{ + return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); +} + +#else /* TIF_RESTORE_SIGMASK */ + +/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ +static inline void set_restore_sigmask(void) +{ + current->restore_sigmask = true; + WARN_ON(!test_thread_flag(TIF_SIGPENDING)); +} +static inline void clear_restore_sigmask(void) +{ + current->restore_sigmask = false; +} +static inline bool test_restore_sigmask(void) +{ + return current->restore_sigmask; +} +static inline bool test_and_clear_restore_sigmask(void) +{ + if (!current->restore_sigmask) + return false; + current->restore_sigmask = false; + return true; +} +#endif + +static inline void restore_saved_sigmask(void) +{ + if (test_and_clear_restore_sigmask()) + __set_current_blocked(¤t->saved_sigmask); +} + +static inline sigset_t *sigmask_to_save(void) +{ + sigset_t *res = ¤t->blocked; + if (unlikely(test_restore_sigmask())) + res = ¤t->saved_sigmask; + return res; +} + +static inline int kill_cad_pid(int sig, int priv) +{ + return kill_pid(cad_pid, sig, priv); +} + +/* These can be the second arg to send_sig_info/send_group_sig_info. */ +#define SEND_SIG_NOINFO ((struct siginfo *) 0) +#define SEND_SIG_PRIV ((struct siginfo *) 1) +#define SEND_SIG_FORCED ((struct siginfo *) 2) + +/* + * True if we are on the alternate signal stack. + */ +static inline int on_sig_stack(unsigned long sp) +{ + /* + * If the signal stack is SS_AUTODISARM then, by construction, we + * can't be on the signal stack unless user code deliberately set + * SS_AUTODISARM when we were already on it. + * + * This improves reliability: if user state gets corrupted such that + * the stack pointer points very close to the end of the signal stack, + * then this check will enable the signal to be handled anyway. + */ + if (current->sas_ss_flags & SS_AUTODISARM) + return 0; + +#ifdef CONFIG_STACK_GROWSUP + return sp >= current->sas_ss_sp && + sp - current->sas_ss_sp < current->sas_ss_size; +#else + return sp > current->sas_ss_sp && + sp - current->sas_ss_sp <= current->sas_ss_size; +#endif +} + +static inline int sas_ss_flags(unsigned long sp) +{ + if (!current->sas_ss_size) + return SS_DISABLE; + + return on_sig_stack(sp) ? SS_ONSTACK : 0; +} + +static inline void sas_ss_reset(struct task_struct *p) +{ + p->sas_ss_sp = 0; + p->sas_ss_size = 0; + p->sas_ss_flags = SS_DISABLE; +} + +static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) +{ + if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) +#ifdef CONFIG_STACK_GROWSUP + return current->sas_ss_sp; +#else + return current->sas_ss_sp + current->sas_ss_size; +#endif + return sp; +} + +extern void __cleanup_sighand(struct sighand_struct *); +extern void flush_itimer_signals(void); + +#define tasklist_empty() \ + list_empty(&init_task.tasks) + +#define next_task(p) \ + list_entry_rcu((p)->tasks.next, struct task_struct, tasks) + +#define for_each_process(p) \ + for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + +extern bool current_is_single_threaded(void); + +/* + * Careful: do_each_thread/while_each_thread is a double loop so + * 'break' will not work as expected - use goto instead. + */ +#define do_each_thread(g, t) \ + for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do + +#define while_each_thread(g, t) \ + while ((t = next_thread(t)) != g) + +#define __for_each_thread(signal, t) \ + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + +#define for_each_thread(p, t) \ + __for_each_thread((p)->signal, t) + +/* Careful: this is a double loop, 'break' won't work as expected. */ +#define for_each_process_thread(p, t) \ + for_each_process(p) for_each_thread(p, t) + +typedef int (*proc_visitor)(struct task_struct *p, void *data); +void walk_process_tree(struct task_struct *top, proc_visitor, void *); + +static inline int get_nr_threads(struct task_struct *tsk) +{ + return tsk->signal->nr_threads; +} + +static inline bool thread_group_leader(struct task_struct *p) +{ + return p->exit_signal >= 0; +} + +/* Do to the insanities of de_thread it is possible for a process + * to have the pid of the thread group leader without actually being + * the thread group leader. For iteration through the pids in proc + * all we care about is that we have a task with the appropriate + * pid, we don't actually care if we have the right task. + */ +static inline bool has_group_leader_pid(struct task_struct *p) +{ + return task_pid(p) == p->signal->leader_pid; +} + +static inline +bool same_thread_group(struct task_struct *p1, struct task_struct *p2) +{ + return p1->signal == p2->signal; +} + +static inline struct task_struct *next_thread(const struct task_struct *p) +{ + return list_entry_rcu(p->thread_group.next, + struct task_struct, thread_group); +} + +static inline int thread_group_empty(struct task_struct *p) +{ + return list_empty(&p->thread_group); +} + +#define delay_group_leader(p) \ + (thread_group_leader(p) && !thread_group_empty(p)) + +extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, + unsigned long *flags); + +static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, + unsigned long *flags) +{ + struct sighand_struct *ret; + + ret = __lock_task_sighand(tsk, flags); + (void)__cond_lock(&tsk->sighand->siglock, ret); + return ret; +} + +static inline void unlock_task_sighand(struct task_struct *tsk, + unsigned long *flags) +{ + spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); +} + +static inline unsigned long task_rlimit(const struct task_struct *tsk, + unsigned int limit) +{ + return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); +} + +static inline unsigned long task_rlimit_max(const struct task_struct *tsk, + unsigned int limit) +{ + return READ_ONCE(tsk->signal->rlim[limit].rlim_max); +} + +static inline unsigned long rlimit(unsigned int limit) +{ + return task_rlimit(current, limit); +} + +static inline unsigned long rlimit_max(unsigned int limit) +{ + return task_rlimit_max(current, limit); +} + #endif /* _LINUX_SCHED_SIGNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fc34bcf2329f..08d2cb605101 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/vmalloc.c b/mm/vmalloc.c index be93949b4885..b4024d688f38 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5d4208ad029e..85837ab90e89 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,6 +27,8 @@ #include #include #include +#include + #include #include #include diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index cc6b6f8651eb..e41f594a1e1d 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -11,6 +11,8 @@ #include #include +#include + #include #include diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 03dfcc6b7661..67a71d170bed 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -9,6 +9,8 @@ */ #include +#include + #include #include "smc.h" diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 5d1878732f46..c4ef9a4ec569 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -11,6 +11,8 @@ #include #include +#include + #include #include "smc.h" diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 6e73b28915ea..69a0013dd25c 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -15,6 +15,8 @@ #include #include #include +#include + #include #include "smc.h" -- cgit v1.2.3-70-g09d2 From 56cd697366b6d6f67acb6c58ac7f3b185d11ef07 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 6 Feb 2017 10:57:33 +0100 Subject: sched/headers: Move the task_lock()/unlock() APIs to The task_lock()/task_unlock() APIs are not realated to core scheduling, they are task lifetime APIs, i.e. they belong into . Move them. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 20 -------------------- include/linux/sched/task.h | 20 ++++++++++++++++++++ kernel/cgroup/cgroup-v1.c | 1 + kernel/cgroup/namespace.c | 2 +- 4 files changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel/cgroup/cgroup-v1.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ab105a2a8f9..d481c129a822 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1526,26 +1526,6 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, } #endif -/* - * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring - * subscriptions and synchronises with wait4(). Also used in procfs. Also - * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. And ->vfork_done. - * - * Nests both inside and outside of read_lock(&tasklist_lock). - * It must not be nested with write_lock_irq(&tasklist_lock), - * neither inside nor outside. - */ -static inline void task_lock(struct task_struct *p) -{ - spin_lock(&p->alloc_lock); -} - -static inline void task_unlock(struct task_struct *p) -{ - spin_unlock(&p->alloc_lock); -} - /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 1be049a18d1b..2be9fde588a7 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -91,4 +91,24 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) } #endif +/* + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring + * subscriptions and synchronises with wait4(). Also used in procfs. Also + * pins the final release of task.io_context. Also protects ->cpuset and + * ->cgroup.subsys[]. And ->vfork_done. + * + * Nests both inside and outside of read_lock(&tasklist_lock). + * It must not be nested with write_lock_irq(&tasklist_lock), + * neither inside nor outside. + */ +static inline void task_lock(struct task_struct *p) +{ + spin_lock(&p->alloc_lock); +} + +static inline void task_unlock(struct task_struct *p) +{ + spin_unlock(&p->alloc_lock); +} + #endif /* _LINUX_SCHED_TASK_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 08d2cb605101..abc585858685 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index cff7ea62c38f..96d38dab6fb2 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -1,6 +1,6 @@ #include "cgroup-internal.h" -#include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 50ff9d130014f7b19541dbf607a615a72b75b778 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Feb 2017 16:03:58 +0100 Subject: sched/headers: Remove from It's not used by any of the scheduler methods, but needs it to pick up STACK_END_MAGIC. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/cgroup/cgroup-v1.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup/cgroup-v1.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index bd0111a06aa2..559be4f1aee5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -22,7 +22,6 @@ #include #include #include -#include #include diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index abc585858685..56eba9caa632 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From b6a6759daf55dade2b65089957832759d502acfb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 25 Feb 2017 01:56:48 -0800 Subject: cgroups: censor kernel pointer in debug files As found in grsecurity, this avoids exposing a kernel pointer through the cgroup debug entries. Signed-off-by: Kees Cook Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup/cgroup-v1.c') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 56eba9caa632..1dc22f6b49f5 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1329,7 +1329,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct task_struct *task; int count = 0; - seq_printf(seq, "css_set %p\n", cset); + seq_printf(seq, "css_set %pK\n", cset); list_for_each_entry(task, &cset->tasks, cg_list) { if (count++ > MAX_TASKS_SHOWN_PER_CSS) -- cgit v1.2.3-70-g09d2