diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 337 |
1 files changed, 56 insertions, 281 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e0aeb32415ff..8b729c278b64 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -60,6 +60,7 @@ #include <linux/poll.h> #include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> +#include <linux/file.h> #include <linux/atomic.h> @@ -89,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); /* + * cgroup destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_destroy_wq; + +/* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by @@ -124,38 +133,6 @@ struct cfent { }; /* - * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when - * cgroup_subsys->use_id != 0. - */ -#define CSS_ID_MAX (65535) -struct css_id { - /* - * The css to which this ID points. This pointer is set to valid value - * after cgroup is populated. If cgroup is removed, this will be NULL. - * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_tryget() - * should be used for avoiding race. - */ - struct cgroup_subsys_state __rcu *css; - /* - * ID of this css. - */ - unsigned short id; - /* - * Depth in hierarchy which this ID belongs to. - */ - unsigned short depth; - /* - * ID is freed by RCU. (and lookup routine is RCU safe.) - */ - struct rcu_head rcu_head; - /* - * Hierarchy of CSS ID belongs to. - */ - unsigned short stack[0]; /* Array of Length (depth+1) */ -}; - -/* * cgroup_event represents events which userspace want to receive. */ struct cgroup_event { @@ -222,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); +static int cgroup_file_release(struct inode *inode, struct file *file); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -386,9 +364,6 @@ struct cgrp_cset_link { static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; -static int cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *css); - /* * css_set_lock protects the list of css_set objects, and the chain of * tasks off each css_set. Nests outside task->alloc_lock due to @@ -840,8 +815,6 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys_state *child_css); - static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -907,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head) struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - schedule_work(&cgrp->destroy_work); + queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@ -931,11 +904,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } -static int cgroup_delete(const struct dentry *d) -{ - return 1; -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1522,7 +1490,7 @@ static int cgroup_get_rootdir(struct super_block *sb) { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, - .d_delete = cgroup_delete, + .d_delete = always_delete_dentry, }; struct inode *inode = @@ -2038,7 +2006,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) - continue; + goto next; /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); @@ -2046,7 +2014,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) - continue; + goto next; /* * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. @@ -2054,7 +2022,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; - + next: if (!threadgroup) break; } while_each_thread(leader, tsk); @@ -2462,7 +2430,7 @@ static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, - .release = single_release, + .release = cgroup_file_release, }; static int cgroup_file_open(struct inode *inode, struct file *file) @@ -2523,6 +2491,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file) ret = cft->release(inode, file); if (css->ss) css_put(css); + if (file->f_op == &cgroup_seqfile_operations) + single_release(inode, file); return ret; } @@ -3187,11 +3157,9 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, visit the leftmost descendant */ - if (!pos) { - next = css_leftmost_descendant(root); - return next != root ? next : NULL; - } + /* if first iteration, visit leftmost descendant which may be @root */ + if (!pos) + return css_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) @@ -4034,8 +4002,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, struct cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; - struct file *efile; - struct file *cfile; + struct fd efile; + struct fd cfile; char *endp; int ret; @@ -4058,31 +4026,31 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, init_waitqueue_func_entry(&event->wait, cgroup_event_wake); INIT_WORK(&event->remove, cgroup_event_remove); - efile = eventfd_fget(efd); - if (IS_ERR(efile)) { - ret = PTR_ERR(efile); + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; goto out_kfree; } - event->eventfd = eventfd_ctx_fileget(efile); + event->eventfd = eventfd_ctx_fileget(efile.file); if (IS_ERR(event->eventfd)) { ret = PTR_ERR(event->eventfd); goto out_put_efile; } - cfile = fget(cfd); - if (!cfile) { + cfile = fdget(cfd); + if (!cfile.file) { ret = -EBADF; goto out_put_eventfd; } /* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(file_inode(cfile), MAY_READ); + ret = inode_permission(file_inode(cfile.file), MAY_READ); if (ret < 0) goto out_put_cfile; - event->cft = __file_cft(cfile); + event->cft = __file_cft(cfile.file); if (IS_ERR(event->cft)) { ret = PTR_ERR(event->cft); goto out_put_cfile; @@ -4103,7 +4071,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, ret = -EINVAL; event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss); + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); if (event->css && event->css == cfile_css && css_tryget(event->css)) ret = 0; @@ -4121,25 +4089,25 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, if (ret) goto out_put_css; - efile->f_op->poll(efile, &event->pt); + efile.file->f_op->poll(efile.file, &event->pt); spin_lock(&cgrp->event_list_lock); list_add(&event->list, &cgrp->event_list); spin_unlock(&cgrp->event_list_lock); - fput(cfile); - fput(efile); + fdput(cfile); + fdput(efile); return 0; out_put_css: css_put(event->css); out_put_cfile: - fput(cfile); + fdput(cfile); out_put_eventfd: eventfd_ctx_put(event->eventfd); out_put_efile: - fput(efile); + fdput(efile); out_kfree: kfree(event); @@ -4241,21 +4209,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) goto err; } } - - /* This cgroup is ready now */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* - * Update id->css pointer and make this css visible from - * CSS ID functions. This pointer will be dereferened - * from RCU-read-side without locks. - */ - if (id) - rcu_assign_pointer(id->css, css); - } - return 0; err: cgroup_clear_dir(cgrp, subsys_mask); @@ -4307,7 +4260,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) * css_put(). dput() requires process context which we don't have. */ INIT_WORK(&css->destroy_work, css_free_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } static void css_release(struct percpu_ref *ref) @@ -4324,7 +4277,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->cgroup = cgrp; css->ss = ss; css->flags = 0; - css->id = NULL; if (cgrp->parent) css->parent = cgroup_css(cgrp->parent, ss); @@ -4456,12 +4408,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; init_css(css, ss, cgrp); - - if (ss->use_id) { - err = alloc_css_id(css); - if (err) - goto err_free_all; - } } /* @@ -4604,7 +4550,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_killed_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } /** @@ -4926,12 +4872,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* our new subsystem will be attached to the dummy hierarchy. */ init_css(css, ss, cgroup_dummy_top); - /* init_idr must be after init_css() because it sets css->id. */ - if (ss->use_id) { - ret = cgroup_init_idr(ss, css); - if (ret) - goto err_unload; - } /* * Now we need to entangle the css into the existing css_sets. unlike @@ -4997,9 +4937,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) offline_css(cgroup_css(cgroup_dummy_top, ss)); - if (ss->use_id) - idr_destroy(&ss->idr); - /* deassign the subsys_id */ cgroup_subsys[ss->subsys_id] = NULL; @@ -5026,8 +4963,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) /* * remove subsystem's css from the cgroup_dummy_top and free it - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. note that this - * also takes care of freeing the css_id. + * the cgrp->subsys pointer to find their state. */ ss->css_free(cgroup_css(cgroup_dummy_top, ss)); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); @@ -5098,8 +5034,6 @@ int __init cgroup_init(void) for_each_builtin_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); - if (ss->use_id) - cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } /* allocate id for the dummy hierarchy */ @@ -5140,6 +5074,22 @@ out: return err; } +static int __init cgroup_wq_init(void) +{ + /* + * There isn't much point in executing destruction path in + * parallel. Good chunk is serialized with cgroup_mutex anyway. + * Use 1 for @max_active. + * + * We would prefer to do this in cgroup_init() above, but that + * is called before init_workqueues(): so leave this until after. + */ + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + BUG_ON(!cgroup_destroy_wq); + return 0; +} +core_initcall(cgroup_wq_init); + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy @@ -5519,181 +5469,6 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); -/* - * Functons for CSS ID. - */ - -/* to get ID other than 0, this should be called when !cgroup_is_dead() */ -unsigned short css_id(struct cgroup_subsys_state *css) -{ - struct css_id *cssid; - - /* - * This css_id() can return correct value when somone has refcnt - * on this or this is under rcu_read_lock(). Once css->id is allocated, - * it's unchanged until freed. - */ - cssid = rcu_dereference_raw(css->id); - - if (cssid) - return cssid->id; - return 0; -} -EXPORT_SYMBOL_GPL(css_id); - -/** - * css_is_ancestor - test "root" css is an ancestor of "child" - * @child: the css to be tested. - * @root: the css supporsed to be an ancestor of the child. - * - * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, the caller must hold rcu_read_lock(). - * But, considering usual usage, the csses should be valid objects after test. - * Assuming that the caller will do some action to the child if this returns - * returns true, the caller must take "child";s reference count. - * If "child" is valid object and this returns true, "root" is valid, too. - */ - -bool css_is_ancestor(struct cgroup_subsys_state *child, - const struct cgroup_subsys_state *root) -{ - struct css_id *child_id; - struct css_id *root_id; - - child_id = rcu_dereference(child->id); - if (!child_id) - return false; - root_id = rcu_dereference(root->id); - if (!root_id) - return false; - if (child_id->depth < root_id->depth) - return false; - if (child_id->stack[root_id->depth] != root_id->id) - return false; - return true; -} - -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) -{ - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* When this is called before css_id initialization, id can be NULL */ - if (!id) - return; - - BUG_ON(!ss->use_id); - - rcu_assign_pointer(id->css, NULL); - rcu_assign_pointer(css->id, NULL); - spin_lock(&ss->id_lock); - idr_remove(&ss->idr, id->id); - spin_unlock(&ss->id_lock); - kfree_rcu(id, rcu_head); -} -EXPORT_SYMBOL_GPL(free_css_id); - -/* - * This is called by init or create(). Then, calls to this function are - * always serialized (By cgroup_mutex() at create()). - */ - -static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) -{ - struct css_id *newid; - int ret, size; - - BUG_ON(!ss->use_id); - - size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); - newid = kzalloc(size, GFP_KERNEL); - if (!newid) - return ERR_PTR(-ENOMEM); - - idr_preload(GFP_KERNEL); - spin_lock(&ss->id_lock); - /* Don't use 0. allocates an ID of 1-65535 */ - ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); - spin_unlock(&ss->id_lock); - idr_preload_end(); - - /* Returns error when there are no free spaces for new ID.*/ - if (ret < 0) - goto err_out; - - newid->id = ret; - newid->depth = depth; - return newid; -err_out: - kfree(newid); - return ERR_PTR(ret); - -} - -static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *rootcss) -{ - struct css_id *newid; - - spin_lock_init(&ss->id_lock); - idr_init(&ss->idr); - - newid = get_new_cssid(ss, 0); - if (IS_ERR(newid)) - return PTR_ERR(newid); - - newid->stack[0] = newid->id; - RCU_INIT_POINTER(newid->css, rootcss); - RCU_INIT_POINTER(rootcss->id, newid); - return 0; -} - -static int alloc_css_id(struct cgroup_subsys_state *child_css) -{ - struct cgroup_subsys_state *parent_css = css_parent(child_css); - struct css_id *child_id, *parent_id; - int i, depth; - - parent_id = rcu_dereference_protected(parent_css->id, true); - depth = parent_id->depth + 1; - - child_id = get_new_cssid(child_css->ss, depth); - if (IS_ERR(child_id)) - return PTR_ERR(child_id); - - for (i = 0; i < depth; i++) - child_id->stack[i] = parent_id->stack[i]; - child_id->stack[depth] = child_id->id; - /* - * child_id->css pointer will be set after this cgroup is available - * see cgroup_populate_dir() - */ - rcu_assign_pointer(child_css->id, child_id); - - return 0; -} - -/** - * css_lookup - lookup css by id - * @ss: cgroup subsys to be looked into. - * @id: the id - * - * Returns pointer to cgroup_subsys_state if there is valid one with id. - * NULL if not. Should be called under rcu_read_lock() - */ -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) -{ - struct css_id *cssid = NULL; - - BUG_ON(!ss->use_id); - cssid = idr_find(&ss->idr, id); - - if (unlikely(!cssid)) - return NULL; - - return rcu_dereference(cssid->css); -} -EXPORT_SYMBOL_GPL(css_lookup); - /** * css_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest |