Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--	kernel/cgroup.c	288
1 file changed, 168 insertions, 120 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798f..b5c64327e71 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
 #include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
 #include <linux/namei.h>
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
  * account cgroups in empty hierarchies.
  */
 #define CSS_SET_HASH_BITS	7
-#define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
 	int i;
-	int index;
-	unsigned long tmp = 0UL;
+	unsigned long key = 0UL;
 
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
-		tmp += (unsigned long)css[i];
-	tmp = (tmp >> 16) ^ tmp;
+		key += (unsigned long)css[i];
+	key = (key >> 16) ^ key;
 
-	index = hash_long(tmp, CSS_SET_HASH_BITS);
-
-	return &css_set_table[index];
+	return key;
 }
 
 /* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 	}
 
 	/* This css_set is dead. unlink it and release cgroup refcounts */
-	hlist_del(&cg->hlist);
+	hash_del(&cg->hlist);
 	css_set_count--;
 
 	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 		struct cgroup *cgrp = link->cgrp;
 		list_del(&link->cg_link_list);
 		list_del(&link->cgrp_link_list);
+
+		/*
+		 * We may not be holding cgroup_mutex, and if cgrp->count is
+		 * dropped to 0 the cgroup can be destroyed at any time, hence
+		 * rcu_read_lock is used to keep it alive.
+		 */
+		rcu_read_lock();
 		if (atomic_dec_and_test(&cgrp->count) &&
 		    notify_on_release(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
+		rcu_read_unlock();
 
 		kfree(link);
 	}
@@ -550,9 +554,9 @@ static struct css_set *find_existing_css_set(
 {
 	int i;
 	struct cgroupfs_root *root = cgrp->root;
-	struct hlist_head *hhead;
 	struct hlist_node *node;
 	struct css_set *cg;
+	unsigned long key;
 
 	/*
 	 * Build the set of subsystem state objects that we want to see in the
@@ -572,8 +576,8 @@
 		}
 	}
 
-	hhead = css_set_hash(template);
-	hlist_for_each_entry(cg, node, hhead, hlist) {
+	key = css_set_hash(template);
+	hash_for_each_possible(css_set_table, cg, node, hlist, key) {
 		if (!compare_css_sets(cg, oldcg, cgrp, template))
 			continue;
@@ -657,8 +661,8 @@ static struct css_set *find_css_set(
 	struct list_head tmp_cg_links;
 
-	struct hlist_head *hhead;
 	struct cg_cgroup_link *link;
+	unsigned long key;
 
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
@@ -704,8 +708,8 @@
 	css_set_count++;
 
 	/* Add this cgroup group to the hash table */
-	hhead = css_set_hash(res->subsys);
-	hlist_add_head(&res->hlist, hhead);
+	key = css_set_hash(res->subsys);
+	hash_add(css_set_table, &res->hlist, key);
 
 	write_unlock(&css_set_lock);
@@ -856,47 +860,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 	return inode;
 }
 
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static void cgroup_free_fn(struct work_struct *work)
 {
-	/* is dentry a directory ? if so, kfree() associated cgroup */
-	if (S_ISDIR(inode->i_mode)) {
-		struct cgroup *cgrp = dentry->d_fsdata;
-		struct cgroup_subsys *ss;
-		BUG_ON(!(cgroup_is_removed(cgrp)));
-		/* It's possible for external users to be holding css
-		 * reference counts on a cgroup; css_put() needs to
-		 * be able to access the cgroup after decrementing
-		 * the reference count in order to know if it needs to
-		 * queue the cgroup to be handled by the release
-		 * agent */
-		synchronize_rcu();
+	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+	struct cgroup_subsys *ss;
 
-		mutex_lock(&cgroup_mutex);
-		/*
-		 * Release the subsystem state objects.
-		 */
-		for_each_subsys(cgrp->root, ss)
-			ss->css_free(cgrp);
+	mutex_lock(&cgroup_mutex);
+	/*
+	 * Release the subsystem state objects.
+	 */
+	for_each_subsys(cgrp->root, ss)
+		ss->css_free(cgrp);
 
-		cgrp->root->number_of_cgroups--;
-		mutex_unlock(&cgroup_mutex);
+	cgrp->root->number_of_cgroups--;
+	mutex_unlock(&cgroup_mutex);
 
-		/*
-		 * Drop the active superblock reference that we took when we
-		 * created the cgroup
-		 */
-		deactivate_super(cgrp->root->sb);
+	/*
+	 * Drop the active superblock reference that we took when we
+	 * created the cgroup
+	 */
+	deactivate_super(cgrp->root->sb);
 
-		/*
-		 * if we're getting rid of the cgroup, refcount should ensure
-		 * that there are no pidlists left.
-		 */
-		BUG_ON(!list_empty(&cgrp->pidlists));
+	/*
+	 * if we're getting rid of the cgroup, refcount should ensure
+	 * that there are no pidlists left.
+	 */
+	BUG_ON(!list_empty(&cgrp->pidlists));
 
-		simple_xattrs_free(&cgrp->xattrs);
+	simple_xattrs_free(&cgrp->xattrs);
 
-		ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-		kfree_rcu(cgrp, rcu_head);
+	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+	kfree(cgrp);
+}
+
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+
+	schedule_work(&cgrp->free_work);
+}
+
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+	/* is dentry a directory ? if so, kfree() associated cgroup */
+	if (S_ISDIR(inode->i_mode)) {
+		struct cgroup *cgrp = dentry->d_fsdata;
+
+		BUG_ON(!(cgroup_is_removed(cgrp)));
+		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
 		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -925,13 +936,17 @@ static void remove_dir(struct dentry *d)
 	dput(parent);
 }
 
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 {
 	struct cfent *cfe;
 
 	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
+	/*
+	 * If we're doing cleanup due to failure of cgroup_create(),
+	 * the corresponding @cfe may not exist.
+	 */
 	list_for_each_entry(cfe, &cgrp->files, node) {
 		struct dentry *d = cfe->dentry;
@@ -944,9 +959,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 		list_del_init(&cfe->node);
 		dput(d);
-		return 0;
+		break;
 	}
-	return -ENOENT;
 }
@@ -1083,7 +1097,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		}
 	}
 	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
-	synchronize_rcu();
 
 	return 0;
 }
@@ -1393,6 +1406,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->allcg_node);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
+	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1611,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		struct cgroupfs_root *existing_root;
 		const struct cred *cred;
 		int i;
+		struct hlist_node *node;
+		struct css_set *cg;
 
 		BUG_ON(sb->s_root != NULL);
@@ -1650,14 +1666,8 @@
 		/* Link the top cgroup in this hierarchy into all
 		 * the css_set objects */
 		write_lock(&css_set_lock);
-		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
-			struct hlist_head *hhead = &css_set_table[i];
-			struct hlist_node *node;
-			struct css_set *cg;
-
-			hlist_for_each_entry(cg, node, hhead, hlist)
-				link_css_set(&tmp_cg_links, cg, root_cgrp);
-		}
+		hash_for_each(css_set_table, i, node, cg, hlist)
+			link_css_set(&tmp_cg_links, cg, root_cgrp);
 		write_unlock(&css_set_lock);
 
 		free_cg_links(&tmp_cg_links);
@@ -1773,7 +1783,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
 			   "cgroup_path() called without proper locking");
 
-	if (!dentry || cgrp == dummytop) {
+	if (cgrp == dummytop) {
 		/*
 		 * Inactive subsystems have no dentry for their root
 		 * cgroup
@@ -1982,7 +1992,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 			ss->attach(cgrp, &tset);
 	}
 
-	synchronize_rcu();
 out:
 	if (retval) {
 		for_each_subsys(root, ss) {
@@ -2151,7 +2160,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	/*
 	 * step 5: success! and cleanup
 	 */
-	synchronize_rcu();
 	retval = 0;
 out_put_css_set_refs:
 	if (retval) {
@@ -2769,14 +2777,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
 			continue;
 
-		if (is_add)
+		if (is_add) {
 			err = cgroup_add_file(cgrp, subsys, cft);
-		else
-			err = cgroup_rm_file(cgrp, cft);
-		if (err) {
-			pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
-				   is_add ? "add" : "remove", cft->name, err);
+			if (err)
+				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+					cft->name, err);
 			ret = err;
+		} else {
+			cgroup_rm_file(cgrp, cft);
 		}
 	}
 	return ret;
@@ -3017,6 +3025,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 }
 EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
 
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos.  If there's no descendant,
+ * @pos is returned.  This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+	struct cgroup *last, *tmp;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	do {
+		last = pos;
+		/* ->prev isn't RCU safe, walk ->next till the end */
+		pos = NULL;
+		list_for_each_entry_rcu(tmp, &last->children, sibling)
+			pos = tmp;
+	} while (pos);
+
+	return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
+
 static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
 {
 	struct cgroup *last;
@@ -3752,8 +3786,13 @@ static void cgroup_event_remove(struct work_struct *work)
 			remove);
 	struct cgroup *cgrp = event->cgrp;
 
+	remove_wait_queue(event->wqh, &event->wait);
+
 	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
 
+	/* Notify userspace the event is going away. */
+	eventfd_signal(event->eventfd, 1);
+
 	eventfd_ctx_put(event->eventfd);
 	kfree(event);
 	dput(cgrp->dentry);
@@ -3773,15 +3812,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
 	unsigned long flags = (unsigned long)key;
 
 	if (flags & POLLHUP) {
-		__remove_wait_queue(event->wqh, &event->wait);
-		spin_lock(&cgrp->event_list_lock);
-		list_del_init(&event->list);
-		spin_unlock(&cgrp->event_list_lock);
 		/*
-		 * We are in atomic context, but cgroup_event_remove() may
-		 * sleep, so we have to call it in workqueue.
+		 * If the event has been detached at cgroup removal, we
+		 * can simply return knowing the other side will cleanup
+		 * for us.
+		 *
+		 * We can't race against event freeing since the other
+		 * side will require wqh->lock via remove_wait_queue(),
+		 * which we hold.
 		 */
-		schedule_work(&event->remove);
+		spin_lock(&cgrp->event_list_lock);
+		if (!list_empty(&event->list)) {
+			list_del_init(&event->list);
+			/*
+			 * We are in atomic context, but cgroup_event_remove()
+			 * may sleep, so we have to call it in workqueue.
+			 */
+			schedule_work(&event->remove);
+		}
+		spin_unlock(&cgrp->event_list_lock);
 	}
 
 	return 0;
@@ -3807,6 +3856,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
 	struct cgroup_event *event = NULL;
+	struct cgroup *cgrp_cfile;
 	unsigned int efd, cfd;
 	struct file *efile = NULL;
 	struct file *cfile = NULL;
@@ -3862,6 +3912,16 @@
 		goto fail;
 	}
 
+	/*
+	 * The file to be monitored must be in the same cgroup as
+	 * cgroup.event_control is.
+	 */
+	cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+	if (cgrp_cfile != cgrp) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
 	if (!event->cft->register_event || !event->cft->unregister_event) {
 		ret = -EINVAL;
 		goto fail;
 	}
@@ -4135,6 +4195,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	init_cgroup_housekeeping(cgrp);
 
+	dentry->d_fsdata = cgrp;
+	cgrp->dentry = dentry;
+
 	cgrp->parent = parent;
 	cgrp->root = parent->root;
 	cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4235,6 @@
 	lockdep_assert_held(&dentry->d_inode->i_mutex);
 
 	/* allocation complete, commit to creation */
-	dentry->d_fsdata = cgrp;
-	cgrp->dentry = dentry;
 	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
 	root->number_of_cgroups++;
@@ -4340,20 +4401,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	/*
 	 * Unregister events and notify userspace.
 	 * Notify userspace about cgroup removing only after rmdir of cgroup
-	 * directory to avoid race between userspace and kernelspace. Use
-	 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
-	 * cgroup_event_wake() is called with the wait queue head locked,
-	 * remove_wait_queue() cannot be called while holding event_list_lock.
+	 * directory to avoid race between userspace and kernelspace.
 	 */
 	spin_lock(&cgrp->event_list_lock);
-	list_splice_init(&cgrp->event_list, &tmp_list);
-	spin_unlock(&cgrp->event_list_lock);
-	list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
 		list_del_init(&event->list);
-		remove_wait_queue(event->wqh, &event->wait);
-		eventfd_signal(event->eventfd, 1);
 		schedule_work(&event->remove);
 	}
+	spin_unlock(&cgrp->event_list_lock);
 
 	return 0;
 }
@@ -4438,6 +4493,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
 	int i, ret;
+	struct hlist_node *node, *tmp;
+	struct css_set *cg;
+	unsigned long key;
 
 	/* check name and function validity */
 	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4561,17 @@
 	 * this is all done under the css_set_lock.
 	 */
 	write_lock(&css_set_lock);
-	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
-		struct css_set *cg;
-		struct hlist_node *node, *tmp;
-		struct hlist_head *bucket = &css_set_table[i], *new_bucket;
-
-		hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
-			/* skip entries that we already rehashed */
-			if (cg->subsys[ss->subsys_id])
-				continue;
-			/* remove existing entry */
-			hlist_del(&cg->hlist);
-			/* set new value */
-			cg->subsys[ss->subsys_id] = css;
-			/* recompute hash and restore entry */
-			new_bucket = css_set_hash(cg->subsys);
-			hlist_add_head(&cg->hlist, new_bucket);
-		}
+	hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) {
+		/* skip entries that we already rehashed */
+		if (cg->subsys[ss->subsys_id])
+			continue;
+		/* remove existing entry */
+		hash_del(&cg->hlist);
+		/* set new value */
+		cg->subsys[ss->subsys_id] = css;
+		/* recompute hash and restore entry */
+		key = css_set_hash(cg->subsys);
+		hash_add(css_set_table, node, key);
 	}
 	write_unlock(&css_set_lock);
@@ -4551,7 +4603,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
 void cgroup_unload_subsys(struct cgroup_subsys *ss)
 {
 	struct cg_cgroup_link *link;
-	struct hlist_head *hhead;
 
 	BUG_ON(ss->module == NULL);
@@ -4585,11 +4636,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	write_lock(&css_set_lock);
 	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
 		struct css_set *cg = link->cg;
+		unsigned long key;
 
-		hlist_del(&cg->hlist);
+		hash_del(&cg->hlist);
 		cg->subsys[ss->subsys_id] = NULL;
-		hhead = css_set_hash(cg->subsys);
-		hlist_add_head(&cg->hlist, hhead);
+		key = css_set_hash(cg->subsys);
+		hash_add(css_set_table, &cg->hlist, key);
 	}
 	write_unlock(&css_set_lock);
@@ -4631,9 +4683,6 @@ int __init cgroup_init_early(void)
 	list_add(&init_css_set_link.cg_link_list,
 		 &init_css_set.cg_links);
 
-	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
-		INIT_HLIST_HEAD(&css_set_table[i]);
-
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
@@ -4667,7 +4716,7 @@ int __init cgroup_init(void)
 {
 	int err;
 	int i;
-	struct hlist_head *hhead;
+	unsigned long key;
 
 	err = bdi_init(&cgroup_backing_dev_info);
 	if (err)
@@ -4686,8 +4735,8 @@
 	}
 
 	/* Add init_css_set to the hash table */
-	hhead = css_set_hash(init_css_set.subsys);
-	hlist_add_head(&init_css_set.hlist, hhead);
+	key = css_set_hash(init_css_set.subsys);
+	hash_add(css_set_table, &init_css_set.hlist, key);
 
 	BUG_ON(!init_root_id(&rootnode));
 	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5031,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	}
 	task_unlock(tsk);
 
-	if (cg)
-		put_css_set_taskexit(cg);
+	put_css_set_taskexit(cg);
 }
 
 /**
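
The core of this diff is the conversion of cgroup.c's open-coded css_set bucket array to the generic <linux/hashtable.h> API: css_set_hash() now returns only the folded key, DEFINE_HASHTABLE() sizes and zero-initializes the buckets at build time (which is why the INIT_HLIST_HEAD() loop disappears from cgroup_init_early()), and hash_add()/hash_del()/hash_for_each*() hide the bucket arithmetic. Below is a minimal sketch of the same pattern; the item struct and helpers are hypothetical, not from kernel/cgroup.c, and the iterators use the explicit struct hlist_node cursor of this kernel vintage (later kernels dropped that parameter).

#include <linux/hashtable.h>

struct item {
	unsigned long key;
	struct hlist_node node;		/* links the item into its bucket */
};

/* 2^7 = 128 bucket heads, zero-initialized at compile time */
static DEFINE_HASHTABLE(items, 7);

static void item_insert(struct item *it)
{
	/* hash_add() folds the key down to the bucket count internally */
	hash_add(items, &it->node, it->key);
}

static struct item *item_lookup(unsigned long key)
{
	struct hlist_node *pos;
	struct item *it;

	/* walks only the bucket that the key hashes to */
	hash_for_each_possible(items, it, pos, node, key)
		if (it->key == key)	/* colliding keys share a bucket */
			return it;
	return NULL;
}

static void item_remove(struct item *it)
{
	hash_del(&it->node);	/* unlink only; freeing is the caller's job */
}

Note that hash_add() applies the final hash_long() folding internally, which is why the patch can delete the explicit hash_long() call and let css_set_hash() return a raw, well-mixed unsigned long key.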
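
The cgroup_diput() rewrite removes a synchronize_rcu() stall from every cgroup removal by switching to the asynchronous two-stage pattern: call_rcu() fires cgroup_free_rcu() after a grace period, but RCU callbacks run in softirq context where cgroup_free_fn()'s mutex_lock() and deactivate_super() are not allowed, so the callback merely queues the free_work item that init_cgroup_housekeeping() now initializes. A sketch of the same pattern, with a hypothetical struct widget standing in for struct cgroup:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct widget {
	struct rcu_head rcu_head;	/* stage 1: wait out RCU readers */
	struct work_struct free_work;	/* stage 2: sleepable cleanup */
};

static void widget_free_fn(struct work_struct *work)
{
	struct widget *w = container_of(work, struct widget, free_work);

	/* process context: taking mutexes etc. is fine here */
	kfree(w);
}

static void widget_free_rcu(struct rcu_head *head)
{
	struct widget *w = container_of(head, struct widget, rcu_head);

	/* softirq context: must not sleep, so punt to a workqueue */
	schedule_work(&w->free_work);
}

static struct widget *widget_create(void)
{
	struct widget *w = kzalloc(sizeof(*w), GFP_KERNEL);

	if (w)
		INIT_WORK(&w->free_work, widget_free_fn);
	return w;
}

static void widget_release(struct widget *w)
{
	/* no grace-period stall here; readers drain asynchronously */
	call_rcu(&w->rcu_head, widget_free_rcu);
}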
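
The new cgroup_rightmost_descendant() lets a pre-order walker skip an entire subtree: after the cursor is moved to the rightmost descendant, the next cgroup_next_descendant_pre() step lands on the subtree's next sibling. A sketch of that use, assuming the cgroup_for_each_descendant_pre() macro from the same kernel; should_skip() and visit() are hypothetical:

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

bool should_skip(struct cgroup *cgrp);	/* hypothetical predicate */
void visit(struct cgroup *cgrp);	/* hypothetical visitor */

static void walk_skipping_subtrees(struct cgroup *root)
{
	struct cgroup *pos;

	rcu_read_lock();
	cgroup_for_each_descendant_pre(pos, root) {
		if (should_skip(pos))
			/* the loop's next step jumps past pos's subtree */
			pos = cgroup_rightmost_descendant(pos);
		else
			visit(pos);
	}
	rcu_read_unlock();
}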
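
The eventfd hunks make event teardown a single-shot handoff. Both the POLLHUP wakeup and cgroup_destroy_locked() now unlink the event under event_list_lock before queueing event->remove, while cgroup_event_remove() itself performs the remove_wait_queue() and the final eventfd_signal(); whichever side still finds the event on the list claims it, so the work is queued exactly once and the old tmp_list splice (the workaround for the wqh->lock deadlock described in the deleted comment) becomes unnecessary. The claim step, written as a hypothetical helper against this file's struct cgroup_event:

static void cgroup_event_claim(struct cgroup *cgrp, struct cgroup_event *event)
{
	spin_lock(&cgrp->event_list_lock);
	if (!list_empty(&event->list)) {
		/* still linked: this caller owns the teardown */
		list_del_init(&event->list);
		/* atomic context is fine; the work item sleeps later */
		schedule_work(&event->remove);
	}
	spin_unlock(&cgrp->event_list_lock);
}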