Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c  435
1 file changed, 401 insertions, 34 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9edb5c4b79b..382109b5bae 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -94,7 +94,6 @@ struct cgroupfs_root {
 	char release_agent_path[PATH_MAX];
 };
-
 /*
  * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
  * subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +101,39 @@ struct cgroupfs_root {
  */
 static struct cgroupfs_root rootnode;
 
+/*
+ * CSS ID -- an ID per subsystem's Cgroup Subsys State (CSS). Used only when
+ * cgroup_subsys->use_id != 0.
+ */
+#define CSS_ID_MAX	(65535)
+struct css_id {
+	/*
+	 * The css to which this ID points. This pointer is set to a valid
+	 * value after the cgroup is populated. If the cgroup is removed,
+	 * this will be NULL. The pointer is expected to be RCU-safe because
+	 * destroy() is called after synchronize_rcu(). But for safe use,
+	 * css_is_removed() or css_tryget() should be used to avoid races.
+	 */
+	struct cgroup_subsys_state *css;
+	/*
+	 * ID of this css.
+	 */
+	unsigned short id;
+	/*
+	 * Depth in the hierarchy this ID belongs to.
+	 */
+	unsigned short depth;
+	/*
+	 * The ID is freed by RCU. (The lookup routine is RCU-safe.)
+	 */
+	struct rcu_head rcu_head;
+	/*
+	 * IDs of this css's ancestors, ending with its own ID.
+	 */
+	unsigned short stack[0]; /* Array of length (depth+1) */
+};
+
+
 /* The list of hierarchy roots */
 
 static LIST_HEAD(roots);
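The stack[0] member at the end of struct css_id is a zero-length (flexible) array: storage for the ID path is allocated together with the struct itself, which is what get_new_cssid() does later in this patch via sizeof(*newid) + sizeof(unsigned short) * (depth + 1). A minimal userspace sketch of the same allocation pattern (the demo_* names are illustrative, not part of the patch):

    #include <stdlib.h>

    struct demo_id {
    	unsigned short id;
    	unsigned short depth;
    	unsigned short stack[];	/* C99 flexible array member */
    };

    /* Allocate one record with room for (depth + 1) stack entries. */
    static struct demo_id *demo_id_alloc(unsigned short depth)
    {
    	struct demo_id *p;

    	p = calloc(1, sizeof(*p) + sizeof(unsigned short) * (depth + 1));
    	if (!p)
    		return NULL;
    	p->depth = depth;
    	return p;
    }

The flexible array keeps each ID to a single allocation while letting the path length vary with hierarchy depth.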
@@ -185,6 +217,8 @@ struct cg_cgroup_link {
 static struct css_set init_css_set;
 static struct cg_cgroup_link init_css_set_link;
 
+static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+
 /* css_set_lock protects the list of css_set objects, and the
  * chain of tasks off each css_set.  Nests outside task->alloc_lock
  * due to cgroup_iter_start() */
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
+static int alloc_css_id(struct cgroup_subsys *ss,
+			struct cgroup *parent, struct cgroup *child);
+
 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
@@ -585,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  * Call subsys's pre_destroy handler.
  * This is called before css refcnt check.
  */
-static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
+	int ret = 0;
+
 	for_each_subsys(cgrp->root, ss)
-		if (ss->pre_destroy)
-			ss->pre_destroy(ss, cgrp);
-	return;
+		if (ss->pre_destroy) {
+			ret = ss->pre_destroy(ss, cgrp);
+			if (ret)
+				break;
+		}
+	return ret;
 }
 
 static void free_cgroup_rcu(struct rcu_head *obj)
@@ -685,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 	remove_dir(dentry);
}
 
+/*
+ * A queue for waiters to rmdir() a cgroup. A task will sleep here when
+ * cgroup->count == 0 && list_empty(&cgroup->children) && some subsystem
+ * still holds a reference to css->refcnt. In general, this refcount is
+ * expected to go down to zero soon.
+ *
+ * The CGRP_WAIT_ON_RMDIR flag is modified under the cgroup's inode->i_mutex.
+ */
+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+{
+	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+		wake_up_all(&cgroup_rmdir_waitq);
+}
+
 static int rebind_subsystems(struct cgroupfs_root *root,
 			      unsigned long final_bits)
 {
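cgroup_wakeup_rmdir_waiters() is the wake-up half of a standard flag-plus-waitqueue handshake; the sleeping half appears in cgroup_rmdir() further down. The test_bit() keeps the common case (no waiter) to a single bit test. A compressed sketch of both halves, using a hypothetical demo_* flag and queue rather than the patch's exact state:

    #include <linux/wait.h>
    #include <linux/sched.h>
    #include <linux/bitops.h>

    static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);
    static unsigned long demo_flags;
    #define DEMO_WAITING	0

    /* Waker: cheap bit test keeps the no-waiter path light. */
    static void demo_wake(void)
    {
    	if (unlikely(test_bit(DEMO_WAITING, &demo_flags)))
    		wake_up_all(&demo_waitq);
    }

    /* Sleeper: announce the wait *before* the final condition check so a
     * concurrent demo_wake() cannot be missed; then sleep and retry. */
    static int demo_wait_until(bool (*cond)(void))
    {
    	DEFINE_WAIT(wait);
    	int ret = 0;

    	for (;;) {
    		set_bit(DEMO_WAITING, &demo_flags);
    		prepare_to_wait(&demo_waitq, &wait, TASK_INTERRUPTIBLE);
    		if (cond()) {
    			finish_wait(&demo_waitq, &wait);
    			clear_bit(DEMO_WAITING, &demo_flags);
    			break;
    		}
    		schedule();
    		finish_wait(&demo_waitq, &wait);
    		clear_bit(DEMO_WAITING, &demo_flags);
    		if (signal_pending(current)) {
    			ret = -EINTR;
    			break;
    		}
    	}
    	return ret;
    }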
@@ -857,16 +915,16 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	ret = rebind_subsystems(root, opts.subsys_bits);
+	if (ret)
+		goto out_unlock;
 
 	/* (re)populate subsystem files */
-	if (!ret)
-		cgroup_populate_dir(cgrp);
+	cgroup_populate_dir(cgrp);
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
  out_unlock:
-	if (opts.release_agent)
-		kfree(opts.release_agent);
+	kfree(opts.release_agent);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 	return ret;
@@ -969,15 +1027,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	/* First find the desired set of subsystems */
 	ret = parse_cgroupfs_options(data, &opts);
 	if (ret) {
-		if (opts.release_agent)
-			kfree(opts.release_agent);
+		kfree(opts.release_agent);
 		return ret;
 	}
 
 	root = kzalloc(sizeof(*root), GFP_KERNEL);
 	if (!root) {
-		if (opts.release_agent)
-			kfree(opts.release_agent);
+		kfree(opts.release_agent);
 		return -ENOMEM;
 	}
@@ -1071,7 +1127,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		mutex_unlock(&cgroup_mutex);
 	}
 
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 
  free_cg_links:
 	free_cg_links(&tmp_cg_links);
@@ -1279,6 +1336,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
 	synchronize_rcu();
 	put_css_set(cg);
+
+	/*
+	 * Wake up rmdir() waiters; the rmdir should fail since the cgroup
+	 * is no longer empty.
+	 */
+	cgroup_wakeup_rmdir_waiters(cgrp);
 	return 0;
 }
@@ -1624,10 +1687,10 @@ static struct inode_operations cgroup_dir_inode_operations = {
 	.rename = cgroup_rename,
 };
 
-static int cgroup_create_file(struct dentry *dentry, int mode,
+static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 				struct super_block *sb)
 {
-	static struct dentry_operations cgroup_dops = {
+	static const struct dentry_operations cgroup_dops = {
 		.d_iput = cgroup_diput,
 	};
@@ -1670,7 +1733,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
  * @mode: mode to set on new directory.
  */
 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
-				int mode)
+				mode_t mode)
 {
 	struct dentry *parent;
 	int error = 0;
@@ -1688,6 +1751,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
 	return error;
 }
 
+/**
+ * cgroup_file_mode - deduce the file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write handler
+ */
+static mode_t cgroup_file_mode(const struct cftype *cft)
+{
+	mode_t mode = 0;
+
+	if (cft->mode)
+		return cft->mode;
+
+	if (cft->read || cft->read_u64 || cft->read_s64 ||
+	    cft->read_map || cft->read_seq_string)
+		mode |= S_IRUGO;
+
+	if (cft->write || cft->write_u64 || cft->write_s64 ||
+	    cft->write_string || cft->trigger)
+		mode |= S_IWUSR;
+
+	return mode;
+}
+
 int cgroup_add_file(struct cgroup *cgrp,
 		       struct cgroup_subsys *subsys,
 		       const struct cftype *cft)
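With cgroup_file_mode() in place, a cftype that supplies only read handlers is created 0444, one with only write handlers 0200, and one with both 0644, replacing the previous blanket 0644. A small userspace illustration of the deduction (mock types; the real function checks the full set of read_*/write_*/trigger handlers):

    #include <assert.h>
    #include <sys/types.h>
    #include <sys/stat.h>

    /* Mock of the handler fields cgroup_file_mode() inspects. */
    struct mock_cftype {
    	mode_t mode;
    	void *read, *write;	/* stand-ins for the real handler pointers */
    };

    static mode_t mock_file_mode(const struct mock_cftype *cft)
    {
    	mode_t mode = 0;

    	if (cft->mode)
    		return cft->mode;	/* explicit ->mode wins */
    	if (cft->read)
    		mode |= S_IRUGO;	/* readable by all: 0444 */
    	if (cft->write)
    		mode |= S_IWUSR;	/* writable by owner: 0200 */
    	return mode;
    }

    int main(void)
    {
    	struct mock_cftype ro = { .read = (void *)1 };
    	struct mock_cftype wo = { .write = (void *)1 };
    	struct mock_cftype rw = { .read = (void *)1, .write = (void *)1 };

    	assert(mock_file_mode(&ro) == 0444);
    	assert(mock_file_mode(&wo) == 0200);
    	assert(mock_file_mode(&rw) == 0644);
    	return 0;
    }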
@@ -1695,6 +1785,7 @@ int cgroup_add_file(struct cgroup *cgrp,
 	struct dentry *dir = cgrp->dentry;
 	struct dentry *dentry;
 	int error;
+	mode_t mode;
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1705,7 +1796,8 @@ int cgroup_add_file(struct cgroup *cgrp,
 	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
 	dentry = lookup_one_len(name, dir, strlen(name));
 	if (!IS_ERR(dentry)) {
-		error = cgroup_create_file(dentry, 0644 | S_IFREG,
+		mode = cgroup_file_mode(cft);
+		error = cgroup_create_file(dentry, mode | S_IFREG,
 						cgrp->root->sb);
 		if (!error)
 			dentry->d_fsdata = (void *)cft;
@@ -2287,6 +2379,7 @@ static struct cftype files[] = {
 		.write_u64 = cgroup_tasks_write,
 		.release = cgroup_tasks_release,
 		.private = FILE_TASKLIST,
+		.mode = S_IRUGO | S_IWUSR,
 	},
 
 	{
@@ -2326,6 +2419,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
 			return err;
 	}
+	/* This cgroup is ready now */
+	for_each_subsys(cgrp->root, ss) {
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+		/*
+		 * Update the id->css pointer and make this css visible from
+		 * the CSS ID functions. This pointer will be dereferenced
+		 * from the RCU read side without locks.
+		 */
+		if (css->id)
+			rcu_assign_pointer(css->id->css, css);
+	}
 	return 0;
 }
@@ -2337,6 +2441,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 	css->cgroup = cgrp;
 	atomic_set(&css->refcnt, 1);
 	css->flags = 0;
+	css->id = NULL;
 	if (cgrp == dummytop)
 		set_bit(CSS_ROOT, &css->flags);
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2375,7 +2480,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  * Must be called with the mutex on the parent inode held
 */
 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
-			     int mode)
+			     mode_t mode)
 {
 	struct cgroup *cgrp;
 	struct cgroupfs_root *root = parent->root;
@@ -2412,6 +2517,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			goto err_destroy;
 		}
 		init_cgroup_css(css, ss, cgrp);
+		if (ss->use_id)
+			if (alloc_css_id(ss, parent, cgrp))
+				goto err_destroy;
+		/* On error, the ->destroy() callback has to free the assigned ID. */
 	}
 
 	cgroup_lock_hierarchy(root);
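The rcu_assign_pointer() in cgroup_populate_dir() is the publish half of an RCU publish/subscribe pair: it orders the css setup before the pointer store, and the readers added at the end of this patch (css_id(), css_lookup()) pair it with rcu_dereference() under rcu_read_lock(). A minimal sketch of the pairing, using a hypothetical demo_ptr global rather than the patch's structures:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct demo_state {
    	int value;
    };

    static struct demo_state *demo_ptr;

    /* Publisher: initialize fully, then publish with rcu_assign_pointer(). */
    static int demo_publish(int value)
    {
    	struct demo_state *s = kzalloc(sizeof(*s), GFP_KERNEL);

    	if (!s)
    		return -ENOMEM;
    	s->value = value;
    	rcu_assign_pointer(demo_ptr, s);	/* orders init before publish */
    	return 0;
    }

    /* Reader: rcu_dereference() under rcu_read_lock(); may observe NULL. */
    static int demo_read(void)
    {
    	struct demo_state *s;
    	int v = -1;

    	rcu_read_lock();
    	s = rcu_dereference(demo_ptr);
    	if (s)
    		v = s->value;
    	rcu_read_unlock();
    	return v;
    }

This is why init_cgroup_css() must set css->id = NULL first: a reader that wins the race simply sees NULL instead of a half-built ID.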
@@ -2554,9 +2663,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup *cgrp = dentry->d_fsdata;
 	struct dentry *d;
 	struct cgroup *parent;
+	DEFINE_WAIT(wait);
+	int ret;
 
 	/* the vfs holds both inode->i_mutex already */
-
+again:
 	mutex_lock(&cgroup_mutex);
 	if (atomic_read(&cgrp->count) != 0) {
 		mutex_unlock(&cgroup_mutex);
@@ -2572,17 +2683,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	 * Call pre_destroy handlers of subsys. Notify subsystems
 	 * that rmdir() request comes.
 	 */
-	cgroup_call_pre_destroy(cgrp);
+	ret = cgroup_call_pre_destroy(cgrp);
+	if (ret)
+		return ret;
 
 	mutex_lock(&cgroup_mutex);
 	parent = cgrp->parent;
-
-	if (atomic_read(&cgrp->count)
-	    || !list_empty(&cgrp->children)
-	    || !cgroup_clear_css_refs(cgrp)) {
+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
+	/*
+	 * css_put/get is provided for subsystems to grab a refcnt on a css.
+	 * In the typical case, a subsystem holds no reference after
+	 * pre_destroy(). But, under hierarchy management, some *temporary*
+	 * refcnts can still be held. To avoid returning -EBUSY to the user,
+	 * a waitqueue is used. If the subsystem is really busy, it should
+	 * return -EBUSY at pre_destroy(). wake_up is called when css_put()
+	 * drops the refcnt to 0.
+	 */
+	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
+
+	if (!cgroup_clear_css_refs(cgrp)) {
+		mutex_unlock(&cgroup_mutex);
+		schedule();
+		finish_wait(&cgroup_rmdir_waitq, &wait);
+		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+		if (signal_pending(current))
+			return -EINTR;
+		goto again;
+	}
+	/* No css_tryget() can succeed after this point. */
+	finish_wait(&cgroup_rmdir_waitq, &wait);
+	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 
 	spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -2707,6 +2840,8 @@ int __init cgroup_init(void)
 		struct cgroup_subsys *ss = subsys[i];
 		if (!ss->early_init)
 			cgroup_init_subsys(ss);
+		if (ss->use_id)
+			cgroup_subsys_init_idr(ss);
 	}
 
 	/* Add init_css_set to the hash table */
@@ -3083,18 +3218,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 }
 
 /**
- * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
+ * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
  * @cgrp: the cgroup in question
+ * @task: the task in question
  *
- * See if @cgrp is a descendant of the current task's cgroup in
- * the appropriate hierarchy.
+ * See if @cgrp is a descendant of @task's cgroup in the appropriate
+ * hierarchy.
  *
 * If we are sending in dummytop, then presumably we are creating
 * the top cgroup in the subsystem.
 *
 * Called only by the ns (nsproxy) cgroup.
 */
-int cgroup_is_descendant(const struct cgroup *cgrp)
+int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
 {
 	int ret;
 	struct cgroup *target;
@@ -3104,7 +3240,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
 		return 1;
 
 	get_first_subsys(cgrp, NULL, &subsys_id);
-	target = task_cgroup(current, subsys_id);
+	target = task_cgroup(task, subsys_id);
 	while (cgrp != target && cgrp!= cgrp->top_cgroup)
 		cgrp = cgrp->parent;
 	ret = (cgrp == target);
@@ -3137,10 +3273,12 @@ void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
 	rcu_read_lock();
-	if ((atomic_dec_return(&css->refcnt) == 1) &&
-	    notify_on_release(cgrp)) {
-		set_bit(CGRP_RELEASABLE, &cgrp->flags);
-		check_for_release(cgrp);
+	if (atomic_dec_return(&css->refcnt) == 1) {
+		if (notify_on_release(cgrp)) {
+			set_bit(CGRP_RELEASABLE, &cgrp->flags);
+			check_for_release(cgrp);
+		}
+		cgroup_wakeup_rmdir_waiters(cgrp);
 	}
 	rcu_read_unlock();
 }
@@ -3240,3 +3378,232 @@ static int __init cgroup_disable(char *str)
 	return 1;
 }
 __setup("cgroup_disable=", cgroup_disable);
+
+/*
+ * Functions for CSS ID.
+ */
+
+/*
+ * To get an ID other than 0, this should be called when
+ * !cgroup_is_removed().
+ */
+unsigned short css_id(struct cgroup_subsys_state *css)
+{
+	struct css_id *cssid = rcu_dereference(css->id);
+
+	if (cssid)
+		return cssid->id;
+	return 0;
+}
+
+unsigned short css_depth(struct cgroup_subsys_state *css)
+{
+	struct css_id *cssid = rcu_dereference(css->id);
+
+	if (cssid)
+		return cssid->depth;
+	return 0;
+}
+
+bool css_is_ancestor(struct cgroup_subsys_state *child,
+		    const struct cgroup_subsys_state *root)
+{
+	struct css_id *child_id = rcu_dereference(child->id);
+	struct css_id *root_id = rcu_dereference(root->id);
+
+	if (!child_id || !root_id || (child_id->depth < root_id->depth))
+		return false;
+	return child_id->stack[root_id->depth] == root_id->id;
+}
+
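css_is_ancestor() turns the ancestry question into a single array lookup: every css records the IDs on its path from the root, so @root is an ancestor of @child exactly when @child's path carries root's ID at root's depth. A self-contained illustration with made-up IDs:

    #include <assert.h>
    #include <stdbool.h>

    /* Path of IDs from the hierarchy root down to a css, as in css_id->stack. */
    struct demo_css_id {
    	unsigned short id;
    	unsigned short depth;
    	unsigned short stack[4];
    };

    static bool demo_is_ancestor(const struct demo_css_id *child,
    			     const struct demo_css_id *root)
    {
    	if (child->depth < root->depth)
    		return false;
    	return child->stack[root->depth] == root->id;
    }

    int main(void)
    {
    	/* root(id 1) -> mid(id 7) -> leaf(id 12); other(id 9) is a sibling */
    	struct demo_css_id root  = { .id = 1,  .depth = 0, .stack = { 1 } };
    	struct demo_css_id mid   = { .id = 7,  .depth = 1, .stack = { 1, 7 } };
    	struct demo_css_id leaf  = { .id = 12, .depth = 2, .stack = { 1, 7, 12 } };
    	struct demo_css_id other = { .id = 9,  .depth = 1, .stack = { 1, 9 } };

    	assert(demo_is_ancestor(&leaf, &root));	/* id 1 is on leaf's path */
    	assert(demo_is_ancestor(&leaf, &mid));	/* id 7 at depth 1 matches */
    	assert(!demo_is_ancestor(&leaf, &other));	/* 9 != 7 at depth 1 */
    	return 0;
    }

The check is O(1) regardless of hierarchy depth, which is the point of storing the path instead of walking parent pointers.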
+static void __free_css_id_cb(struct rcu_head *head)
+{
+	struct css_id *id;
+
+	id = container_of(head, struct css_id, rcu_head);
+	kfree(id);
+}
+
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
+{
+	struct css_id *id = css->id;
+
+	/* When this is called before css_id initialization, id can be NULL. */
+	if (!id)
+		return;
+
+	BUG_ON(!ss->use_id);
+
+	rcu_assign_pointer(id->css, NULL);
+	rcu_assign_pointer(css->id, NULL);
+	spin_lock(&ss->id_lock);
+	idr_remove(&ss->idr, id->id);
+	spin_unlock(&ss->id_lock);
+	call_rcu(&id->rcu_head, __free_css_id_cb);
+}
+
+/*
+ * This is called by init or create(). Calls to this function are therefore
+ * always serialized (by cgroup_mutex at create()).
+ */
+static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
+{
+	struct css_id *newid;
+	int myid, error, size;
+
+	BUG_ON(!ss->use_id);
+
+	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
+	newid = kzalloc(size, GFP_KERNEL);
+	if (!newid)
+		return ERR_PTR(-ENOMEM);
+	/* get id */
+	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
+		error = -ENOMEM;
+		goto err_out;
+	}
+	spin_lock(&ss->id_lock);
+	/* Don't use 0; allocate an ID in the range 1-65535. */
+	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+	spin_unlock(&ss->id_lock);
+
+	/* Returns an error when there is no free space for a new ID. */
+	if (error) {
+		error = -ENOSPC;
+		goto err_out;
+	}
+	if (myid > CSS_ID_MAX)
+		goto remove_idr;
+
+	newid->id = myid;
+	newid->depth = depth;
+	return newid;
+remove_idr:
+	error = -ENOSPC;
+	spin_lock(&ss->id_lock);
+	idr_remove(&ss->idr, myid);
+	spin_unlock(&ss->id_lock);
+err_out:
+	kfree(newid);
+	return ERR_PTR(error);
+}
+
+static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
+{
+	struct css_id *newid;
+	struct cgroup_subsys_state *rootcss;
+
+	spin_lock_init(&ss->id_lock);
+	idr_init(&ss->idr);
+
+	rootcss = init_css_set.subsys[ss->subsys_id];
+	newid = get_new_cssid(ss, 0);
+	if (IS_ERR(newid))
+		return PTR_ERR(newid);
+
+	newid->stack[0] = newid->id;
+	newid->css = rootcss;
+	rootcss->id = newid;
+	return 0;
+}
+
+static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
+			struct cgroup *child)
+{
+	int subsys_id, i, depth = 0;
+	struct cgroup_subsys_state *parent_css, *child_css;
+	struct css_id *child_id, *parent_id = NULL;
+
+	subsys_id = ss->subsys_id;
+	parent_css = parent->subsys[subsys_id];
+	child_css = child->subsys[subsys_id];
+	depth = css_depth(parent_css) + 1;
+	parent_id = parent_css->id;
+
+	child_id = get_new_cssid(ss, depth);
+	if (IS_ERR(child_id))
+		return PTR_ERR(child_id);
+
+	for (i = 0; i < depth; i++)
+		child_id->stack[i] = parent_id->stack[i];
+	child_id->stack[depth] = child_id->id;
+	/*
+	 * The child_id->css pointer will be set after this cgroup is
+	 * available; see cgroup_populate_dir().
+	 */
+	rcu_assign_pointer(child_css->id, child_id);
+
+	return 0;
+}
+
+/**
+ * css_lookup - look up a css by id
+ * @ss: cgroup subsys to be looked into.
+ * @id: the id
+ *
+ * Returns a pointer to the cgroup_subsys_state if there is a valid one
+ * with the given id, NULL if not. Should be called under rcu_read_lock().
+ */
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
+{
+	struct css_id *cssid = NULL;
+
+	BUG_ON(!ss->use_id);
+	cssid = idr_find(&ss->idr, id);
+
+	if (unlikely(!cssid))
+		return NULL;
+
+	return rcu_dereference(cssid->css);
+}
+
+/**
+ * css_get_next - look up the next cgroup under the specified hierarchy
+ * @ss: pointer to subsystem
+ * @id: current position of the iteration.
+ * @root: pointer to a css; search the tree under this.
+ * @foundid: position of the found object.
+ *
+ * Search for the next css under the hierarchy rooted at @root. Must be
+ * called under rcu_read_lock(). Returns NULL when the end is reached.
+ */
+struct cgroup_subsys_state *
+css_get_next(struct cgroup_subsys *ss, int id,
+	     struct cgroup_subsys_state *root, int *foundid)
+{
+	struct cgroup_subsys_state *ret = NULL;
+	struct css_id *tmp;
+	int tmpid;
+	int rootid = css_id(root);
+	int depth = css_depth(root);
+
+	if (!rootid)
+		return NULL;
+
+	BUG_ON(!ss->use_id);
+	/* fill the start point for the scan */
+	tmpid = id;
+	while (1) {
+		/*
+		 * Scan the next entry from the bitmap (tree); tmpid is
+		 * updated after idr_get_next().
+		 */
+		spin_lock(&ss->id_lock);
+		tmp = idr_get_next(&ss->idr, &tmpid);
+		spin_unlock(&ss->id_lock);
+
+		if (!tmp)
+			break;
+		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
+			ret = rcu_dereference(tmp->css);
+			if (ret) {
+				*foundid = tmpid;
+				break;
+			}
+		}
+		/* continue the scan from the next id */
+		tmpid = tmpid + 1;
+	}
+	return ret;
+}
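css_get_next() is built for restartable scans: the caller resumes from the last found position plus one, holding rcu_read_lock() across the walk, and entries whose css pointer is still NULL (not yet populated, or being removed) are skipped rather than returned. A sketch of the calling convention implied by the interface (the loop shape is inferred from it; demo_walk_hierarchy is hypothetical, not code from this patch):

    /* Visit every live css under @root for subsystem @ss. */
    static void demo_walk_hierarchy(struct cgroup_subsys *ss,
    				struct cgroup_subsys_state *root)
    {
    	struct cgroup_subsys_state *css;
    	int nextid = 1;		/* valid CSS IDs start at 1 */
    	int found;

    	rcu_read_lock();
    	while ((css = css_get_next(ss, nextid, root, &found)) != NULL) {
    		/* ... use css here; take css_tryget() if it must outlive
    		 * the RCU read-side critical section ... */
    		nextid = found + 1;	/* resume the scan after the hit */
    	}
    	rcu_read_unlock();
    }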