Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c  587
1 file changed, 422 insertions, 165 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 909a35510af..2731d115d72 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,6 +57,7 @@  #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */  #include <linux/eventfd.h>  #include <linux/poll.h> +#include <linux/flex_array.h> /* used in cgroup_attach_proc */  #include <asm/atomic.h> @@ -1735,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)  }  EXPORT_SYMBOL_GPL(cgroup_path); +/* + * cgroup_task_migrate - move a task from one cgroup to another. + * + * 'guarantee' is set if the caller promises that a new css_set for the task + * will already exist. If not set, this function might sleep, and can fail with + * -ENOMEM. Otherwise, it can only fail with -ESRCH. + */ +static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, +			       struct task_struct *tsk, bool guarantee) +{ +	struct css_set *oldcg; +	struct css_set *newcg; + +	/* +	 * get old css_set. we need to take task_lock and refcount it, because +	 * an exiting task can change its css_set to init_css_set and drop its +	 * old one without taking cgroup_mutex. +	 */ +	task_lock(tsk); +	oldcg = tsk->cgroups; +	get_css_set(oldcg); +	task_unlock(tsk); + +	/* locate or allocate a new css_set for this task. */ +	if (guarantee) { +		/* we know the css_set we want already exists. */ +		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; +		read_lock(&css_set_lock); +		newcg = find_existing_css_set(oldcg, cgrp, template); +		BUG_ON(!newcg); +		get_css_set(newcg); +		read_unlock(&css_set_lock); +	} else { +		might_sleep(); +		/* find_css_set will give us newcg already referenced. */ +		newcg = find_css_set(oldcg, cgrp); +		if (!newcg) { +			put_css_set(oldcg); +			return -ENOMEM; +		} +	} +	put_css_set(oldcg); + +	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ +	task_lock(tsk); +	if (tsk->flags & PF_EXITING) { +		task_unlock(tsk); +		put_css_set(newcg); +		return -ESRCH; +	} +	rcu_assign_pointer(tsk->cgroups, newcg); +	task_unlock(tsk); + +	/* Update the css_set linked lists if we're using them */ +	write_lock(&css_set_lock); +	if (!list_empty(&tsk->cg_list)) +		list_move(&tsk->cg_list, &newcg->tasks); +	write_unlock(&css_set_lock); + +	/* +	 * We just gained a reference on oldcg by taking it from the task. As +	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop +	 * it here; it will be freed under RCU. 
+	 */ +	put_css_set(oldcg); + +	set_bit(CGRP_RELEASABLE, &oldcgrp->flags); +	return 0; +} +  /**   * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'   * @cgrp: the cgroup the task is attaching to @@ -1745,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);   */  int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  { -	int retval = 0; +	int retval;  	struct cgroup_subsys *ss, *failed_ss = NULL;  	struct cgroup *oldcgrp; -	struct css_set *cg; -	struct css_set *newcg;  	struct cgroupfs_root *root = cgrp->root;  	/* Nothing to do if the task is already in that cgroup */ @@ -1759,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  	for_each_subsys(root, ss) {  		if (ss->can_attach) { -			retval = ss->can_attach(ss, cgrp, tsk, false); +			retval = ss->can_attach(ss, cgrp, tsk);  			if (retval) {  				/*  				 * Remember on which subsystem the can_attach() @@ -1771,46 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  				goto out;  			}  		} +		if (ss->can_attach_task) { +			retval = ss->can_attach_task(cgrp, tsk); +			if (retval) { +				failed_ss = ss; +				goto out; +			} +		}  	} -	task_lock(tsk); -	cg = tsk->cgroups; -	get_css_set(cg); -	task_unlock(tsk); -	/* -	 * Locate or allocate a new css_set for this task, -	 * based on its final set of cgroups -	 */ -	newcg = find_css_set(cg, cgrp); -	put_css_set(cg); -	if (!newcg) { -		retval = -ENOMEM; -		goto out; -	} - -	task_lock(tsk); -	if (tsk->flags & PF_EXITING) { -		task_unlock(tsk); -		put_css_set(newcg); -		retval = -ESRCH; +	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); +	if (retval)  		goto out; -	} -	rcu_assign_pointer(tsk->cgroups, newcg); -	task_unlock(tsk); - -	/* Update the css_set linked lists if we're using them */ -	write_lock(&css_set_lock); -	if (!list_empty(&tsk->cg_list)) -		list_move(&tsk->cg_list, &newcg->tasks); -	write_unlock(&css_set_lock);  	for_each_subsys(root, ss) { +		if (ss->pre_attach) +			ss->pre_attach(cgrp); +		if (ss->attach_task) +			ss->attach_task(cgrp, tsk);  		if (ss->attach) -			ss->attach(ss, cgrp, oldcgrp, tsk, false); +			ss->attach(ss, cgrp, oldcgrp, tsk);  	} -	set_bit(CGRP_RELEASABLE, &oldcgrp->flags); +  	synchronize_rcu(); -	put_css_set(cg);  	/*  	 * wake up rmdir() waiter. the rmdir should fail since the cgroup @@ -1829,7 +1881,7 @@ out:  				 */  				break;  			if (ss->cancel_attach) -				ss->cancel_attach(ss, cgrp, tsk, false); +				ss->cancel_attach(ss, cgrp, tsk);  		}  	}  	return retval; @@ -1860,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)  EXPORT_SYMBOL_GPL(cgroup_attach_task_all);  /* - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex - * held. May take task_lock of task + * cgroup_attach_proc works in two stages, the first of which prefetches all + * new css_sets needed (to make sure we have enough memory before committing + * to the move) and stores them in a list of entries of the following type. 
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead   */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) +struct cg_list_entry { +	struct css_set *cg; +	struct list_head links; +}; + +static bool css_set_check_fetched(struct cgroup *cgrp, +				  struct task_struct *tsk, struct css_set *cg, +				  struct list_head *newcg_list) +{ +	struct css_set *newcg; +	struct cg_list_entry *cg_entry; +	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + +	read_lock(&css_set_lock); +	newcg = find_existing_css_set(cg, cgrp, template); +	if (newcg) +		get_css_set(newcg); +	read_unlock(&css_set_lock); + +	/* doesn't exist at all? */ +	if (!newcg) +		return false; +	/* see if it's already in the list */ +	list_for_each_entry(cg_entry, newcg_list, links) { +		if (cg_entry->cg == newcg) { +			put_css_set(newcg); +			return true; +		} +	} + +	/* not found */ +	put_css_set(newcg); +	return false; +} + +/* + * Find the new css_set and store it in the list in preparation for moving the + * given task to the given cgroup. Returns 0 or -ENOMEM. + */ +static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, +			    struct list_head *newcg_list) +{ +	struct css_set *newcg; +	struct cg_list_entry *cg_entry; + +	/* ensure a new css_set will exist for this thread */ +	newcg = find_css_set(cg, cgrp); +	if (!newcg) +		return -ENOMEM; +	/* add it to the list */ +	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); +	if (!cg_entry) { +		put_css_set(newcg); +		return -ENOMEM; +	} +	cg_entry->cg = newcg; +	list_add(&cg_entry->links, newcg_list); +	return 0; +} + +/** + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * @cgrp: the cgroup to attach to + * @leader: the threadgroup leader task_struct of the group to be attached + * + * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will + * take task_lock of each thread in leader's threadgroup individually in turn. + */ +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +{ +	int retval, i, group_size; +	struct cgroup_subsys *ss, *failed_ss = NULL; +	bool cancel_failed_ss = false; +	/* guaranteed to be initialized later, but the compiler needs this */ +	struct cgroup *oldcgrp = NULL; +	struct css_set *oldcg; +	struct cgroupfs_root *root = cgrp->root; +	/* threadgroup list cursor and array */ +	struct task_struct *tsk; +	struct flex_array *group; +	/* +	 * we need to make sure we have css_sets for all the tasks we're +	 * going to move -before- we actually start moving them, so that in +	 * case we get an ENOMEM we can bail out before making any changes. +	 */ +	struct list_head newcg_list; +	struct cg_list_entry *cg_entry, *temp_nobe; + +	/* +	 * step 0: in order to do expensive, possibly blocking operations for +	 * every thread, we cannot iterate the thread group list, since it needs +	 * rcu or tasklist locked. instead, build an array of all threads in the +	 * group - threadgroup_fork_lock prevents new threads from appearing, +	 * and if threads exit, this will just be an over-estimate. +	 */ +	group_size = get_nr_threads(leader); +	/* flex_array supports very large thread-groups better than kmalloc. */ +	group = flex_array_alloc(sizeof(struct task_struct *), group_size, +				 GFP_KERNEL); +	if (!group) +		return -ENOMEM; +	/* pre-allocate to guarantee space while iterating in rcu read-side. 
*/ +	retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); +	if (retval) +		goto out_free_group_list; + +	/* prevent changes to the threadgroup list while we take a snapshot. */ +	rcu_read_lock(); +	if (!thread_group_leader(leader)) { +		/* +		 * a race with de_thread from another thread's exec() may strip +		 * us of our leadership, making while_each_thread unsafe to use +		 * on this task. if this happens, there is no choice but to +		 * throw this task away and try again (from cgroup_procs_write); +		 * this is "double-double-toil-and-trouble-check locking". +		 */ +		rcu_read_unlock(); +		retval = -EAGAIN; +		goto out_free_group_list; +	} +	/* take a reference on each task in the group to go in the array. */ +	tsk = leader; +	i = 0; +	do { +		/* as per above, nr_threads may decrease, but not increase. */ +		BUG_ON(i >= group_size); +		get_task_struct(tsk); +		/* +		 * saying GFP_ATOMIC has no effect here because we did prealloc +		 * earlier, but it's good form to communicate our expectations. +		 */ +		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); +		BUG_ON(retval != 0); +		i++; +	} while_each_thread(leader, tsk); +	/* remember the number of threads in the array for later. */ +	group_size = i; +	rcu_read_unlock(); + +	/* +	 * step 1: check that we can legitimately attach to the cgroup. +	 */ +	for_each_subsys(root, ss) { +		if (ss->can_attach) { +			retval = ss->can_attach(ss, cgrp, leader); +			if (retval) { +				failed_ss = ss; +				goto out_cancel_attach; +			} +		} +		/* a callback to be run on every thread in the threadgroup. */ +		if (ss->can_attach_task) { +			/* run on each task in the threadgroup. */ +			for (i = 0; i < group_size; i++) { +				tsk = flex_array_get_ptr(group, i); +				retval = ss->can_attach_task(cgrp, tsk); +				if (retval) { +					failed_ss = ss; +					cancel_failed_ss = true; +					goto out_cancel_attach; +				} +			} +		} +	} + +	/* +	 * step 2: make sure css_sets exist for all threads to be migrated. +	 * we use find_css_set, which allocates a new one if necessary. +	 */ +	INIT_LIST_HEAD(&newcg_list); +	for (i = 0; i < group_size; i++) { +		tsk = flex_array_get_ptr(group, i); +		/* nothing to do if this task is already in the cgroup */ +		oldcgrp = task_cgroup_from_root(tsk, root); +		if (cgrp == oldcgrp) +			continue; +		/* get old css_set pointer */ +		task_lock(tsk); +		if (tsk->flags & PF_EXITING) { +			/* ignore this task if it's going away */ +			task_unlock(tsk); +			continue; +		} +		oldcg = tsk->cgroups; +		get_css_set(oldcg); +		task_unlock(tsk); +		/* see if the new one for us is already in the list? */ +		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { +			/* was already there, nothing to do. */ +			put_css_set(oldcg); +		} else { +			/* we don't already have it. get new one. */ +			retval = css_set_prefetch(cgrp, oldcg, &newcg_list); +			put_css_set(oldcg); +			if (retval) +				goto out_list_teardown; +		} +	} + +	/* +	 * step 3: now that we're guaranteed success wrt the css_sets, proceed +	 * to move all tasks to the new cgroup, calling ss->attach_task for each +	 * one along the way. there are no failure cases after here, so this is +	 * the commit point. 
+	 */ +	for_each_subsys(root, ss) { +		if (ss->pre_attach) +			ss->pre_attach(cgrp); +	} +	for (i = 0; i < group_size; i++) { +		tsk = flex_array_get_ptr(group, i); +		/* leave current thread as it is if it's already there */ +		oldcgrp = task_cgroup_from_root(tsk, root); +		if (cgrp == oldcgrp) +			continue; +		/* attach each task to each subsystem */ +		for_each_subsys(root, ss) { +			if (ss->attach_task) +				ss->attach_task(cgrp, tsk); +		} +		/* if the thread is PF_EXITING, it can just get skipped. */ +		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); +		BUG_ON(retval != 0 && retval != -ESRCH); +	} +	/* nothing is sensitive to fork() after this point. */ + +	/* +	 * step 4: do expensive, non-thread-specific subsystem callbacks. +	 * TODO: if ever a subsystem needs to know the oldcgrp for each task +	 * being moved, this call will need to be reworked to communicate that. +	 */ +	for_each_subsys(root, ss) { +		if (ss->attach) +			ss->attach(ss, cgrp, oldcgrp, leader); +	} + +	/* +	 * step 5: success! and cleanup +	 */ +	synchronize_rcu(); +	cgroup_wakeup_rmdir_waiter(cgrp); +	retval = 0; +out_list_teardown: +	/* clean up the list of prefetched css_sets. */ +	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { +		list_del(&cg_entry->links); +		put_css_set(cg_entry->cg); +		kfree(cg_entry); +	} +out_cancel_attach: +	/* same deal as in cgroup_attach_task */ +	if (retval) { +		for_each_subsys(root, ss) { +			if (ss == failed_ss) { +				if (cancel_failed_ss && ss->cancel_attach) +					ss->cancel_attach(ss, cgrp, leader); +				break; +			} +			if (ss->cancel_attach) +				ss->cancel_attach(ss, cgrp, leader); +		} +	} +	/* clean up the array of referenced threads in the group. */ +	for (i = 0; i < group_size; i++) { +		tsk = flex_array_get_ptr(group, i); +		put_task_struct(tsk); +	} +out_free_group_list: +	flex_array_free(group); +	return retval; +} + +/* + * Find the task_struct of the task to attach by vpid and pass it along to the + * function to attach either it or all tasks in its threadgroup. Will take + * cgroup_mutex; may take task_lock of task. + */ +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)  {  	struct task_struct *tsk;  	const struct cred *cred = current_cred(), *tcred;  	int ret; +	if (!cgroup_lock_live_group(cgrp)) +		return -ENODEV; +  	if (pid) {  		rcu_read_lock();  		tsk = find_task_by_vpid(pid); -		if (!tsk || tsk->flags & PF_EXITING) { +		if (!tsk) { +			rcu_read_unlock(); +			cgroup_unlock(); +			return -ESRCH; +		} +		if (threadgroup) { +			/* +			 * RCU protects this access, since tsk was found in the +			 * tid map. a race with de_thread may cause group_leader +			 * to stop being the leader, but cgroup_attach_proc will +			 * detect it later. +			 */ +			tsk = tsk->group_leader; +		} else if (tsk->flags & PF_EXITING) { +			/* optimization for the single-task-only case */  			rcu_read_unlock(); +			cgroup_unlock();  			return -ESRCH;  		} +		/* +		 * even if we're attaching all tasks in the thread group, we +		 * only need to check permissions on one of them. 
+		 */  		tcred = __task_cred(tsk);  		if (cred->euid &&  		    cred->euid != tcred->uid &&  		    cred->euid != tcred->suid) {  			rcu_read_unlock(); +			cgroup_unlock();  			return -EACCES;  		}  		get_task_struct(tsk);  		rcu_read_unlock();  	} else { -		tsk = current; +		if (threadgroup) +			tsk = current->group_leader; +		else +			tsk = current;  		get_task_struct(tsk);  	} -	ret = cgroup_attach_task(cgrp, tsk); +	if (threadgroup) { +		threadgroup_fork_write_lock(tsk); +		ret = cgroup_attach_proc(cgrp, tsk); +		threadgroup_fork_write_unlock(tsk); +	} else { +		ret = cgroup_attach_task(cgrp, tsk); +	}  	put_task_struct(tsk); +	cgroup_unlock();  	return ret;  }  static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)  { +	return attach_task_by_pid(cgrp, pid, false); +} + +static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) +{  	int ret; -	if (!cgroup_lock_live_group(cgrp)) -		return -ENODEV; -	ret = attach_task_by_pid(cgrp, pid); -	cgroup_unlock(); +	do { +		/* +		 * attach_proc fails with -EAGAIN if threadgroup leadership +		 * changes in the middle of the operation, in which case we need +		 * to find the task_struct for the new leader and start over. +		 */ +		ret = attach_task_by_pid(cgrp, tgid, true); +	} while (ret == -EAGAIN);  	return ret;  } @@ -3259,9 +3632,9 @@ static struct cftype files[] = {  	{  		.name = CGROUP_FILE_GENERIC_PREFIX "procs",  		.open = cgroup_procs_open, -		/* .write_u64 = cgroup_procs_write, TODO */ +		.write_u64 = cgroup_procs_write,  		.release = cgroup_pidlist_release, -		.mode = S_IRUGO, +		.mode = S_IRUGO | S_IWUSR,  	},  	{  		.name = "notify_on_release", @@ -4257,122 +4630,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)  }  /** - * cgroup_clone - clone the cgroup the given subsystem is attached to - * @tsk: the task to be moved - * @subsys: the given subsystem - * @nodename: the name for the new cgroup - * - * Duplicate the current cgroup in the hierarchy that the given - * subsystem is attached to, and move this task into the new - * child. 
- */ -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, -							char *nodename) -{ -	struct dentry *dentry; -	int ret = 0; -	struct cgroup *parent, *child; -	struct inode *inode; -	struct css_set *cg; -	struct cgroupfs_root *root; -	struct cgroup_subsys *ss; - -	/* We shouldn't be called by an unregistered subsystem */ -	BUG_ON(!subsys->active); - -	/* First figure out what hierarchy and cgroup we're dealing -	 * with, and pin them so we can drop cgroup_mutex */ -	mutex_lock(&cgroup_mutex); - again: -	root = subsys->root; -	if (root == &rootnode) { -		mutex_unlock(&cgroup_mutex); -		return 0; -	} - -	/* Pin the hierarchy */ -	if (!atomic_inc_not_zero(&root->sb->s_active)) { -		/* We race with the final deactivate_super() */ -		mutex_unlock(&cgroup_mutex); -		return 0; -	} - -	/* Keep the cgroup alive */ -	task_lock(tsk); -	parent = task_cgroup(tsk, subsys->subsys_id); -	cg = tsk->cgroups; -	get_css_set(cg); -	task_unlock(tsk); - -	mutex_unlock(&cgroup_mutex); - -	/* Now do the VFS work to create a cgroup */ -	inode = parent->dentry->d_inode; - -	/* Hold the parent directory mutex across this operation to -	 * stop anyone else deleting the new cgroup */ -	mutex_lock(&inode->i_mutex); -	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); -	if (IS_ERR(dentry)) { -		printk(KERN_INFO -		       "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, -		       PTR_ERR(dentry)); -		ret = PTR_ERR(dentry); -		goto out_release; -	} - -	/* Create the cgroup directory, which also creates the cgroup */ -	ret = vfs_mkdir(inode, dentry, 0755); -	child = __d_cgrp(dentry); -	dput(dentry); -	if (ret) { -		printk(KERN_INFO -		       "Failed to create cgroup %s: %d\n", nodename, -		       ret); -		goto out_release; -	} - -	/* The cgroup now exists. Retake cgroup_mutex and check -	 * that we're still in the same state that we thought we -	 * were. */ -	mutex_lock(&cgroup_mutex); -	if ((root != subsys->root) || -	    (parent != task_cgroup(tsk, subsys->subsys_id))) { -		/* Aargh, we raced ... */ -		mutex_unlock(&inode->i_mutex); -		put_css_set(cg); - -		deactivate_super(root->sb); -		/* The cgroup is still accessible in the VFS, but -		 * we're not going to try to rmdir() it at this -		 * point. */ -		printk(KERN_INFO -		       "Race in cgroup_clone() - leaking cgroup %s\n", -		       nodename); -		goto again; -	} - -	/* do any required auto-setup */ -	for_each_subsys(root, ss) { -		if (ss->post_clone) -			ss->post_clone(ss, child); -	} - -	/* All seems fine. Finish by moving the task into the new cgroup */ -	ret = cgroup_attach_task(child, tsk); -	mutex_unlock(&cgroup_mutex); - - out_release: -	mutex_unlock(&inode->i_mutex); - -	mutex_lock(&cgroup_mutex); -	put_css_set(cg); -	mutex_unlock(&cgroup_mutex); -	deactivate_super(root->sb); -	return ret; -} - -/**   * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp   * @cgrp: the cgroup in question   * @task: the task in question  |
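The controller-facing change above is that the attach path is now split into whole-operation hooks (can_attach, attach, cancel_attach) and the new per-thread hooks (can_attach_task, attach_task), plus a once-per-operation pre_attach. A hypothetical controller wiring up the new hooks could look like the fragment below; this is a kernel-context sketch, not compilable on its own, with only the hook names and signatures taken from the diff:

static int foo_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			  struct task_struct *tsk)
{
	/* whole-operation admission check; when a threadgroup is being
	 * attached, tsk is the group leader */
	return 0;
}

static int foo_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	/* per-thread admission check; may still return an error, since it
	 * runs before the commit point in cgroup_attach_proc() */
	return 0;
}

static void foo_pre_attach(struct cgroup *cgrp)
{
	/* runs once per operation, before the attach_task calls */
}

static void foo_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	/* cheap per-thread bookkeeping; returns void, so it has no way to
	 * fail after the commit point */
}

static void foo_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
		       struct cgroup *oldcgrp, struct task_struct *tsk)
{
	/* expensive whole-operation work; for a threadgroup, tsk is the
	 * leader and oldcgrp is only the leader's old cgroup (see the TODO
	 * in step 4 above) */
}

Splitting the hooks this way lets cgroup_attach_proc() run the expensive per-operation work once while still giving each controller a per-thread veto in step 1 and per-thread bookkeeping in step 3.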
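Step 0 of cgroup_attach_proc is a snapshot pattern: the thread list can only be walked under rcu_read_lock() or tasklist_lock, neither of which may be held across blocking work, so the group is copied into a pre-allocated array and every later, possibly sleeping step runs against that array. A rough userspace analogue of the same idea, with every name invented for the sketch and the caller assumed to prevent new members from appearing (mirroring threadgroup_fork_lock):

#include <pthread.h>
#include <stdlib.h>

struct member {
	int id;
	struct member *next;
};

struct registry {
	pthread_mutex_t lock;
	struct member *head;
	size_t count;
};

/* returns a malloc'd array of ids, or NULL; *nr gets the snapshot size */
static int *snapshot_members(struct registry *reg, size_t *nr)
{
	struct member *m;
	size_t i = 0, max;
	int *ids;

	/* sample the size first, like get_nr_threads() above; with new
	 * members excluded it can only shrink, so it is a safe upper bound */
	pthread_mutex_lock(&reg->lock);
	max = reg->count;
	pthread_mutex_unlock(&reg->lock);

	ids = malloc(max * sizeof(*ids));	/* may block; lock not held */
	if (!ids)
		return NULL;

	pthread_mutex_lock(&reg->lock);
	for (m = reg->head; m && i < max; m = m->next)
		ids[i++] = m->id;
	pthread_mutex_unlock(&reg->lock);

	*nr = i;
	return ids;	/* slow work can now proceed without the lock */
}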
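Step 2 is a reserve-then-commit pattern: every allocation that can fail is performed and parked on newcg_list before any task is touched, so the migration loop in step 3 has no error paths left. The same idea in stand-alone userspace C, with invented names and nothing taken from the kernel code above:

#include <stdlib.h>

struct reservation {
	void *buf;
	struct reservation *next;
};

/* phase 1: reserve one buffer per item; if this fails, nothing visible
 * has changed and the partial list is unwound completely */
static struct reservation *reserve_all(size_t nr, size_t size)
{
	struct reservation *head = NULL, *r;

	while (nr--) {
		r = malloc(sizeof(*r));
		if (r)
			r->buf = malloc(size);
		if (!r || !r->buf) {
			free(r);
			goto undo;
		}
		r->next = head;
		head = r;
	}
	return head;
undo:
	while (head) {
		r = head->next;
		free(head->buf);
		free(head);
		head = r;
	}
	return NULL;
}

/* phase 2: the commit consumes the reservations and cannot fail */
static void commit_all(struct reservation *head, void (*apply)(void *buf))
{
	struct reservation *r;

	while (head) {
		r = head->next;
		apply(head->buf);
		free(head->buf);
		free(head);
		head = r;
	}
}

cgroup_attach_proc() does the equivalent with css_set references: css_set_prefetch() fills newcg_list up front, cgroup_task_migrate(..., true) only looks entries up during the commit, and out_list_teardown drops whatever was prefetched but not needed.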
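From userspace, the now-writable "cgroup.procs" file at the bottom of the diff means a whole process can be migrated with a single write of its TGID, instead of writing every TID to "tasks" one by one. A minimal sketch, assuming a cgroup hierarchy is already mounted and the destination directory exists (the mount point is the caller's business, not part of this patch):

#include <stdio.h>
#include <stdlib.h>

static int move_threadgroup(const char *cgroup_dir, long tgid)
{
	char path[512];
	FILE *f;
	int ret = 0;

	snprintf(path, sizeof(path), "%s/cgroup.procs", cgroup_dir);
	f = fopen(path, "w");
	if (!f)
		return -1;
	/* a non-leader tid is redirected to its group leader by the kernel */
	if (fprintf(f, "%ld\n", tgid) < 0)
		ret = -1;
	if (fclose(f) != 0)
		ret = -1;
	return ret;
}

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <cgroup-dir> <tgid>\n", argv[0]);
		return 1;
	}
	return move_threadgroup(argv[1], strtol(argv[2], NULL, 10)) ? 1 : 0;
}

The file is created owner-writable (S_IRUGO | S_IWUSR), and attach_task_by_pid() additionally requires the writer's euid to match the target's uid or suid unless the writer is root, so an equivalent "echo <tgid> > <cgroup-dir>/cgroup.procs" is subject to the same checks.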