diff options
Diffstat (limited to 'kernel/cgroup.c')
| -rw-r--r-- | kernel/cgroup.c | 265 | 
1 files changed, 159 insertions, 106 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c6e1c17e6d..046c1609606 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg)  	struct cg_cgroup_link *link;  	struct cg_cgroup_link *saved_link; -	write_lock(&css_set_lock);  	hlist_del(&cg->hlist);  	css_set_count--; @@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg)  		list_del(&link->cgrp_link_list);  		kfree(link);  	} - -	write_unlock(&css_set_lock);  } -static void __release_css_set(struct kref *k, int taskexit) +static void __put_css_set(struct css_set *cg, int taskexit)  {  	int i; -	struct css_set *cg = container_of(k, struct css_set, ref); - +	/* +	 * Ensure that the refcount doesn't hit zero while any readers +	 * can see it. Similar to atomic_dec_and_lock(), but for an +	 * rwlock +	 */ +	if (atomic_add_unless(&cg->refcount, -1, 1)) +		return; +	write_lock(&css_set_lock); +	if (!atomic_dec_and_test(&cg->refcount)) { +		write_unlock(&css_set_lock); +		return; +	}  	unlink_css_set(cg); +	write_unlock(&css_set_lock);  	rcu_read_lock();  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { @@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit)  	kfree(cg);  } -static void release_css_set(struct kref *k) -{ -	__release_css_set(k, 0); -} - -static void release_css_set_taskexit(struct kref *k) -{ -	__release_css_set(k, 1); -} -  /*   * refcounted get/put for css_set objects   */  static inline void get_css_set(struct css_set *cg)  { -	kref_get(&cg->ref); +	atomic_inc(&cg->refcount);  }  static inline void put_css_set(struct css_set *cg)  { -	kref_put(&cg->ref, release_css_set); +	__put_css_set(cg, 0);  }  static inline void put_css_set_taskexit(struct css_set *cg)  { -	kref_put(&cg->ref, release_css_set_taskexit); +	__put_css_set(cg, 1);  }  /* @@ -427,7 +425,7 @@ static struct css_set *find_css_set(  		return NULL;  	} -	kref_init(&res->ref); +	atomic_set(&res->refcount, 1);  	INIT_LIST_HEAD(&res->cg_links);  	INIT_LIST_HEAD(&res->tasks);  	INIT_HLIST_NODE(&res->hlist); @@ -870,6 +868,14 @@ static struct super_operations cgroup_ops = {  	.remount_fs = cgroup_remount,  }; +static void init_cgroup_housekeeping(struct cgroup *cgrp) +{ +	INIT_LIST_HEAD(&cgrp->sibling); +	INIT_LIST_HEAD(&cgrp->children); +	INIT_LIST_HEAD(&cgrp->css_sets); +	INIT_LIST_HEAD(&cgrp->release_list); +	init_rwsem(&cgrp->pids_mutex); +}  static void init_cgroup_root(struct cgroupfs_root *root)  {  	struct cgroup *cgrp = &root->top_cgroup; @@ -878,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)  	root->number_of_cgroups = 1;  	cgrp->root = root;  	cgrp->top_cgroup = cgrp; -	INIT_LIST_HEAD(&cgrp->sibling); -	INIT_LIST_HEAD(&cgrp->children); -	INIT_LIST_HEAD(&cgrp->css_sets); -	INIT_LIST_HEAD(&cgrp->release_list); +	init_cgroup_housekeeping(cgrp);  }  static int cgroup_test_super(struct super_block *sb, void *data) @@ -1728,7 +1731,7 @@ int cgroup_task_count(const struct cgroup *cgrp)  	read_lock(&css_set_lock);  	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { -		count += atomic_read(&link->cg->ref.refcount); +		count += atomic_read(&link->cg->refcount);  	}  	read_unlock(&css_set_lock);  	return count; @@ -1997,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)   * but we cannot guarantee that the information we produce is correct   * unless we produce it entirely atomically.   * - * Upon tasks file open(), a struct ctr_struct is allocated, that - * will have a pointer to an array (also allocated here).  The struct - * ctr_struct * is stored in file->private_data.  Its resources will - * be freed by release() when the file is closed.  The array is used - * to sprintf the PIDs and then used by read().   */ -struct ctr_struct { -	char *buf; -	int bufsz; -};  /*   * Load into 'pidarray' up to 'npids' of the tasks using cgroup @@ -2088,42 +2082,132 @@ static int cmppid(const void *a, const void *b)  	return *(pid_t *)a - *(pid_t *)b;  } +  /* - * Convert array 'a' of 'npids' pid_t's to a string of newline separated - * decimal pids in 'buf'.  Don't write more than 'sz' chars, but return - * count 'cnt' of how many chars would be written if buf were large enough. + * seq_file methods for the "tasks" file. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->tasks_pids array.   */ -static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) + +static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)  { -	int cnt = 0; -	int i; +	/* +	 * Initially we receive a position value that corresponds to +	 * one more than the last pid shown (or 0 on the first call or +	 * after a seek to the start). Use a binary-search to find the +	 * next pid to display, if any +	 */ +	struct cgroup *cgrp = s->private; +	int index = 0, pid = *pos; +	int *iter; + +	down_read(&cgrp->pids_mutex); +	if (pid) { +		int end = cgrp->pids_length; +		int i; +		while (index < end) { +			int mid = (index + end) / 2; +			if (cgrp->tasks_pids[mid] == pid) { +				index = mid; +				break; +			} else if (cgrp->tasks_pids[mid] <= pid) +				index = mid + 1; +			else +				end = mid; +		} +	} +	/* If we're off the end of the array, we're done */ +	if (index >= cgrp->pids_length) +		return NULL; +	/* Update the abstract position to be the actual pid that we found */ +	iter = cgrp->tasks_pids + index; +	*pos = *iter; +	return iter; +} + +static void cgroup_tasks_stop(struct seq_file *s, void *v) +{ +	struct cgroup *cgrp = s->private; +	up_read(&cgrp->pids_mutex); +} + +static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +{ +	struct cgroup *cgrp = s->private; +	int *p = v; +	int *end = cgrp->tasks_pids + cgrp->pids_length; + +	/* +	 * Advance to the next pid in the array. If this goes off the +	 * end, we're done +	 */ +	p++; +	if (p >= end) { +		return NULL; +	} else { +		*pos = *p; +		return p; +	} +} + +static int cgroup_tasks_show(struct seq_file *s, void *v) +{ +	return seq_printf(s, "%d\n", *(int *)v); +} + +static struct seq_operations cgroup_tasks_seq_operations = { +	.start = cgroup_tasks_start, +	.stop = cgroup_tasks_stop, +	.next = cgroup_tasks_next, +	.show = cgroup_tasks_show, +}; + +static void release_cgroup_pid_array(struct cgroup *cgrp) +{ +	down_write(&cgrp->pids_mutex); +	BUG_ON(!cgrp->pids_use_count); +	if (!--cgrp->pids_use_count) { +		kfree(cgrp->tasks_pids); +		cgrp->tasks_pids = NULL; +		cgrp->pids_length = 0; +	} +	up_write(&cgrp->pids_mutex); +} + +static int cgroup_tasks_release(struct inode *inode, struct file *file) +{ +	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + +	if (!(file->f_mode & FMODE_READ)) +		return 0; -	for (i = 0; i < npids; i++) -		cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); -	return cnt; +	release_cgroup_pid_array(cgrp); +	return seq_release(inode, file);  } +static struct file_operations cgroup_tasks_operations = { +	.read = seq_read, +	.llseek = seq_lseek, +	.write = cgroup_file_write, +	.release = cgroup_tasks_release, +}; +  /* - * Handle an open on 'tasks' file.  Prepare a buffer listing the + * Handle an open on 'tasks' file.  Prepare an array containing the   * process id's of tasks currently attached to the cgroup being opened. - * - * Does not require any specific cgroup mutexes, and does not take any.   */ +  static int cgroup_tasks_open(struct inode *unused, struct file *file)  {  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); -	struct ctr_struct *ctr;  	pid_t *pidarray;  	int npids; -	char c; +	int retval; +	/* Nothing to do for write-only files */  	if (!(file->f_mode & FMODE_READ))  		return 0; -	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); -	if (!ctr) -		goto err0; -  	/*  	 * If cgroup gets more users after we read count, we won't have  	 * enough space - tough.  This race is indistinguishable to the @@ -2131,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)  	 * show up until sometime later on.  	 */  	npids = cgroup_task_count(cgrp); -	if (npids) { -		pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); -		if (!pidarray) -			goto err1; - -		npids = pid_array_load(pidarray, npids, cgrp); -		sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - -		/* Call pid_array_to_buf() twice, first just to get bufsz */ -		ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; -		ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); -		if (!ctr->buf) -			goto err2; -		ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); - -		kfree(pidarray); -	} else { -		ctr->buf = NULL; -		ctr->bufsz = 0; -	} -	file->private_data = ctr; -	return 0; - -err2: -	kfree(pidarray); -err1: -	kfree(ctr); -err0: -	return -ENOMEM; -} - -static ssize_t cgroup_tasks_read(struct cgroup *cgrp, -				    struct cftype *cft, -				    struct file *file, char __user *buf, -				    size_t nbytes, loff_t *ppos) -{ -	struct ctr_struct *ctr = file->private_data; +	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); +	if (!pidarray) +		return -ENOMEM; +	npids = pid_array_load(pidarray, npids, cgrp); +	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); -	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); -} +	/* +	 * Store the array in the cgroup, freeing the old +	 * array if necessary +	 */ +	down_write(&cgrp->pids_mutex); +	kfree(cgrp->tasks_pids); +	cgrp->tasks_pids = pidarray; +	cgrp->pids_length = npids; +	cgrp->pids_use_count++; +	up_write(&cgrp->pids_mutex); -static int cgroup_tasks_release(struct inode *unused_inode, -					struct file *file) -{ -	struct ctr_struct *ctr; +	file->f_op = &cgroup_tasks_operations; -	if (file->f_mode & FMODE_READ) { -		ctr = file->private_data; -		kfree(ctr->buf); -		kfree(ctr); +	retval = seq_open(file, &cgroup_tasks_seq_operations); +	if (retval) { +		release_cgroup_pid_array(cgrp); +		return retval;  	} +	((struct seq_file *)file->private_data)->private = cgrp;  	return 0;  } @@ -2210,7 +2268,6 @@ static struct cftype files[] = {  	{  		.name = "tasks",  		.open = cgroup_tasks_open, -		.read = cgroup_tasks_read,  		.write_u64 = cgroup_tasks_write,  		.release = cgroup_tasks_release,  		.private = FILE_TASKLIST, @@ -2300,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	mutex_lock(&cgroup_mutex); -	INIT_LIST_HEAD(&cgrp->sibling); -	INIT_LIST_HEAD(&cgrp->children); -	INIT_LIST_HEAD(&cgrp->css_sets); -	INIT_LIST_HEAD(&cgrp->release_list); +	init_cgroup_housekeeping(cgrp);  	cgrp->parent = parent;  	cgrp->root = parent->root; @@ -2495,8 +2549,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)  int __init cgroup_init_early(void)  {  	int i; -	kref_init(&init_css_set.ref); -	kref_get(&init_css_set.ref); +	atomic_set(&init_css_set.refcount, 1);  	INIT_LIST_HEAD(&init_css_set.cg_links);  	INIT_LIST_HEAD(&init_css_set.tasks);  	INIT_HLIST_NODE(&init_css_set.hlist);  |