Diffstat (limited to 'kernel')
83 files changed, 5210 insertions, 2337 deletions
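The bulk of the new code below is the CSS ID allocator added to kernel/cgroup.c, which hands each cgroup subsystem state a small integer ID using the two-step idr API of this kernel generation: idr_pre_get() preloads idr nodes with GFP_KERNEL outside the lock, idr_get_new_above() then hands out an ID of 1 or higher under a spinlock, and anything above CSS_ID_MAX is given back with idr_remove(). A minimal sketch of that allocation pattern, with illustrative names (my_idr, my_lock, my_alloc_id, MY_ID_MAX) rather than the patch's own symbols:

#include <linux/idr.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>
#include <linux/errno.h>

#define MY_ID_MAX	65535

static DEFINE_IDR(my_idr);
static DEFINE_SPINLOCK(my_lock);

/* Allocate an ID in [1, MY_ID_MAX] for @ptr, or return a -errno. */
static int my_alloc_id(void *ptr)
{
	int id, err;

	/* Preload idr nodes; may sleep, so do it before taking the lock. */
	if (!idr_pre_get(&my_idr, GFP_KERNEL))
		return -ENOMEM;

	spin_lock(&my_lock);
	/* Start at 1 so that 0 stays reserved, as the patch does for css_id. */
	err = idr_get_new_above(&my_idr, ptr, 1, &id);
	spin_unlock(&my_lock);
	if (err)
		return -ENOSPC;

	if (id > MY_ID_MAX) {
		/* Outside the allowed range: hand the slot back. */
		spin_lock(&my_lock);
		idr_remove(&my_idr, id);
		spin_unlock(&my_lock);
		return -ENOSPC;
	}
	return id;
}

get_new_cssid() in the hunks below follows the same shape, except that the pointer it stores is a freshly allocated struct css_id carrying the ID and the hierarchy depth, and the css pointer is only filled in later from cgroup_populate_dir().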
diff --git a/kernel/Makefile b/kernel/Makefile index e4791b3ba55..bab1dffe37e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o  obj-$(CONFIG_FUNCTION_TRACER) += trace/  obj-$(CONFIG_TRACING) += trace/  obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_SLOW_WORK) += slow-work.o  ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/async.c b/kernel/async.c index f565891f2c9..968ef9457d4 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -49,6 +49,7 @@ asynchronous and synchronous parts of the kernel.  */  #include <linux/async.h> +#include <linux/bug.h>  #include <linux/module.h>  #include <linux/wait.h>  #include <linux/sched.h> @@ -387,20 +388,11 @@ static int async_manager_thread(void *unused)  static int __init async_init(void)  { -	if (async_enabled) -		if (IS_ERR(kthread_run(async_manager_thread, NULL, -				       "async/mgr"))) -			async_enabled = 0; -	return 0; -} +	async_enabled = +		!IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr")); -static int __init setup_async(char *str) -{ -	async_enabled = 1; -	return 1; +	WARN_ON(!async_enabled); +	return 0;  } -__setup("fastboot", setup_async); - -  core_initcall(async_init); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8cbddff6c28..2bfc6478676 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -66,6 +66,7 @@  #include <linux/syscalls.h>  #include <linux/inotify.h>  #include <linux/capability.h> +#include <linux/fs_struct.h>  #include "audit.h" diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9edb5c4b79b..382109b5bae 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -94,7 +94,6 @@ struct cgroupfs_root {  	char release_agent_path[PATH_MAX];  }; -  /*   * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the   * subsystems that are otherwise unattached - it never has more than a @@ -102,6 +101,39 @@ struct cgroupfs_root {   */  static struct cgroupfs_root rootnode; +/* + * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when + * cgroup_subsys->use_id != 0. + */ +#define CSS_ID_MAX	(65535) +struct css_id { +	/* +	 * The css to which this ID points. This pointer is set to valid value +	 * after cgroup is populated. If cgroup is removed, this will be NULL. +	 * This pointer is expected to be RCU-safe because destroy() +	 * is called after synchronize_rcu(). But for safe use, css_is_removed() +	 * css_tryget() should be used for avoiding race. +	 */ +	struct cgroup_subsys_state *css; +	/* +	 * ID of this css. +	 */ +	unsigned short id; +	/* +	 * Depth in hierarchy which this ID belongs to. +	 */ +	unsigned short depth; +	/* +	 * ID is freed by RCU. (and lookup routine is RCU safe.) +	 */ +	struct rcu_head rcu_head; +	/* +	 * Hierarchy of CSS ID belongs to. +	 */ +	unsigned short stack[0]; /* Array of Length (depth+1) */ +}; + +  /* The list of hierarchy roots */  static LIST_HEAD(roots); @@ -185,6 +217,8 @@ struct cg_cgroup_link {  static struct css_set init_css_set;  static struct cg_cgroup_link init_css_set_link; +static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); +  /* css_set_lock protects the list of css_set objects, and the   * chain of tasks off each css_set.  
Nests outside task->alloc_lock   * due to cgroup_iter_start() */ @@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {  	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,  }; +static int alloc_css_id(struct cgroup_subsys *ss, +			struct cgroup *parent, struct cgroup *child); +  static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)  {  	struct inode *inode = new_inode(sb); @@ -585,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)   * Call subsys's pre_destroy handler.   * This is called before css refcnt check.   */ -static void cgroup_call_pre_destroy(struct cgroup *cgrp) +static int cgroup_call_pre_destroy(struct cgroup *cgrp)  {  	struct cgroup_subsys *ss; +	int ret = 0; +  	for_each_subsys(cgrp->root, ss) -		if (ss->pre_destroy) -			ss->pre_destroy(ss, cgrp); -	return; +		if (ss->pre_destroy) { +			ret = ss->pre_destroy(ss, cgrp); +			if (ret) +				break; +		} +	return ret;  }  static void free_cgroup_rcu(struct rcu_head *obj) @@ -685,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)  	remove_dir(dentry);  } +/* + * A queue for waiters to do rmdir() cgroup. A tasks will sleep when + * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some + * reference to css->refcnt. In general, this refcnt is expected to goes down + * to zero, soon. + * + * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + */ +DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); + +static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +{ +	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) +		wake_up_all(&cgroup_rmdir_waitq); +} +  static int rebind_subsystems(struct cgroupfs_root *root,  			      unsigned long final_bits)  { @@ -857,16 +915,16 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  	}  	ret = rebind_subsystems(root, opts.subsys_bits); +	if (ret) +		goto out_unlock;  	/* (re)populate subsystem files */ -	if (!ret) -		cgroup_populate_dir(cgrp); +	cgroup_populate_dir(cgrp);  	if (opts.release_agent)  		strcpy(root->release_agent_path, opts.release_agent);   out_unlock: -	if (opts.release_agent) -		kfree(opts.release_agent); +	kfree(opts.release_agent);  	mutex_unlock(&cgroup_mutex);  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);  	return ret; @@ -969,15 +1027,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,  	/* First find the desired set of subsystems */  	ret = parse_cgroupfs_options(data, &opts);  	if (ret) { -		if (opts.release_agent) -			kfree(opts.release_agent); +		kfree(opts.release_agent);  		return ret;  	}  	root = kzalloc(sizeof(*root), GFP_KERNEL);  	if (!root) { -		if (opts.release_agent) -			kfree(opts.release_agent); +		kfree(opts.release_agent);  		return -ENOMEM;  	} @@ -1071,7 +1127,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,  		mutex_unlock(&cgroup_mutex);  	} -	return simple_set_mnt(mnt, sb); +	simple_set_mnt(mnt, sb); +	return 0;   free_cg_links:  	free_cg_links(&tmp_cg_links); @@ -1279,6 +1336,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);  	synchronize_rcu();  	put_css_set(cg); + +	/* +	 * wake up rmdir() waiter. the rmdir should fail since the cgroup +	 * is no longer empty. 
+	 */ +	cgroup_wakeup_rmdir_waiters(cgrp);  	return 0;  } @@ -1624,10 +1687,10 @@ static struct inode_operations cgroup_dir_inode_operations = {  	.rename = cgroup_rename,  }; -static int cgroup_create_file(struct dentry *dentry, int mode, +static int cgroup_create_file(struct dentry *dentry, mode_t mode,  				struct super_block *sb)  { -	static struct dentry_operations cgroup_dops = { +	static const struct dentry_operations cgroup_dops = {  		.d_iput = cgroup_diput,  	}; @@ -1670,7 +1733,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,   * @mode: mode to set on new directory.   */  static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, -				int mode) +				mode_t mode)  {  	struct dentry *parent;  	int error = 0; @@ -1688,6 +1751,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,  	return error;  } +/** + * cgroup_file_mode - deduce file mode of a control file + * @cft: the control file in question + * + * returns cft->mode if ->mode is not 0 + * returns S_IRUGO|S_IWUSR if it has both a read and a write handler + * returns S_IRUGO if it has only a read handler + * returns S_IWUSR if it has only a write hander + */ +static mode_t cgroup_file_mode(const struct cftype *cft) +{ +	mode_t mode = 0; + +	if (cft->mode) +		return cft->mode; + +	if (cft->read || cft->read_u64 || cft->read_s64 || +	    cft->read_map || cft->read_seq_string) +		mode |= S_IRUGO; + +	if (cft->write || cft->write_u64 || cft->write_s64 || +	    cft->write_string || cft->trigger) +		mode |= S_IWUSR; + +	return mode; +} +  int cgroup_add_file(struct cgroup *cgrp,  		       struct cgroup_subsys *subsys,  		       const struct cftype *cft) @@ -1695,6 +1785,7 @@ int cgroup_add_file(struct cgroup *cgrp,  	struct dentry *dir = cgrp->dentry;  	struct dentry *dentry;  	int error; +	mode_t mode;  	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };  	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { @@ -1705,7 +1796,8 @@ int cgroup_add_file(struct cgroup *cgrp,  	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));  	dentry = lookup_one_len(name, dir, strlen(name));  	if (!IS_ERR(dentry)) { -		error = cgroup_create_file(dentry, 0644 | S_IFREG, +		mode = cgroup_file_mode(cft); +		error = cgroup_create_file(dentry, mode | S_IFREG,  						cgrp->root->sb);  		if (!error)  			dentry->d_fsdata = (void *)cft; @@ -2287,6 +2379,7 @@ static struct cftype files[] = {  		.write_u64 = cgroup_tasks_write,  		.release = cgroup_tasks_release,  		.private = FILE_TASKLIST, +		.mode = S_IRUGO | S_IWUSR,  	},  	{ @@ -2326,6 +2419,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)  		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)  			return err;  	} +	/* This cgroup is ready now */ +	for_each_subsys(cgrp->root, ss) { +		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; +		/* +		 * Update id->css pointer and make this css visible from +		 * CSS ID functions. This pointer will be dereferened +		 * from RCU-read-side without locks. 
+		 */ +		if (css->id) +			rcu_assign_pointer(css->id->css, css); +	}  	return 0;  } @@ -2337,6 +2441,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,  	css->cgroup = cgrp;  	atomic_set(&css->refcnt, 1);  	css->flags = 0; +	css->id = NULL;  	if (cgrp == dummytop)  		set_bit(CSS_ROOT, &css->flags);  	BUG_ON(cgrp->subsys[ss->subsys_id]); @@ -2375,7 +2480,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)   * Must be called with the mutex on the parent inode held   */  static long cgroup_create(struct cgroup *parent, struct dentry *dentry, -			     int mode) +			     mode_t mode)  {  	struct cgroup *cgrp;  	struct cgroupfs_root *root = parent->root; @@ -2412,6 +2517,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  			goto err_destroy;  		}  		init_cgroup_css(css, ss, cgrp); +		if (ss->use_id) +			if (alloc_css_id(ss, parent, cgrp)) +				goto err_destroy; +		/* At error, ->destroy() callback has to free assigned ID. */  	}  	cgroup_lock_hierarchy(root); @@ -2554,9 +2663,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)  	struct cgroup *cgrp = dentry->d_fsdata;  	struct dentry *d;  	struct cgroup *parent; +	DEFINE_WAIT(wait); +	int ret;  	/* the vfs holds both inode->i_mutex already */ - +again:  	mutex_lock(&cgroup_mutex);  	if (atomic_read(&cgrp->count) != 0) {  		mutex_unlock(&cgroup_mutex); @@ -2572,17 +2683,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)  	 * Call pre_destroy handlers of subsys. Notify subsystems  	 * that rmdir() request comes.  	 */ -	cgroup_call_pre_destroy(cgrp); +	ret = cgroup_call_pre_destroy(cgrp); +	if (ret) +		return ret;  	mutex_lock(&cgroup_mutex);  	parent = cgrp->parent; - -	if (atomic_read(&cgrp->count) -	    || !list_empty(&cgrp->children) -	    || !cgroup_clear_css_refs(cgrp)) { +	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {  		mutex_unlock(&cgroup_mutex);  		return -EBUSY;  	} +	/* +	 * css_put/get is provided for subsys to grab refcnt to css. In typical +	 * case, subsystem has no reference after pre_destroy(). But, under +	 * hierarchy management, some *temporal* refcnt can be hold. +	 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys +	 * is really busy, it should return -EBUSY at pre_destroy(). wake_up +	 * is called when css_put() is called and refcnt goes down to 0. +	 */ +	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); +	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); + +	if (!cgroup_clear_css_refs(cgrp)) { +		mutex_unlock(&cgroup_mutex); +		schedule(); +		finish_wait(&cgroup_rmdir_waitq, &wait); +		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); +		if (signal_pending(current)) +			return -EINTR; +		goto again; +	} +	/* NO css_tryget() can success after here. 
*/ +	finish_wait(&cgroup_rmdir_waitq, &wait); +	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);  	spin_lock(&release_list_lock);  	set_bit(CGRP_REMOVED, &cgrp->flags); @@ -2707,6 +2840,8 @@ int __init cgroup_init(void)  		struct cgroup_subsys *ss = subsys[i];  		if (!ss->early_init)  			cgroup_init_subsys(ss); +		if (ss->use_id) +			cgroup_subsys_init_idr(ss);  	}  	/* Add init_css_set to the hash table */ @@ -3083,18 +3218,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,  }  /** - * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp + * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp   * @cgrp: the cgroup in question + * @task: the task in question   * - * See if @cgrp is a descendant of the current task's cgroup in - * the appropriate hierarchy. + * See if @cgrp is a descendant of @task's cgroup in the appropriate + * hierarchy.   *   * If we are sending in dummytop, then presumably we are creating   * the top cgroup in the subsystem.   *   * Called only by the ns (nsproxy) cgroup.   */ -int cgroup_is_descendant(const struct cgroup *cgrp) +int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)  {  	int ret;  	struct cgroup *target; @@ -3104,7 +3240,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)  		return 1;  	get_first_subsys(cgrp, NULL, &subsys_id); -	target = task_cgroup(current, subsys_id); +	target = task_cgroup(task, subsys_id);  	while (cgrp != target && cgrp!= cgrp->top_cgroup)  		cgrp = cgrp->parent;  	ret = (cgrp == target); @@ -3137,10 +3273,12 @@ void __css_put(struct cgroup_subsys_state *css)  {  	struct cgroup *cgrp = css->cgroup;  	rcu_read_lock(); -	if ((atomic_dec_return(&css->refcnt) == 1) && -	    notify_on_release(cgrp)) { -		set_bit(CGRP_RELEASABLE, &cgrp->flags); -		check_for_release(cgrp); +	if (atomic_dec_return(&css->refcnt) == 1) { +		if (notify_on_release(cgrp)) { +			set_bit(CGRP_RELEASABLE, &cgrp->flags); +			check_for_release(cgrp); +		} +		cgroup_wakeup_rmdir_waiters(cgrp);  	}  	rcu_read_unlock();  } @@ -3240,3 +3378,232 @@ static int __init cgroup_disable(char *str)  	return 1;  }  __setup("cgroup_disable=", cgroup_disable); + +/* + * Functons for CSS ID. + */ + +/* + *To get ID other than 0, this should be called when !cgroup_is_removed(). 
+ */ +unsigned short css_id(struct cgroup_subsys_state *css) +{ +	struct css_id *cssid = rcu_dereference(css->id); + +	if (cssid) +		return cssid->id; +	return 0; +} + +unsigned short css_depth(struct cgroup_subsys_state *css) +{ +	struct css_id *cssid = rcu_dereference(css->id); + +	if (cssid) +		return cssid->depth; +	return 0; +} + +bool css_is_ancestor(struct cgroup_subsys_state *child, +		    const struct cgroup_subsys_state *root) +{ +	struct css_id *child_id = rcu_dereference(child->id); +	struct css_id *root_id = rcu_dereference(root->id); + +	if (!child_id || !root_id || (child_id->depth < root_id->depth)) +		return false; +	return child_id->stack[root_id->depth] == root_id->id; +} + +static void __free_css_id_cb(struct rcu_head *head) +{ +	struct css_id *id; + +	id = container_of(head, struct css_id, rcu_head); +	kfree(id); +} + +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) +{ +	struct css_id *id = css->id; +	/* When this is called before css_id initialization, id can be NULL */ +	if (!id) +		return; + +	BUG_ON(!ss->use_id); + +	rcu_assign_pointer(id->css, NULL); +	rcu_assign_pointer(css->id, NULL); +	spin_lock(&ss->id_lock); +	idr_remove(&ss->idr, id->id); +	spin_unlock(&ss->id_lock); +	call_rcu(&id->rcu_head, __free_css_id_cb); +} + +/* + * This is called by init or create(). Then, calls to this function are + * always serialized (By cgroup_mutex() at create()). + */ + +static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) +{ +	struct css_id *newid; +	int myid, error, size; + +	BUG_ON(!ss->use_id); + +	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); +	newid = kzalloc(size, GFP_KERNEL); +	if (!newid) +		return ERR_PTR(-ENOMEM); +	/* get id */ +	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { +		error = -ENOMEM; +		goto err_out; +	} +	spin_lock(&ss->id_lock); +	/* Don't use 0. 
allocates an ID of 1-65535 */ +	error = idr_get_new_above(&ss->idr, newid, 1, &myid); +	spin_unlock(&ss->id_lock); + +	/* Returns error when there are no free spaces for new ID.*/ +	if (error) { +		error = -ENOSPC; +		goto err_out; +	} +	if (myid > CSS_ID_MAX) +		goto remove_idr; + +	newid->id = myid; +	newid->depth = depth; +	return newid; +remove_idr: +	error = -ENOSPC; +	spin_lock(&ss->id_lock); +	idr_remove(&ss->idr, myid); +	spin_unlock(&ss->id_lock); +err_out: +	kfree(newid); +	return ERR_PTR(error); + +} + +static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) +{ +	struct css_id *newid; +	struct cgroup_subsys_state *rootcss; + +	spin_lock_init(&ss->id_lock); +	idr_init(&ss->idr); + +	rootcss = init_css_set.subsys[ss->subsys_id]; +	newid = get_new_cssid(ss, 0); +	if (IS_ERR(newid)) +		return PTR_ERR(newid); + +	newid->stack[0] = newid->id; +	newid->css = rootcss; +	rootcss->id = newid; +	return 0; +} + +static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, +			struct cgroup *child) +{ +	int subsys_id, i, depth = 0; +	struct cgroup_subsys_state *parent_css, *child_css; +	struct css_id *child_id, *parent_id = NULL; + +	subsys_id = ss->subsys_id; +	parent_css = parent->subsys[subsys_id]; +	child_css = child->subsys[subsys_id]; +	depth = css_depth(parent_css) + 1; +	parent_id = parent_css->id; + +	child_id = get_new_cssid(ss, depth); +	if (IS_ERR(child_id)) +		return PTR_ERR(child_id); + +	for (i = 0; i < depth; i++) +		child_id->stack[i] = parent_id->stack[i]; +	child_id->stack[depth] = child_id->id; +	/* +	 * child_id->css pointer will be set after this cgroup is available +	 * see cgroup_populate_dir() +	 */ +	rcu_assign_pointer(child_css->id, child_id); + +	return 0; +} + +/** + * css_lookup - lookup css by id + * @ss: cgroup subsys to be looked into. + * @id: the id + * + * Returns pointer to cgroup_subsys_state if there is valid one with id. + * NULL if not. Should be called under rcu_read_lock() + */ +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) +{ +	struct css_id *cssid = NULL; + +	BUG_ON(!ss->use_id); +	cssid = idr_find(&ss->idr, id); + +	if (unlikely(!cssid)) +		return NULL; + +	return rcu_dereference(cssid->css); +} + +/** + * css_get_next - lookup next cgroup under specified hierarchy. + * @ss: pointer to subsystem + * @id: current position of iteration. + * @root: pointer to css. search tree under this. + * @foundid: position of found object. + * + * Search next css under the specified hierarchy of rootid. Calling under + * rcu_read_lock() is necessary. Returns NULL if it reaches the end. + */ +struct cgroup_subsys_state * +css_get_next(struct cgroup_subsys *ss, int id, +	     struct cgroup_subsys_state *root, int *foundid) +{ +	struct cgroup_subsys_state *ret = NULL; +	struct css_id *tmp; +	int tmpid; +	int rootid = css_id(root); +	int depth = css_depth(root); + +	if (!rootid) +		return NULL; + +	BUG_ON(!ss->use_id); +	/* fill start point for scan */ +	tmpid = id; +	while (1) { +		/* +		 * scan next entry from bitmap(tree), tmpid is updated after +		 * idr_get_next(). 
+		 */ +		spin_lock(&ss->id_lock); +		tmp = idr_get_next(&ss->idr, &tmpid); +		spin_unlock(&ss->id_lock); + +		if (!tmp) +			break; +		if (tmp->depth >= depth && tmp->stack[depth] == rootid) { +			ret = rcu_dereference(tmp->css); +			if (ret) { +				*foundid = tmpid; +				break; +			} +		} +		/* continue to scan from next id */ +		tmpid = tmpid + 1; +	} +	return ret; +} + diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c index daca6209202..0c92d797baa 100644 --- a/kernel/cgroup_debug.c +++ b/kernel/cgroup_debug.c @@ -40,9 +40,7 @@ static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)  {  	u64 count; -	cgroup_lock();  	count = cgroup_task_count(cont); -	cgroup_unlock();  	return count;  } diff --git a/kernel/cpu.c b/kernel/cpu.c index 79e40f00dcb..395b6974dc8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -281,7 +281,7 @@ int __ref cpu_down(unsigned int cpu)  		goto out;  	} -	cpu_clear(cpu, cpu_active_map); +	set_cpu_active(cpu, false);  	/*  	 * Make sure the all cpus did the reschedule and are not @@ -296,7 +296,7 @@ int __ref cpu_down(unsigned int cpu)  	err = _cpu_down(cpu, 0);  	if (cpu_online(cpu)) -		cpu_set(cpu, cpu_active_map); +		set_cpu_active(cpu, true);  out:  	cpu_maps_update_done(); @@ -333,7 +333,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)  		goto out_notify;  	BUG_ON(!cpu_online(cpu)); -	cpu_set(cpu, cpu_active_map); +	set_cpu_active(cpu, true);  	/* Now call notifier in preparation. */  	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f76db9dcaa0..026faccca86 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)  	return container_of(task_subsys_state(task, cpuset_subsys_id),  			    struct cpuset, css);  } -struct cpuset_hotplug_scanner { -	struct cgroup_scanner scan; -	struct cgroup *to; -};  /* bits in struct cpuset flags field */  typedef enum { @@ -521,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)  	return 0;  } +#ifdef CONFIG_SMP  /*   * Helper routine for generate_sched_domains().   * Do cpusets a, b have overlapping cpus_allowed masks? @@ -815,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)  	put_online_cpus();  } +#else /* !CONFIG_SMP */ +static void do_rebuild_sched_domains(struct work_struct *unused) +{ +} + +static int generate_sched_domains(struct cpumask **domains, +			struct sched_domain_attr **attributes) +{ +	*domains = NULL; +	return 1; +} +#endif /* CONFIG_SMP */  static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); @@ -1026,101 +1035,70 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,  	mutex_unlock(&callback_mutex);  } +/* + * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new + * nodes if memory_migrate flag is set. Called with cgroup_mutex held. + */ +static void cpuset_change_nodemask(struct task_struct *p, +				   struct cgroup_scanner *scan) +{ +	struct mm_struct *mm; +	struct cpuset *cs; +	int migrate; +	const nodemask_t *oldmem = scan->data; + +	mm = get_task_mm(p); +	if (!mm) +		return; + +	cs = cgroup_cs(scan->cg); +	migrate = is_memory_migrate(cs); + +	mpol_rebind_mm(mm, &cs->mems_allowed); +	if (migrate) +		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); +	mmput(mm); +} +  static void *cpuset_being_rebound;  /**   * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.   
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed   * @oldmem: old mems_allowed of cpuset cs + * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()   *   * Called with cgroup_mutex held - * Return 0 if successful, -errno if not. + * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * if @heap != NULL.   */ -static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) +static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, +				 struct ptr_heap *heap)  { -	struct task_struct *p; -	struct mm_struct **mmarray; -	int i, n, ntasks; -	int migrate; -	int fudge; -	struct cgroup_iter it; -	int retval; +	struct cgroup_scanner scan;  	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */ -	fudge = 10;				/* spare mmarray[] slots */ -	fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ -	retval = -ENOMEM; - -	/* -	 * Allocate mmarray[] to hold mm reference for each task -	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding -	 * tasklist_lock.  We could use GFP_ATOMIC, but with a -	 * few more lines of code, we can retry until we get a big -	 * enough mmarray[] w/o using GFP_ATOMIC. -	 */ -	while (1) { -		ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */ -		ntasks += fudge; -		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); -		if (!mmarray) -			goto done; -		read_lock(&tasklist_lock);		/* block fork */ -		if (cgroup_task_count(cs->css.cgroup) <= ntasks) -			break;				/* got enough */ -		read_unlock(&tasklist_lock);		/* try again */ -		kfree(mmarray); -	} - -	n = 0; - -	/* Load up mmarray[] with mm reference for each task in cpuset. */ -	cgroup_iter_start(cs->css.cgroup, &it); -	while ((p = cgroup_iter_next(cs->css.cgroup, &it))) { -		struct mm_struct *mm; - -		if (n >= ntasks) { -			printk(KERN_WARNING -				"Cpuset mempolicy rebind incomplete.\n"); -			break; -		} -		mm = get_task_mm(p); -		if (!mm) -			continue; -		mmarray[n++] = mm; -	} -	cgroup_iter_end(cs->css.cgroup, &it); -	read_unlock(&tasklist_lock); +	scan.cg = cs->css.cgroup; +	scan.test_task = NULL; +	scan.process_task = cpuset_change_nodemask; +	scan.heap = heap; +	scan.data = (nodemask_t *)oldmem;  	/* -	 * Now that we've dropped the tasklist spinlock, we can -	 * rebind the vma mempolicies of each mm in mmarray[] to their -	 * new cpuset, and release that mm.  The mpol_rebind_mm() -	 * call takes mmap_sem, which we couldn't take while holding -	 * tasklist_lock.  Forks can happen again now - the mpol_dup() -	 * cpuset_being_rebound check will catch such forks, and rebind -	 * their vma mempolicies too.  Because we still hold the global -	 * cgroup_mutex, we know that no other rebind effort will -	 * be contending for the global variable cpuset_being_rebound. +	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't +	 * take while holding tasklist_lock.  Forks can happen - the +	 * mpol_dup() cpuset_being_rebound check will catch such forks, +	 * and rebind their vma mempolicies too.  Because we still hold +	 * the global cgroup_mutex, we know that no other rebind effort +	 * will be contending for the global variable cpuset_being_rebound.  	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()  	 * is idempotent.  Also migrate pages in each mm to new nodes.  	 
*/ -	migrate = is_memory_migrate(cs); -	for (i = 0; i < n; i++) { -		struct mm_struct *mm = mmarray[i]; - -		mpol_rebind_mm(mm, &cs->mems_allowed); -		if (migrate) -			cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); -		mmput(mm); -	} +	cgroup_scan_tasks(&scan);  	/* We're done rebinding vmas to this cpuset's new mems_allowed. */ -	kfree(mmarray);  	cpuset_being_rebound = NULL; -	retval = 0; -done: -	return retval;  }  /* @@ -1141,6 +1119,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  {  	nodemask_t oldmem;  	int retval; +	struct ptr_heap heap;  	/*  	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; @@ -1175,12 +1154,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  	if (retval < 0)  		goto done; +	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); +	if (retval < 0) +		goto done; +  	mutex_lock(&callback_mutex);  	cs->mems_allowed = trialcs->mems_allowed;  	cs->mems_generation = cpuset_mems_generation++;  	mutex_unlock(&callback_mutex); -	retval = update_tasks_nodemask(cs, &oldmem); +	update_tasks_nodemask(cs, &oldmem, &heap); + +	heap_free(&heap);  done:  	return retval;  } @@ -1192,8 +1177,10 @@ int current_cpuset_is_being_rebound(void)  static int update_relax_domain_level(struct cpuset *cs, s64 val)  { +#ifdef CONFIG_SMP  	if (val < -1 || val >= SD_LV_MAX)  		return -EINVAL; +#endif  	if (val != cs->relax_domain_level) {  		cs->relax_domain_level = val; @@ -1355,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,  			     struct cgroup *cont, struct task_struct *tsk)  {  	struct cpuset *cs = cgroup_cs(cont); -	int ret = 0;  	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))  		return -ENOSPC; -	if (tsk->flags & PF_THREAD_BOUND) { -		mutex_lock(&callback_mutex); -		if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed)) -			ret = -EINVAL; -		mutex_unlock(&callback_mutex); -	} +	/* +	 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we +	 * cannot change their cpu affinity and isolating such threads by their +	 * set of allowed nodes is unnecessary.  Thus, cpusets are not +	 * applicable for such threads.  This prevents checking for success of +	 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may +	 * be changed. +	 */ +	if (tsk->flags & PF_THREAD_BOUND) +		return -EINVAL; -	return ret < 0 ? 
ret : security_task_setscheduler(tsk, 0, NULL); +	return security_task_setscheduler(tsk, 0, NULL);  }  static void cpuset_attach(struct cgroup_subsys *ss, @@ -1706,6 +1696,7 @@ static struct cftype files[] = {  		.read_u64 = cpuset_read_u64,  		.write_u64 = cpuset_write_u64,  		.private = FILE_MEMORY_PRESSURE, +		.mode = S_IRUGO,  	},  	{ @@ -1913,10 +1904,9 @@ int __init cpuset_init(void)  static void cpuset_do_move_task(struct task_struct *tsk,  				struct cgroup_scanner *scan)  { -	struct cpuset_hotplug_scanner *chsp; +	struct cgroup *new_cgroup = scan->data; -	chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); -	cgroup_attach_task(chsp->to, tsk); +	cgroup_attach_task(new_cgroup, tsk);  }  /** @@ -1932,15 +1922,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,   */  static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)  { -	struct cpuset_hotplug_scanner scan; +	struct cgroup_scanner scan; -	scan.scan.cg = from->css.cgroup; -	scan.scan.test_task = NULL; /* select all tasks in cgroup */ -	scan.scan.process_task = cpuset_do_move_task; -	scan.scan.heap = NULL; -	scan.to = to->css.cgroup; +	scan.cg = from->css.cgroup; +	scan.test_task = NULL; /* select all tasks in cgroup */ +	scan.process_task = cpuset_do_move_task; +	scan.heap = NULL; +	scan.data = to->css.cgroup; -	if (cgroup_scan_tasks(&scan.scan)) +	if (cgroup_scan_tasks(&scan))  		printk(KERN_ERR "move_member_tasks_to_cpuset: "  				"cgroup_scan_tasks failed\n");  } @@ -2033,7 +2023,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)  			remove_tasks_in_empty_cpuset(cp);  		else {  			update_tasks_cpumask(cp, NULL); -			update_tasks_nodemask(cp, &oldmems); +			update_tasks_nodemask(cp, &oldmems, NULL);  		}  	}  } @@ -2069,7 +2059,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,  	}  	cgroup_lock(); +	mutex_lock(&callback_mutex);  	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); +	mutex_unlock(&callback_mutex);  	scan_for_empty_cpusets(&top_cpuset);  	ndoms = generate_sched_domains(&doms, &attr);  	cgroup_unlock(); @@ -2092,11 +2084,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,  	cgroup_lock();  	switch (action) {  	case MEM_ONLINE: -		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; -		break;  	case MEM_OFFLINE: +		mutex_lock(&callback_mutex);  		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; -		scan_for_empty_cpusets(&top_cpuset); +		mutex_unlock(&callback_mutex); +		if (action == MEM_OFFLINE) +			scan_for_empty_cpusets(&top_cpuset);  		break;  	default:  		break; @@ -2206,26 +2199,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)  }  /** - * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? - * @z: is this zone on an allowed node? + * cpuset_node_allowed_softwall - Can we allocate on a memory node? + * @node: is this an allowed node?   * @gfp_mask: memory allocation flags   * - * If we're in interrupt, yes, we can always allocate.  If - * __GFP_THISNODE is set, yes, we can always allocate.  If zone - * z's node is in our tasks mems_allowed, yes.  If it's not a - * __GFP_HARDWALL request and this zone's nodes is in the nearest - * hardwalled cpuset ancestor to this tasks cpuset, yes. - * If the task has been OOM killed and has access to memory reserves - * as specified by the TIF_MEMDIE flag, yes. + * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is + * set, yes, we can always allocate.  
If node is in our task's mems_allowed, + * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest + * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has been + * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE + * flag, yes.   * Otherwise, no.   * - * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() - * reduces to cpuset_zone_allowed_hardwall().  Otherwise, - * cpuset_zone_allowed_softwall() might sleep, and might allow a zone - * from an enclosing cpuset. + * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to + * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall() + * might sleep, and might allow a node from an enclosing cpuset.   * - * cpuset_zone_allowed_hardwall() only handles the simpler case of - * hardwall cpusets, and never sleeps. + * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall + * cpusets, and never sleeps.   *   * The __GFP_THISNODE placement logic is really handled elsewhere,   * by forcibly using a zonelist starting at a specified node, and by @@ -2264,20 +2255,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)   *	GFP_USER     - only nodes in current tasks mems allowed ok.   *   * Rule: - *    Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you + *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you   *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables   *    the code that might scan up ancestor cpusets and sleep.   */ - -int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)  { -	int node;			/* node that zone z is on */  	const struct cpuset *cs;	/* current cpuset ancestors */  	int allowed;			/* is allocation in zone z allowed? */  	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))  		return 1; -	node = zone_to_nid(z);  	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));  	if (node_isset(node, current->mems_allowed))  		return 1; @@ -2306,15 +2294,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)  }  /* - * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? - * @z: is this zone on an allowed node? + * cpuset_node_allowed_hardwall - Can we allocate on a memory node? + * @node: is this an allowed node?   * @gfp_mask: memory allocation flags   * - * If we're in interrupt, yes, we can always allocate. - * If __GFP_THISNODE is set, yes, we can always allocate.  If zone - * z's node is in our tasks mems_allowed, yes.   If the task has been - * OOM killed and has access to memory reserves as specified by the - * TIF_MEMDIE flag, yes.  Otherwise, no. + * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is + * set, yes, we can always allocate.  If node is in our task's mems_allowed, + * yes.  If the task has been OOM killed and has access to memory reserves as + * specified by the TIF_MEMDIE flag, yes. + * Otherwise, no.   *   * The __GFP_THISNODE placement logic is really handled elsewhere,   * by forcibly using a zonelist starting at a specified node, and by @@ -2322,20 +2310,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)   * any node on the zonelist except the first.  By the time any such   * calls get to this routine, we should just shut up and say 'yes'.   
* - * Unlike the cpuset_zone_allowed_softwall() variant, above, - * this variant requires that the zone be in the current tasks + * Unlike the cpuset_node_allowed_softwall() variant, above, + * this variant requires that the node be in the current task's   * mems_allowed or that we're in interrupt.  It does not scan up the   * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.   * It never sleeps.   */ - -int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)  { -	int node;			/* node that zone z is on */ -  	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))  		return 1; -	node = zone_to_nid(z);  	if (node_isset(node, current->mems_allowed))  		return 1;  	/* diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 667c841c295..c35452cadde 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -18,6 +18,7 @@  #include <linux/syscalls.h>  #include <linux/sysctl.h>  #include <linux/types.h> +#include <linux/fs_struct.h>  static void default_handler(int, struct pt_regs *); @@ -145,28 +146,6 @@ __set_personality(u_long personality)  		return 0;  	} -	if (atomic_read(¤t->fs->count) != 1) { -		struct fs_struct *fsp, *ofsp; - -		fsp = copy_fs_struct(current->fs); -		if (fsp == NULL) { -			module_put(ep->module); -			return -ENOMEM; -		} - -		task_lock(current); -		ofsp = current->fs; -		current->fs = fsp; -		task_unlock(current); - -		put_fs_struct(ofsp); -	} - -	/* -	 * At that point we are guaranteed to be the sole owner of -	 * current->fs. -	 */ -  	current->personality = personality;  	oep = current_thread_info()->exec_domain;  	current_thread_info()->exec_domain = ep; diff --git a/kernel/exit.c b/kernel/exit.c index efd30ccf385..6686ed1e4aa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@  #include <linux/blkdev.h>  #include <linux/task_io_accounting_ops.h>  #include <linux/tracehook.h> +#include <linux/fs_struct.h>  #include <linux/init_task.h>  #include <trace/sched.h> @@ -61,11 +62,6 @@ DEFINE_TRACE(sched_process_wait);  static void exit_mm(struct task_struct * tsk); -static inline int task_detached(struct task_struct *p) -{ -	return p->exit_signal == -1; -} -  static void __unhash_process(struct task_struct *p)  {  	nr_threads--; @@ -362,16 +358,12 @@ static void reparent_to_kthreadd(void)  void __set_special_pids(struct pid *pid)  {  	struct task_struct *curr = current->group_leader; -	pid_t nr = pid_nr(pid); -	if (task_session(curr) != pid) { +	if (task_session(curr) != pid)  		change_pid(curr, PIDTYPE_SID, pid); -		set_task_session(curr, nr); -	} -	if (task_pgrp(curr) != pid) { + +	if (task_pgrp(curr) != pid)  		change_pid(curr, PIDTYPE_PGID, pid); -		set_task_pgrp(curr, nr); -	}  }  static void set_special_pids(struct pid *pid) @@ -429,7 +421,6 @@ EXPORT_SYMBOL(disallow_signal);  void daemonize(const char *name, ...)  {  	va_list args; -	struct fs_struct *fs;  	sigset_t blocked;  	va_start(args, name); @@ -462,11 +453,7 @@ void daemonize(const char *name, ...)  	
/* Become as one with the init task */ -	exit_fs(current);	/* current->fs->count--; */ -	fs = init_task.fs; -	current->fs = fs; -	atomic_inc(&fs->count); - +	daemonize_fs_struct();  	exit_files(current);  	current->files = init_task.files;  	atomic_inc(¤t->files->count); @@ -565,30 +552,6 @@ void exit_files(struct task_struct *tsk)  	}  } -void put_fs_struct(struct fs_struct *fs) -{ -	/* No need to hold fs->lock if we are killing it */ -	if (atomic_dec_and_test(&fs->count)) { -		path_put(&fs->root); -		path_put(&fs->pwd); -		kmem_cache_free(fs_cachep, fs); -	} -} - -void exit_fs(struct task_struct *tsk) -{ -	struct fs_struct * fs = tsk->fs; - -	if (fs) { -		task_lock(tsk); -		tsk->fs = NULL; -		task_unlock(tsk); -		put_fs_struct(fs); -	} -} - -EXPORT_SYMBOL_GPL(exit_fs); -  #ifdef CONFIG_MM_OWNER  /*   * Task p is exiting and it owned mm, lets find a new owner for it @@ -732,119 +695,6 @@ static void exit_mm(struct task_struct * tsk)  }  /* - * Return nonzero if @parent's children should reap themselves. - * - * Called with write_lock_irq(&tasklist_lock) held. - */ -static int ignoring_children(struct task_struct *parent) -{ -	int ret; -	struct sighand_struct *psig = parent->sighand; -	unsigned long flags; -	spin_lock_irqsave(&psig->siglock, flags); -	ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || -	       (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT)); -	spin_unlock_irqrestore(&psig->siglock, flags); -	return ret; -} - -/* - * Detach all tasks we were using ptrace on. - * Any that need to be release_task'd are put on the @dead list. - * - * Called with write_lock(&tasklist_lock) held. - */ -static void ptrace_exit(struct task_struct *parent, struct list_head *dead) -{ -	struct task_struct *p, *n; -	int ign = -1; - -	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { -		__ptrace_unlink(p); - -		if (p->exit_state != EXIT_ZOMBIE) -			continue; - -		/* -		 * If it's a zombie, our attachedness prevented normal -		 * parent notification or self-reaping.  Do notification -		 * now if it would have happened earlier.  If it should -		 * reap itself, add it to the @dead list.  We can't call -		 * release_task() here because we already hold tasklist_lock. -		 * -		 * If it's our own child, there is no notification to do. -		 * But if our normal children self-reap, then this child -		 * was prevented by ptrace and we must reap it now. -		 */ -		if (!task_detached(p) && thread_group_empty(p)) { -			if (!same_thread_group(p->real_parent, parent)) -				do_notify_parent(p, p->exit_signal); -			else { -				if (ign < 0) -					ign = ignoring_children(parent); -				if (ign) -					p->exit_signal = -1; -			} -		} - -		if (task_detached(p)) { -			/* -			 * Mark it as in the process of being reaped. -			 */ -			p->exit_state = EXIT_DEAD; -			list_add(&p->ptrace_entry, dead); -		} -	} -} - -/* - * Finish up exit-time ptrace cleanup. - * - * Called without locks. - */ -static void ptrace_exit_finish(struct task_struct *parent, -			       struct list_head *dead) -{ -	struct task_struct *p, *n; - -	BUG_ON(!list_empty(&parent->ptraced)); - -	list_for_each_entry_safe(p, n, dead, ptrace_entry) { -		list_del_init(&p->ptrace_entry); -		release_task(p); -	} -} - -static void reparent_thread(struct task_struct *p, struct task_struct *father) -{ -	if (p->pdeath_signal) -		/* We already hold the tasklist_lock here.  
*/ -		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); - -	list_move_tail(&p->sibling, &p->real_parent->children); - -	/* If this is a threaded reparent there is no need to -	 * notify anyone anything has happened. -	 */ -	if (same_thread_group(p->real_parent, father)) -		return; - -	/* We don't want people slaying init.  */ -	if (!task_detached(p)) -		p->exit_signal = SIGCHLD; - -	/* If we'd notified the old parent about this child's death, -	 * also notify the new parent. -	 */ -	if (!ptrace_reparented(p) && -	    p->exit_state == EXIT_ZOMBIE && -	    !task_detached(p) && thread_group_empty(p)) -		do_notify_parent(p, p->exit_signal); - -	kill_orphaned_pgrp(p, father); -} - -/*   * When we die, we re-parent all our children.   * Try to give them to another thread in our thread   * group, and if no such member exists, give it to @@ -883,17 +733,51 @@ static struct task_struct *find_new_reaper(struct task_struct *father)  	return pid_ns->child_reaper;  } +/* +* Any that need to be release_task'd are put on the @dead list. + */ +static void reparent_thread(struct task_struct *father, struct task_struct *p, +				struct list_head *dead) +{ +	if (p->pdeath_signal) +		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + +	list_move_tail(&p->sibling, &p->real_parent->children); + +	if (task_detached(p)) +		return; +	/* +	 * If this is a threaded reparent there is no need to +	 * notify anyone anything has happened. +	 */ +	if (same_thread_group(p->real_parent, father)) +		return; + +	/* We don't want people slaying init.  */ +	p->exit_signal = SIGCHLD; + +	/* If it has exited notify the new parent about this child's death. */ +	if (!p->ptrace && +	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { +		do_notify_parent(p, p->exit_signal); +		if (task_detached(p)) { +			p->exit_state = EXIT_DEAD; +			list_move_tail(&p->sibling, dead); +		} +	} + +	kill_orphaned_pgrp(p, father); +} +  static void forget_original_parent(struct task_struct *father)  {  	struct task_struct *p, *n, *reaper; -	LIST_HEAD(ptrace_dead); +	LIST_HEAD(dead_children); + +	exit_ptrace(father);  	write_lock_irq(&tasklist_lock);  	reaper = find_new_reaper(father); -	/* -	 * First clean up ptrace if we were using it. 
-	 */ -	ptrace_exit(father, &ptrace_dead);  	list_for_each_entry_safe(p, n, &father->children, sibling) {  		p->real_parent = reaper; @@ -901,13 +785,16 @@ static void forget_original_parent(struct task_struct *father)  			BUG_ON(p->ptrace);  			p->parent = p->real_parent;  		} -		reparent_thread(p, father); +		reparent_thread(father, p, &dead_children);  	} -  	write_unlock_irq(&tasklist_lock); +  	BUG_ON(!list_empty(&father->children)); -	ptrace_exit_finish(father, &ptrace_dead); +	list_for_each_entry_safe(p, n, &dead_children, sibling) { +		list_del_init(&p->sibling); +		release_task(p); +	}  }  /* @@ -980,12 +867,9 @@ static void check_stack_usage(void)  {  	static DEFINE_SPINLOCK(low_water_lock);  	static int lowest_to_date = THREAD_SIZE; -	unsigned long *n = end_of_stack(current);  	unsigned long free; -	while (*n == 0) -		n++; -	free = (unsigned long)n - (unsigned long)end_of_stack(current); +	free = stack_not_used(current);  	if (free >= lowest_to_date)  		return; @@ -1420,6 +1304,18 @@ static int wait_task_zombie(struct task_struct *p, int options,  	return retval;  } +static int *task_stopped_code(struct task_struct *p, bool ptrace) +{ +	if (ptrace) { +		if (task_is_stopped_or_traced(p)) +			return &p->exit_code; +	} else { +		if (p->signal->flags & SIGNAL_STOP_STOPPED) +			return &p->signal->group_exit_code; +	} +	return NULL; +} +  /*   * Handle sys_wait4 work for one task in state TASK_STOPPED.  We hold   * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold @@ -1430,7 +1326,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,  			     int options, struct siginfo __user *infop,  			     int __user *stat_addr, struct rusage __user *ru)  { -	int retval, exit_code, why; +	int retval, exit_code, *p_code, why;  	uid_t uid = 0; /* unneeded, required by compiler */  	pid_t pid; @@ -1440,22 +1336,16 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,  	exit_code = 0;  	spin_lock_irq(&p->sighand->siglock); -	if (unlikely(!task_is_stopped_or_traced(p))) -		goto unlock_sig; - -	if (!ptrace && p->signal->group_stop_count > 0) -		/* -		 * A group stop is in progress and this is the group leader. -		 * We won't report until all threads have stopped. 
-		 */ +	p_code = task_stopped_code(p, ptrace); +	if (unlikely(!p_code))  		goto unlock_sig; -	exit_code = p->exit_code; +	exit_code = *p_code;  	if (!exit_code)  		goto unlock_sig;  	if (!unlikely(options & WNOWAIT)) -		p->exit_code = 0; +		*p_code = 0;  	/* don't need the RCU readlock here as we're holding a spinlock */  	uid = __task_cred(p)->uid; @@ -1611,7 +1501,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,  	 */  	*notask_error = 0; -	if (task_is_stopped_or_traced(p)) +	if (task_stopped_code(p, ptrace))  		return wait_task_stopped(ptrace, p, options,  					 infop, stat_addr, ru); @@ -1815,7 +1705,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,  		pid = find_get_pid(-upid);  	} else if (upid == 0) {  		type = PIDTYPE_PGID; -		pid = get_pid(task_pgrp(current)); +		pid = get_task_pid(current, PIDTYPE_PGID);  	} else /* upid > 0 */ {  		type = PIDTYPE_PID;  		pid = find_get_pid(upid); diff --git a/kernel/extable.c b/kernel/extable.c index e136ed8d82b..c46da6a4703 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,6 +41,14 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)  	return e;  } +static inline int init_kernel_text(unsigned long addr) +{ +	if (addr >= (unsigned long)_sinittext && +	    addr <= (unsigned long)_einittext) +		return 1; +	return 0; +} +  __notrace_funcgraph int core_kernel_text(unsigned long addr)  {  	if (addr >= (unsigned long)_stext && @@ -48,8 +56,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr)  		return 1;  	if (system_state == SYSTEM_BOOTING && -	    addr >= (unsigned long)_sinittext && -	    addr <= (unsigned long)_einittext) +	    init_kernel_text(addr))  		return 1;  	return 0;  } @@ -58,7 +65,19 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr)  {  	if (core_kernel_text(addr))  		return 1; -	return __module_text_address(addr) != NULL; +	if (__module_text_address(addr)) +		return 1; +	/* +	 * There might be init symbols in saved stacktraces. +	 * Give those symbols a chance to be printed in +	 * backtraces (such as lockdep traces). 
+	 * +	 * Since we are after the module-symbols check, there's +	 * no danger of address overlap: +	 */ +	if (init_kernel_text(addr)) +		return 1; +	return 0;  }  int kernel_text_address(unsigned long addr) diff --git a/kernel/fork.c b/kernel/fork.c index a66fbde2071..660c2b8765b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -60,7 +60,9 @@  #include <linux/tty.h>  #include <linux/proc_fs.h>  #include <linux/blkdev.h> +#include <linux/fs_struct.h>  #include <trace/sched.h> +#include <linux/magic.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h> @@ -212,6 +214,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)  {  	struct task_struct *tsk;  	struct thread_info *ti; +	unsigned long *stackend; +  	int err;  	prepare_to_copy(orig); @@ -237,6 +241,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)  		goto out;  	setup_thread_stack(tsk, orig); +	stackend = end_of_stack(tsk); +	*stackend = STACK_END_MAGIC;	/* for overflow detection */  #ifdef CONFIG_CC_STACKPROTECTOR  	tsk->stack_canary = get_random_int(); @@ -279,7 +285,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)  	mm->free_area_cache = oldmm->mmap_base;  	mm->cached_hole_size = ~0UL;  	mm->map_count = 0; -	cpus_clear(mm->cpu_vm_mask); +	cpumask_clear(mm_cpumask(mm));  	mm->mm_rb = RB_ROOT;  	rb_link = &mm->mm_rb.rb_node;  	rb_parent = NULL; @@ -676,38 +682,21 @@ fail_nomem:  	return retval;  } -static struct fs_struct *__copy_fs_struct(struct fs_struct *old) -{ -	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); -	/* We don't need to lock fs - think why ;-) */ -	if (fs) { -		atomic_set(&fs->count, 1); -		rwlock_init(&fs->lock); -		fs->umask = old->umask; -		read_lock(&old->lock); -		fs->root = old->root; -		path_get(&old->root); -		fs->pwd = old->pwd; -		path_get(&old->pwd); -		read_unlock(&old->lock); -	} -	return fs; -} - -struct fs_struct *copy_fs_struct(struct fs_struct *old) -{ -	return __copy_fs_struct(old); -} - -EXPORT_SYMBOL_GPL(copy_fs_struct); -  static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)  { +	struct fs_struct *fs = current->fs;  	if (clone_flags & CLONE_FS) { -		atomic_inc(¤t->fs->count); +		/* tsk->fs is already what we want */ +		write_lock(&fs->lock); +		if (fs->in_exec) { +			write_unlock(&fs->lock); +			return -EAGAIN; +		} +		fs->users++; +		write_unlock(&fs->lock);  		return 0;  	} -	tsk->fs = __copy_fs_struct(current->fs); +	tsk->fs = copy_fs_struct(fs);  	if (!tsk->fs)  		return -ENOMEM;  	return 0; @@ -836,6 +825,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  	atomic_set(&sig->live, 1);  	init_waitqueue_head(&sig->wait_chldexit);  	sig->flags = 0; +	if (clone_flags & CLONE_NEWPID) +		sig->flags |= SIGNAL_UNKILLABLE;  	sig->group_exit_code = 0;  	sig->group_exit_task = NULL;  	sig->group_stop_count = 0; @@ -1120,7 +1111,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  		goto bad_fork_cleanup_mm;  	if ((retval = copy_io(clone_flags, p)))  		goto bad_fork_cleanup_namespaces; -	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); +	retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);  	if (retval)  		goto bad_fork_cleanup_io; @@ -1179,10 +1170,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,  #endif  	clear_all_latency_tracing(p); -	/* Our parent execution domain becomes current domain -	   These must match for thread signalling to apply */ -	p->parent_exec_id = p->self_exec_id; -  	/* ok, 
now we should be set up.. */  	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);  	p->pdeath_signal = 0; @@ -1220,10 +1207,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,  		set_task_cpu(p, smp_processor_id());  	/* CLONE_PARENT re-uses the old parent */ -	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) +	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {  		p->real_parent = current->real_parent; -	else +		p->parent_exec_id = current->parent_exec_id; +	} else {  		p->real_parent = current; +		p->parent_exec_id = current->self_exec_id; +	}  	spin_lock(¤t->sighand->siglock); @@ -1259,8 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,  			p->signal->leader_pid = pid;  			tty_kref_put(p->signal->tty);  			p->signal->tty = tty_kref_get(current->signal->tty); -			set_task_pgrp(p, task_pgrp_nr(current)); -			set_task_session(p, task_session_nr(current));  			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));  			attach_pid(p, PIDTYPE_SID, task_session(current));  			list_add_tail_rcu(&p->tasks, &init_task.tasks); @@ -1484,6 +1472,7 @@ void __init proc_caches_init(void)  	mm_cachep = kmem_cache_create("mm_struct",  			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,  			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);  	mmap_init();  } @@ -1539,12 +1528,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)  {  	struct fs_struct *fs = current->fs; -	if ((unshare_flags & CLONE_FS) && -	    (fs && atomic_read(&fs->count) > 1)) { -		*new_fsp = __copy_fs_struct(current->fs); -		if (!*new_fsp) -			return -ENOMEM; -	} +	if (!(unshare_flags & CLONE_FS) || !fs) +		return 0; + +	/* don't need lock here; in the worst case we'll do useless copy */ +	if (fs->users == 1) +		return 0; + +	*new_fsp = copy_fs_struct(fs); +	if (!*new_fsp) +		return -ENOMEM;  	return 0;  } @@ -1660,8 +1653,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)  		if (new_fs) {  			fs = current->fs; +			write_lock(&fs->lock);  			current->fs = new_fs; -			new_fs = fs; +			if (--fs->users) +				new_fs = NULL; +			else +				new_fs = fs; +			write_unlock(&fs->lock);  		}  		if (new_mm) { @@ -1700,7 +1698,7 @@ bad_unshare_cleanup_sigh:  bad_unshare_cleanup_fs:  	if (new_fs) -		put_fs_struct(new_fs); +		free_fs_struct(new_fs);  bad_unshare_cleanup_thread:  bad_unshare_out: diff --git a/kernel/futex.c b/kernel/futex.c index 438701adce2..6b50a024bca 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -114,7 +114,9 @@ struct futex_q {  };  /* - * Split the global futex_lock into every hash list lock. + * Hash buckets are shared by all the futex_keys that hash to the same + * location.  Each key may have multiple futex_q structures, one for each task + * waiting on a futex.   */  struct futex_hash_bucket {  	spinlock_t lock; @@ -189,8 +191,7 @@ static void drop_futex_key_refs(union futex_key *key)  /**   * get_futex_key - Get parameters which are the keys for a futex.   * @uaddr: virtual address of the futex - * @shared: NULL for a PROCESS_PRIVATE futex, - *	¤t->mm->mmap_sem for a PROCESS_SHARED futex + * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED   * @key: address where result is stored.   *   * Returns a negative error code or 0 @@ -200,9 +201,7 @@ static void drop_futex_key_refs(union futex_key *key)   * offset_within_page).  For private mappings, it's (uaddr, current->mm).   * We can usually work out the index without swapping in the page.   
* - * fshared is NULL for PROCESS_PRIVATE futexes - * For other futexes, it points to ¤t->mm->mmap_sem and - * caller must have taken the reader lock. but NOT any spinlocks. + * lock_page() might sleep, the caller should not hold a spinlock.   */  static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)  { @@ -299,41 +298,6 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)  	return ret ? -EFAULT : 0;  } -/* - * Fault handling. - */ -static int futex_handle_fault(unsigned long address, int attempt) -{ -	struct vm_area_struct * vma; -	struct mm_struct *mm = current->mm; -	int ret = -EFAULT; - -	if (attempt > 2) -		return ret; - -	down_read(&mm->mmap_sem); -	vma = find_vma(mm, address); -	if (vma && address >= vma->vm_start && -	    (vma->vm_flags & VM_WRITE)) { -		int fault; -		fault = handle_mm_fault(mm, vma, address, 1); -		if (unlikely((fault & VM_FAULT_ERROR))) { -#if 0 -			/* XXX: let's do this when we verify it is OK */ -			if (ret & VM_FAULT_OOM) -				ret = -ENOMEM; -#endif -		} else { -			ret = 0; -			if (fault & VM_FAULT_MAJOR) -				current->maj_flt++; -			else -				current->min_flt++; -		} -	} -	up_read(&mm->mmap_sem); -	return ret; -}  /*   * PI code: @@ -589,10 +553,9 @@ static void wake_futex(struct futex_q *q)  	 * The waiting task can free the futex_q as soon as this is written,  	 * without taking any locks.  This must come last.  	 * -	 * A memory barrier is required here to prevent the following store -	 * to lock_ptr from getting ahead of the wakeup. Clearing the lock -	 * at the end of wake_up_all() does not prevent this store from -	 * moving. +	 * A memory barrier is required here to prevent the following store to +	 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the +	 * end of wake_up() does not prevent this store from moving.  	 */  	smp_wmb();  	q->lock_ptr = NULL; @@ -692,9 +655,16 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)  	}  } +static inline void +double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +{ +	spin_unlock(&hb1->lock); +	if (hb1 != hb2) +		spin_unlock(&hb2->lock); +} +  /* - * Wake up all waiters hashed on the physical page that is mapped - * to this virtual address: + * Wake up waiters matching bitset queued on this futex (uaddr).   */  static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)  { @@ -750,9 +720,9 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,  	struct futex_hash_bucket *hb1, *hb2;  	struct plist_head *head;  	struct futex_q *this, *next; -	int ret, op_ret, attempt = 0; +	int ret, op_ret; -retryfull: +retry:  	ret = get_futex_key(uaddr1, fshared, &key1);  	if (unlikely(ret != 0))  		goto out; @@ -763,16 +733,13 @@ retryfull:  	hb1 = hash_futex(&key1);  	hb2 = hash_futex(&key2); -retry:  	double_lock_hb(hb1, hb2); - +retry_private:  	op_ret = futex_atomic_op_inuser(op, uaddr2);  	if (unlikely(op_ret < 0)) {  		u32 dummy; -		spin_unlock(&hb1->lock); -		if (hb1 != hb2) -			spin_unlock(&hb2->lock); +		double_unlock_hb(hb1, hb2);  #ifndef CONFIG_MMU  		/* @@ -788,26 +755,16 @@ retry:  			goto out_put_keys;  		} -		/* -		 * futex_atomic_op_inuser needs to both read and write -		 * *(int __user *)uaddr2, but we can't modify it -		 * non-atomically.  Therefore, if get_user below is not -		 * enough, we need to handle the fault ourselves, while -		 * still holding the mmap_sem. 
-		 */ -		if (attempt++) { -			ret = futex_handle_fault((unsigned long)uaddr2, -						 attempt); -			if (ret) -				goto out_put_keys; -			goto retry; -		} -  		ret = get_user(dummy, uaddr2);  		if (ret) -			return ret; +			goto out_put_keys; + +		if (!fshared) +			goto retry_private; -		goto retryfull; +		put_futex_key(fshared, &key2); +		put_futex_key(fshared, &key1); +		goto retry;  	}  	head = &hb1->chain; @@ -834,9 +791,7 @@ retry:  		ret += op_ret;  	} -	spin_unlock(&hb1->lock); -	if (hb1 != hb2) -		spin_unlock(&hb2->lock); +	double_unlock_hb(hb1, hb2);  out_put_keys:  	put_futex_key(fshared, &key2);  out_put_key1: @@ -869,6 +824,7 @@ retry:  	hb1 = hash_futex(&key1);  	hb2 = hash_futex(&key2); +retry_private:  	double_lock_hb(hb1, hb2);  	if (likely(cmpval != NULL)) { @@ -877,16 +833,18 @@ retry:  		ret = get_futex_value_locked(&curval, uaddr1);  		if (unlikely(ret)) { -			spin_unlock(&hb1->lock); -			if (hb1 != hb2) -				spin_unlock(&hb2->lock); +			double_unlock_hb(hb1, hb2);  			ret = get_user(curval, uaddr1); +			if (ret) +				goto out_put_keys; -			if (!ret) -				goto retry; +			if (!fshared) +				goto retry_private; -			goto out_put_keys; +			put_futex_key(fshared, &key2); +			put_futex_key(fshared, &key1); +			goto retry;  		}  		if (curval != *cmpval) {  			ret = -EAGAIN; @@ -923,9 +881,7 @@ retry:  	}  out_unlock: -	spin_unlock(&hb1->lock); -	if (hb1 != hb2) -		spin_unlock(&hb2->lock); +	double_unlock_hb(hb1, hb2);  	/* drop_futex_key_refs() must be called outside the spinlocks. */  	while (--drop_count >= 0) @@ -1063,7 +1019,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,  	struct futex_pi_state *pi_state = q->pi_state;  	struct task_struct *oldowner = pi_state->owner;  	u32 uval, curval, newval; -	int ret, attempt = 0; +	int ret;  	/* Owner died? */  	if (!pi_state->owner) @@ -1076,11 +1032,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,  	 * in the user space variable. This must be atomic as we have  	 * to preserve the owner died bit here.  	 * -	 * Note: We write the user space value _before_ changing the -	 * pi_state because we can fault here. Imagine swapped out -	 * pages or a fork, which was running right before we acquired -	 * mmap_sem, that marked all the anonymous memory readonly for -	 * cow. +	 * Note: We write the user space value _before_ changing the pi_state +	 * because we can fault here. Imagine swapped out pages or a fork +	 * that marked all the anonymous memory readonly for cow.  	 *  	 * Modifying pi_state _before_ the user space value would  	 * leave the pi_state in an inconsistent state when we fault @@ -1136,7 +1090,7 @@ retry:  handle_fault:  	spin_unlock(q->lock_ptr); -	ret = futex_handle_fault((unsigned long)uaddr, attempt++); +	ret = get_user(uval, uaddr);  	spin_lock(q->lock_ptr); @@ -1185,10 +1139,11 @@ retry:  	if (unlikely(ret != 0))  		goto out; +retry_private:  	hb = queue_lock(&q);  	/* -	 * Access the page AFTER the futex is queued. +	 * Access the page AFTER the hash-bucket is locked.  	 * Order is important:  	 *  	 *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); @@ -1204,20 +1159,23 @@ retry:  	 * a wakeup when *uaddr != val on entry to the syscall.  This is  	 * rare, but normal.  	 * -	 * for shared futexes, we hold the mmap semaphore, so the mapping +	 * For shared futexes, we hold the mmap semaphore, so the mapping  	 * cannot have changed since we looked it up in get_futex_key.  	 
*/  	ret = get_futex_value_locked(&uval, uaddr);  	if (unlikely(ret)) {  		queue_unlock(&q, hb); -		put_futex_key(fshared, &q.key);  		ret = get_user(uval, uaddr); +		if (ret) +			goto out_put_key; -		if (!ret) -			goto retry; -		goto out; +		if (!fshared) +			goto retry_private; + +		put_futex_key(fshared, &q.key); +		goto retry;  	}  	ret = -EWOULDBLOCK;  	if (unlikely(uval != val)) { @@ -1248,16 +1206,13 @@ retry:  		if (!abs_time)  			schedule();  		else { -			unsigned long slack; -			slack = current->timer_slack_ns; -			if (rt_task(current)) -				slack = 0;  			hrtimer_init_on_stack(&t.timer,  					      clockrt ? CLOCK_REALTIME :  					      CLOCK_MONOTONIC,  					      HRTIMER_MODE_ABS);  			hrtimer_init_sleeper(&t, current); -			hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); +			hrtimer_set_expires_range_ns(&t.timer, *abs_time, +						     current->timer_slack_ns);  			hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);  			if (!hrtimer_active(&t.timer)) @@ -1354,7 +1309,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,  	struct futex_hash_bucket *hb;  	u32 uval, newval, curval;  	struct futex_q q; -	int ret, lock_taken, ownerdied = 0, attempt = 0; +	int ret, lock_taken, ownerdied = 0;  	if (refill_pi_state_cache())  		return -ENOMEM; @@ -1374,7 +1329,7 @@ retry:  	if (unlikely(ret != 0))  		goto out; -retry_unlocked: +retry_private:  	hb = queue_lock(&q);  retry_locked: @@ -1458,6 +1413,7 @@ retry_locked:  			 * exit to complete.  			 */  			queue_unlock(&q, hb); +			put_futex_key(fshared, &q.key);  			cond_resched();  			goto retry; @@ -1564,6 +1520,13 @@ retry_locked:  		}  	} +	/* +	 * If fixup_pi_state_owner() faulted and was unable to handle the +	 * fault, unlock it and return the fault to userspace. +	 */ +	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) +		rt_mutex_unlock(&q.pi_state->pi_mutex); +  	/* Unqueue and drop the lock */  	unqueue_me_pi(&q); @@ -1591,22 +1554,18 @@ uaddr_faulted:  	 */  	queue_unlock(&q, hb); -	if (attempt++) { -		ret = futex_handle_fault((unsigned long)uaddr, attempt); -		if (ret) -			goto out_put_key; -		goto retry_unlocked; -	} -  	ret = get_user(uval, uaddr); -	if (!ret) -		goto retry; +	if (ret) +		goto out_put_key; -	if (to) -		destroy_hrtimer_on_stack(&to->timer); -	return ret; +	if (!fshared) +		goto retry_private; + +	put_futex_key(fshared, &q.key); +	goto retry;  } +  /*   * Userspace attempted a TID -> 0 atomic transition, and failed.   * This is the in-kernel slowpath: we look up the PI state (if any), @@ -1619,7 +1578,7 @@ static int futex_unlock_pi(u32 __user *uaddr, int fshared)  	u32 uval;  	struct plist_head *head;  	union futex_key key = FUTEX_KEY_INIT; -	int ret, attempt = 0; +	int ret;  retry:  	if (get_user(uval, uaddr)) @@ -1635,7 +1594,6 @@ retry:  		goto out;  	hb = hash_futex(&key); -retry_unlocked:  	spin_lock(&hb->lock);  	/* @@ -1700,14 +1658,7 @@ pi_faulted:  	 * we have to drop the mmap_sem in order to call get_user().  	 
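This is the same fault-handling pattern that every converted path in this patch now uses in place of futex_handle_fault(): drop the hash-bucket lock(s) so it is safe to sleep, fault the page in with get_user(), then restart either at retry_private (private futexes, whose mapping cannot change) or at retry after dropping the keys (shared futexes, whose mapping may have changed). Schematically, distilled from the futex_wake_op() hunk above:

	u32 dummy;

	double_unlock_hb(hb1, hb2);	/* get_user() may sleep, so drop spinlocks */

	ret = get_user(dummy, uaddr2);	/* fault the page in */
	if (ret)
		goto out_put_keys;	/* a real fault: report -EFAULT */

	if (!fshared)
		goto retry_private;	/* private mapping cannot have changed */

	put_futex_key(fshared, &key2);	/* shared mapping may have changed, */
	put_futex_key(fshared, &key1);	/* so re-resolve both keys */
	goto retry;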
*/  	spin_unlock(&hb->lock); - -	if (attempt++) { -		ret = futex_handle_fault((unsigned long)uaddr, attempt); -		if (ret) -			goto out; -		uval = 0; -		goto retry_unlocked; -	} +	put_futex_key(fshared, &key);  	ret = get_user(uval, uaddr);  	if (!ret) diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 4dd5b1edac9..3394f8f5296 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o  obj-$(CONFIG_PROC_FS) += proc.o  obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o  obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o +obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 7de11bd64df..c687ba4363f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq)  	desc->irq_count = 0;  	desc->irqs_unhandled = 0;  #ifdef CONFIG_SMP -	cpumask_setall(&desc->affinity); +	cpumask_setall(desc->affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ +	cpumask_clear(desc->pending_mask); +#endif  #endif  	spin_unlock_irqrestore(&desc->lock, flags);  } @@ -78,6 +81,7 @@ void dynamic_irq_cleanup(unsigned int irq)  	desc->handle_irq = handle_bad_irq;  	desc->chip = &no_irq_chip;  	desc->name = NULL; +	clear_kstat_irqs(desc);  	spin_unlock_irqrestore(&desc->lock, flags);  } @@ -290,7 +294,8 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)  		desc->chip->mask_ack(irq);  	else {  		desc->chip->mask(irq); -		desc->chip->ack(irq); +		if (desc->chip->ack) +			desc->chip->ack(irq);  	}  } @@ -476,7 +481,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)  	kstat_incr_irqs_this_cpu(irq, desc);  	/* Start handling the irq */ -	desc->chip->ack(irq); +	if (desc->chip->ack) +		desc->chip->ack(irq);  	desc = irq_remap_to_desc(irq, desc);  	/* Mark the IRQ currently in progress.*/ diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3aba8d12f32..9ebf7796887 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,6 +17,7 @@  #include <linux/kernel_stat.h>  #include <linux/rculist.h>  #include <linux/hash.h> +#include <linux/bootmem.h>  #include "internals.h" @@ -69,6 +70,7 @@ int nr_irqs = NR_IRQS;  EXPORT_SYMBOL_GPL(nr_irqs);  #ifdef CONFIG_SPARSE_IRQ +  static struct irq_desc irq_desc_init = {  	.irq	    = -1,  	.status	    = IRQ_DISABLED, @@ -76,26 +78,25 @@ static struct irq_desc irq_desc_init = {  	.handle_irq = handle_bad_irq,  	.depth      = 1,  	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP -	.affinity   = CPU_MASK_ALL -#endif  };  void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)  { -	unsigned long bytes; -	char *ptr;  	int node; - -	/* Compute how many bytes we need per irq and allocate them */ -	bytes = nr * sizeof(unsigned int); +	void *ptr;  	node = cpu_to_node(cpu); -	ptr = kzalloc_node(bytes, GFP_ATOMIC, node); -	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node); +	ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); -	if (ptr) -		desc->kstat_irqs = (unsigned int *)ptr; +	/* +	 * don't overwite if can not get new one +	 * init_copy_kstat_irqs() could still use old one +	 */ +	if (ptr) { +		printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", +			 cpu, node); +		desc->kstat_irqs = ptr; +	}  }  static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) @@ -113,6 +114,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)  		printk(KERN_ERR "can not alloc kstat_irqs\n");  		BUG_ON(1);  	} +	if 
(!init_alloc_desc_masks(desc, cpu, false)) { +		printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); +		BUG_ON(1); +	}  	arch_init_chip_data(desc, cpu);  } @@ -121,7 +126,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)   */  DEFINE_SPINLOCK(sparse_irq_lock); -struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; +struct irq_desc **irq_desc_ptrs __read_mostly;  static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {  	[0 ... NR_IRQS_LEGACY-1] = { @@ -131,14 +136,10 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm  		.handle_irq = handle_bad_irq,  		.depth	    = 1,  		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP -		.affinity   = CPU_MASK_ALL -#endif  	}  }; -/* FIXME: use bootmem alloc ...*/ -static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS]; +static unsigned int *kstat_irqs_legacy;  int __init early_irq_init(void)  { @@ -148,18 +149,30 @@ int __init early_irq_init(void)  	init_irq_default_affinity(); +	 /* initialize nr_irqs based on nr_cpu_ids */ +	arch_probe_nr_irqs(); +	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); +  	desc = irq_desc_legacy;  	legacy_count = ARRAY_SIZE(irq_desc_legacy); +	/* allocate irq_desc_ptrs array based on nr_irqs */ +	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); + +	/* allocate based on nr_cpu_ids */ +	/* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ +	kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * +					  sizeof(int)); +  	for (i = 0; i < legacy_count; i++) {  		desc[i].irq = i; -		desc[i].kstat_irqs = kstat_irqs_legacy[i]; +		desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;  		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - +		init_alloc_desc_masks(&desc[i], 0, true);  		irq_desc_ptrs[i] = desc + i;  	} -	for (i = legacy_count; i < NR_IRQS; i++) +	for (i = legacy_count; i < nr_irqs; i++)  		irq_desc_ptrs[i] = NULL;  	return arch_early_irq_init(); @@ -167,7 +180,10 @@ int __init early_irq_init(void)  struct irq_desc *irq_to_desc(unsigned int irq)  { -	return (irq < NR_IRQS) ? 
irq_desc_ptrs[irq] : NULL; +	if (irq_desc_ptrs && irq < nr_irqs) +		return irq_desc_ptrs[irq]; + +	return NULL;  }  struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) @@ -176,10 +192,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)  	unsigned long flags;  	int node; -	if (irq >= NR_IRQS) { -		printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", -				irq, NR_IRQS); -		WARN_ON(1); +	if (irq >= nr_irqs) { +		WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", +			irq, nr_irqs);  		return NULL;  	} @@ -221,12 +236,10 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {  		.handle_irq = handle_bad_irq,  		.depth = 1,  		.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), -#ifdef CONFIG_SMP -		.affinity = CPU_MASK_ALL -#endif  	}  }; +static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];  int __init early_irq_init(void)  {  	struct irq_desc *desc; @@ -235,12 +248,16 @@ int __init early_irq_init(void)  	init_irq_default_affinity(); +	printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); +  	desc = irq_desc;  	count = ARRAY_SIZE(irq_desc); -	for (i = 0; i < count; i++) +	for (i = 0; i < count; i++) {  		desc[i].irq = i; - +		init_alloc_desc_masks(&desc[i], 0, true); +		desc[i].kstat_irqs = kstat_irqs_all[i]; +	}  	return arch_early_irq_init();  } @@ -255,6 +272,11 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)  }  #endif /* !CONFIG_SPARSE_IRQ */ +void clear_kstat_irqs(struct irq_desc *desc) +{ +	memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); +} +  /*   * What should we do if we get a hw irq event on an illegal vector?   * Each architecture has to answer this themself. @@ -328,6 +350,8 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)  	irqreturn_t ret, retval = IRQ_NONE;  	unsigned int status = 0; +	WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!"); +  	if (!(action->flags & IRQF_DISABLED))  		local_irq_enable_in_hardirq(); @@ -347,6 +371,11 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)  }  #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ + +#ifdef CONFIG_ENABLE_WARN_DEPRECATED +# warning __do_IRQ is deprecated. Please convert to proper flow handlers +#endif +  /**   * __do_IRQ - original all in one highlevel IRQ handler   * @irq:	the interrupt number @@ -467,12 +496,10 @@ void early_init_irq_lock_class(void)  	}  } -#ifdef CONFIG_SPARSE_IRQ  unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)  {  	struct irq_desc *desc = irq_to_desc(irq);  	return desc ? 
desc->kstat_irqs[cpu] : 0;  } -#endif  EXPORT_SYMBOL(kstat_irqs_cpu); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e6d0a43cc12..01ce20eab38 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -12,11 +12,21 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);  extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,  		unsigned long flags); +extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); +extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);  extern struct lock_class_key irq_desc_lock_class;  extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern void clear_kstat_irqs(struct irq_desc *desc);  extern spinlock_t sparse_irq_lock; + +#ifdef CONFIG_SPARSE_IRQ +/* irq_desc_ptrs allocated at boot time */ +extern struct irq_desc **irq_desc_ptrs; +#else +/* irq_desc_ptrs is a fixed size array */  extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; +#endif  #ifdef CONFIG_PROC_FS  extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 291f0366455..1516ab77355 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -90,14 +90,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)  #ifdef CONFIG_GENERIC_PENDING_IRQ  	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { -		cpumask_copy(&desc->affinity, cpumask); +		cpumask_copy(desc->affinity, cpumask);  		desc->chip->set_affinity(irq, cpumask);  	} else {  		desc->status |= IRQ_MOVE_PENDING; -		cpumask_copy(&desc->pending_mask, cpumask); +		cpumask_copy(desc->pending_mask, cpumask);  	}  #else -	cpumask_copy(&desc->affinity, cpumask); +	cpumask_copy(desc->affinity, cpumask);  	desc->chip->set_affinity(irq, cpumask);  #endif  	desc->status |= IRQ_AFFINITY_SET; @@ -109,7 +109,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)  /*   * Generic version of the affinity autoselector.   */ -int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) +static int setup_affinity(unsigned int irq, struct irq_desc *desc)  {  	if (!irq_can_set_affinity(irq))  		return 0; @@ -119,21 +119,21 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)  	 * one of the targets is online.  	 
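A note on the early_irq_init() change above: kstat_irqs_legacy used to be a static [NR_IRQS_LEGACY][NR_CPUS] array and is now a single bootmem block sized by nr_cpu_ids, with each legacy descriptor handed a row pointer. The indexing is plain row-major arithmetic, sketched here for clarity:

	/*
	 * The counter for (legacy irq i, cpu c) lives at
	 *     kstat_irqs_legacy[i * nr_cpu_ids + c]
	 * so handing each descriptor its row keeps kstat_irqs_cpu() untouched:
	 */
	desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
	/* ... and desc[i].kstat_irqs[c] then addresses exactly that cell. */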
*/  	if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { -		if (cpumask_any_and(&desc->affinity, cpu_online_mask) +		if (cpumask_any_and(desc->affinity, cpu_online_mask)  		    < nr_cpu_ids)  			goto set_affinity;  		else  			desc->status &= ~IRQ_AFFINITY_SET;  	} -	cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); +	cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);  set_affinity: -	desc->chip->set_affinity(irq, &desc->affinity); +	desc->chip->set_affinity(irq, desc->affinity);  	return 0;  }  #else -static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d) +static inline int setup_affinity(unsigned int irq, struct irq_desc *d)  {  	return irq_select_affinity(irq);  } @@ -149,19 +149,33 @@ int irq_select_affinity_usr(unsigned int irq)  	int ret;  	spin_lock_irqsave(&desc->lock, flags); -	ret = do_irq_select_affinity(irq, desc); +	ret = setup_affinity(irq, desc);  	spin_unlock_irqrestore(&desc->lock, flags);  	return ret;  }  #else -static inline int do_irq_select_affinity(int irq, struct irq_desc *desc) +static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)  {  	return 0;  }  #endif +void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) +{ +	if (suspend) { +		if (!desc->action || (desc->action->flags & IRQF_TIMER)) +			return; +		desc->status |= IRQ_SUSPENDED; +	} + +	if (!desc->depth++) { +		desc->status |= IRQ_DISABLED; +		desc->chip->disable(irq); +	} +} +  /**   *	disable_irq_nosync - disable an irq without waiting   *	@irq: Interrupt to disable @@ -182,10 +196,7 @@ void disable_irq_nosync(unsigned int irq)  		return;  	spin_lock_irqsave(&desc->lock, flags); -	if (!desc->depth++) { -		desc->status |= IRQ_DISABLED; -		desc->chip->disable(irq); -	} +	__disable_irq(desc, irq, false);  	spin_unlock_irqrestore(&desc->lock, flags);  }  EXPORT_SYMBOL(disable_irq_nosync); @@ -215,15 +226,21 @@ void disable_irq(unsigned int irq)  }  EXPORT_SYMBOL(disable_irq); -static void __enable_irq(struct irq_desc *desc, unsigned int irq) +void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)  { +	if (resume) +		desc->status &= ~IRQ_SUSPENDED; +  	switch (desc->depth) {  	case 0: + err_out:  		WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);  		break;  	case 1: {  		unsigned int status = desc->status & ~IRQ_DISABLED; +		if (desc->status & IRQ_SUSPENDED) +			goto err_out;  		/* Prevent probing on this irq: */  		desc->status = status | IRQ_NOPROBE;  		check_irq_resend(desc, irq); @@ -253,7 +270,7 @@ void enable_irq(unsigned int irq)  		return;  	spin_lock_irqsave(&desc->lock, flags); -	__enable_irq(desc, irq); +	__enable_irq(desc, irq, false);  	spin_unlock_irqrestore(&desc->lock, flags);  }  EXPORT_SYMBOL(enable_irq); @@ -389,9 +406,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,   * allocate special interrupts that are part of the architecture.   
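The __disable_irq()/__enable_irq() helpers introduced above fold the new suspend handling into the existing depth counting. A short walk-through of the intended behaviour, as a sketch rather than verbatim kernel code:

	disable_irq_nosync(irq);	/* depth 0 -> 1, line gets masked          */
	disable_irq_nosync(irq);	/* depth 1 -> 2, already masked            */
	enable_irq(irq);		/* depth 2 -> 1, still masked              */
	enable_irq(irq);		/* depth 1 -> 0, unmasked, pending resent  */

	/*
	 * suspend_device_irqs() instead calls __disable_irq(desc, irq, true),
	 * which bumps the depth *and* sets IRQ_SUSPENDED; while that flag is
	 * set, a driver's enable_irq() at depth 1 trips the err_out warning in
	 * __enable_irq() instead of unmasking the line, so only
	 * resume_device_irqs() can bring it back.
	 */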
*/  static int -__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) +__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  { -	struct irqaction *old, **p; +	struct irqaction *old, **old_ptr;  	const char *old_name = NULL;  	unsigned long flags;  	int shared = 0; @@ -423,8 +440,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)  	 * The following block of code has to be executed atomically  	 */  	spin_lock_irqsave(&desc->lock, flags); -	p = &desc->action; -	old = *p; +	old_ptr = &desc->action; +	old = *old_ptr;  	if (old) {  		/*  		 * Can't share interrupts unless both agree to and are @@ -447,8 +464,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)  		/* add new interrupt at end of irq queue */  		do { -			p = &old->next; -			old = *p; +			old_ptr = &old->next; +			old = *old_ptr;  		} while (old);  		shared = 1;  	} @@ -488,7 +505,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)  			desc->status |= IRQ_NO_BALANCING;  		/* Set default affinity mask once everything is setup */ -		do_irq_select_affinity(irq, desc); +		setup_affinity(irq, desc);  	} else if ((new->flags & IRQF_TRIGGER_MASK)  			&& (new->flags & IRQF_TRIGGER_MASK) @@ -499,7 +516,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)  				(int)(new->flags & IRQF_TRIGGER_MASK));  	} -	*p = new; +	*old_ptr = new;  	/* Reset broken irq detection when installing new handler */  	desc->irq_count = 0; @@ -511,7 +528,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)  	 */  	if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {  		desc->status &= ~IRQ_SPURIOUS_DISABLED; -		__enable_irq(desc, irq); +		__enable_irq(desc, irq, false);  	}  	spin_unlock_irqrestore(&desc->lock, flags); @@ -549,90 +566,117 @@ int setup_irq(unsigned int irq, struct irqaction *act)  	return __setup_irq(irq, desc, act);  } +EXPORT_SYMBOL_GPL(setup_irq); -/** - *	free_irq - free an interrupt - *	@irq: Interrupt line to free - *	@dev_id: Device identity to free - * - *	Remove an interrupt handler. The handler is removed and if the - *	interrupt line is no longer in use by any driver it is disabled. - *	On a shared IRQ the caller must ensure the interrupt is disabled - *	on the card it drives before calling this function. The function - *	does not return until any executing interrupts for this IRQ - *	have completed. - * - *	This function must not be called from interrupt context. + /* + * Internal function to unregister an irqaction - used to free + * regular and special interrupts that are part of the architecture.   
*/ -void free_irq(unsigned int irq, void *dev_id) +static struct irqaction *__free_irq(unsigned int irq, void *dev_id)  {  	struct irq_desc *desc = irq_to_desc(irq); -	struct irqaction **p; +	struct irqaction *action, **action_ptr;  	unsigned long flags; -	WARN_ON(in_interrupt()); +	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);  	if (!desc) -		return; +		return NULL;  	spin_lock_irqsave(&desc->lock, flags); -	p = &desc->action; + +	/* +	 * There can be multiple actions per IRQ descriptor, find the right +	 * one based on the dev_id: +	 */ +	action_ptr = &desc->action;  	for (;;) { -		struct irqaction *action = *p; +		action = *action_ptr; -		if (action) { -			struct irqaction **pp = p; +		if (!action) { +			WARN(1, "Trying to free already-free IRQ %d\n", irq); +			spin_unlock_irqrestore(&desc->lock, flags); -			p = &action->next; -			if (action->dev_id != dev_id) -				continue; +			return NULL; +		} -			/* Found it - now remove it from the list of entries */ -			*pp = action->next; +		if (action->dev_id == dev_id) +			break; +		action_ptr = &action->next; +	} -			/* Currently used only by UML, might disappear one day.*/ +	/* Found it - now remove it from the list of entries: */ +	*action_ptr = action->next; + +	/* Currently used only by UML, might disappear one day: */  #ifdef CONFIG_IRQ_RELEASE_METHOD -			if (desc->chip->release) -				desc->chip->release(irq, dev_id); +	if (desc->chip->release) +		desc->chip->release(irq, dev_id);  #endif -			if (!desc->action) { -				desc->status |= IRQ_DISABLED; -				if (desc->chip->shutdown) -					desc->chip->shutdown(irq); -				else -					desc->chip->disable(irq); -			} -			spin_unlock_irqrestore(&desc->lock, flags); -			unregister_handler_proc(irq, action); +	/* If this was the last handler, shut down the IRQ line: */ +	if (!desc->action) { +		desc->status |= IRQ_DISABLED; +		if (desc->chip->shutdown) +			desc->chip->shutdown(irq); +		else +			desc->chip->disable(irq); +	} +	spin_unlock_irqrestore(&desc->lock, flags); + +	unregister_handler_proc(irq, action); + +	/* Make sure it's not being used on another CPU: */ +	synchronize_irq(irq); -			/* Make sure it's not being used on another CPU */ -			synchronize_irq(irq); -#ifdef CONFIG_DEBUG_SHIRQ -			/* -			 * It's a shared IRQ -- the driver ought to be -			 * prepared for it to happen even now it's -			 * being freed, so let's make sure....  We do -			 * this after actually deregistering it, to -			 * make sure that a 'real' IRQ doesn't run in -			 * parallel with our fake -			 */ -			if (action->flags & IRQF_SHARED) { -				local_irq_save(flags); -				action->handler(irq, dev_id); -				local_irq_restore(flags); -			} -#endif -			kfree(action); -			return; -		} -		printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);  #ifdef CONFIG_DEBUG_SHIRQ -		dump_stack(); -#endif -		spin_unlock_irqrestore(&desc->lock, flags); -		return; +	/* +	 * It's a shared IRQ -- the driver ought to be prepared for an IRQ +	 * event to happen even now it's being freed, so let's make sure that +	 * is so by doing an extra call to the handler .... +	 * +	 * ( We do this after actually deregistering it, to make sure that a +	 *   'real' IRQ doesn't run in * parallel with our fake. 
) +	 */ +	if (action->flags & IRQF_SHARED) { +		local_irq_save(flags); +		action->handler(irq, dev_id); +		local_irq_restore(flags);  	} +#endif +	return action; +} + +/** + *	remove_irq - free an interrupt + *	@irq: Interrupt line to free + *	@act: irqaction for the interrupt + * + * Used to remove interrupts statically setup by the early boot process. + */ +void remove_irq(unsigned int irq, struct irqaction *act) +{ +	__free_irq(irq, act->dev_id); +} +EXPORT_SYMBOL_GPL(remove_irq); + +/** + *	free_irq - free an interrupt allocated with request_irq + *	@irq: Interrupt line to free + *	@dev_id: Device identity to free + * + *	Remove an interrupt handler. The handler is removed and if the + *	interrupt line is no longer in use by any driver it is disabled. + *	On a shared IRQ the caller must ensure the interrupt is disabled + *	on the card it drives before calling this function. The function + *	does not return until any executing interrupts for this IRQ + *	have completed. + * + *	This function must not be called from interrupt context. + */ +void free_irq(unsigned int irq, void *dev_id) +{ +	kfree(__free_irq(irq, dev_id));  }  EXPORT_SYMBOL(free_irq); @@ -679,11 +723,12 @@ int request_irq(unsigned int irq, irq_handler_t handler,  	 * the behavior is classified as "will not fix" so we need to  	 * start nudging drivers away from using that idiom.  	 */ -	if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) -			== (IRQF_SHARED|IRQF_DISABLED)) -		pr_warning("IRQ %d/%s: IRQF_DISABLED is not " -				"guaranteed on shared IRQs\n", -				irq, devname); +	if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) == +					(IRQF_SHARED|IRQF_DISABLED)) { +		pr_warning( +		  "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n", +			irq, devname); +	}  #ifdef CONFIG_LOCKDEP  	/* @@ -709,15 +754,13 @@ int request_irq(unsigned int irq, irq_handler_t handler,  	if (!handler)  		return -EINVAL; -	action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); +	action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);  	if (!action)  		return -ENOMEM;  	action->handler = handler;  	action->flags = irqflags; -	cpus_clear(action->mask);  	action->name = devname; -	action->next = NULL;  	action->dev_id = dev_id;  	retval = __setup_irq(irq, desc, action); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index bd72329e630..e05ad9be43b 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -18,7 +18,7 @@ void move_masked_irq(int irq)  	desc->status &= ~IRQ_MOVE_PENDING; -	if (unlikely(cpumask_empty(&desc->pending_mask))) +	if (unlikely(cpumask_empty(desc->pending_mask)))  		return;  	if (!desc->chip->set_affinity) @@ -38,13 +38,13 @@ void move_masked_irq(int irq)  	 * For correct operation this depends on the caller  	 * masking the irqs.  	 
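These hunks drop the `&` in front of desc->affinity and desc->pending_mask because both fields become pointers (cpumask_var_t) rather than embedded struct cpumask, with init_alloc_desc_masks() doing the allocation. That helper's implementation is not shown in this diff; the sketch below only illustrates the general cpumask_var_t shape such a helper takes and is an assumption, not the patch's code:

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/irq.h>

/* Illustrative sketch only, not the real init_alloc_desc_masks(). */
static bool desc_masks_sketch(struct irq_desc *desc, int node)
{
	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
		return false;
	cpumask_setall(desc->affinity);
#ifdef CONFIG_GENERIC_PENDING_IRQ
	if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
		free_cpumask_var(desc->affinity);
		return false;
	}
	cpumask_clear(desc->pending_mask);
#endif
	return true;
}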
*/ -	if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) +	if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)  		   < nr_cpu_ids)) { -		cpumask_and(&desc->affinity, -			    &desc->pending_mask, cpu_online_mask); -		desc->chip->set_affinity(irq, &desc->affinity); +		cpumask_and(desc->affinity, +			    desc->pending_mask, cpu_online_mask); +		desc->chip->set_affinity(irq, desc->affinity);  	} -	cpumask_clear(&desc->pending_mask); +	cpumask_clear(desc->pending_mask);  }  void move_native_irq(int irq) diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index acd88356ac7..243d6121e50 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -17,16 +17,11 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc,  				 struct irq_desc *desc,  				 int cpu, int nr)  { -	unsigned long bytes; -  	init_kstat_irqs(desc, cpu, nr); -	if (desc->kstat_irqs != old_desc->kstat_irqs) { -		/* Compute how many bytes we need per irq and allocate them */ -		bytes = nr * sizeof(unsigned int); - -		memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes); -	} +	if (desc->kstat_irqs != old_desc->kstat_irqs) +		memcpy(desc->kstat_irqs, old_desc->kstat_irqs, +			 nr * sizeof(*desc->kstat_irqs));  }  static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) @@ -38,15 +33,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)  	old_desc->kstat_irqs = NULL;  } -static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, +static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,  		 struct irq_desc *desc, int cpu)  {  	memcpy(desc, old_desc, sizeof(struct irq_desc)); +	if (!init_alloc_desc_masks(desc, cpu, false)) { +		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " +				"for migration.\n", irq); +		return false; +	}  	spin_lock_init(&desc->lock);  	desc->cpu = cpu;  	lockdep_set_class(&desc->lock, &irq_desc_lock_class);  	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); +	init_copy_desc_masks(old_desc, desc);  	arch_init_copy_chip_data(old_desc, desc, cpu); +	return true;  }  static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) @@ -76,12 +78,18 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,  	node = cpu_to_node(cpu);  	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);  	if (!desc) { -		printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq); +		printk(KERN_ERR "irq %d: can not get new irq_desc " +				"for migration.\n", irq); +		/* still use old one */ +		desc = old_desc; +		goto out_unlock; +	} +	if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {  		/* still use old one */ +		kfree(desc);  		desc = old_desc;  		goto out_unlock;  	} -	init_copy_one_irq_desc(irq, old_desc, desc, cpu);  	irq_desc_ptrs[irq] = desc;  	spin_unlock_irqrestore(&sparse_irq_lock, flags); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c new file mode 100644 index 00000000000..638d8bedec1 --- /dev/null +++ b/kernel/irq/pm.c @@ -0,0 +1,79 @@ +/* + * linux/kernel/irq/pm.c + * + * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. + * + * This file contains power management functions related to interrupts. 
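The three entry points added below are meant to be driven from the system sleep code rather than from drivers. A hedged sketch of the intended call order; the surrounding suspend sequence is an assumption for illustration, and platform_enter_sleep() is a placeholder name, not code from this patch:

	int error;

	suspend_device_irqs();		/* mask every IRQ with a non-timer handler,
					   marking each descriptor IRQ_SUSPENDED */

	error = check_wakeup_irqs();	/* -EBUSY if a wake-up source already fired */
	if (error)
		goto resume_irqs;

	error = platform_enter_sleep();	/* placeholder for the real low-level hook */

resume_irqs:
	resume_device_irqs();		/* unmask only what suspend_device_irqs() masked */
	return error;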
+ */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/interrupt.h> + +#include "internals.h" + +/** + * suspend_device_irqs - disable all currently enabled interrupt lines + * + * During system-wide suspend or hibernation device interrupts need to be + * disabled at the chip level and this function is provided for this purpose. + * It disables all interrupt lines that are enabled at the moment and sets the + * IRQ_SUSPENDED flag for them. + */ +void suspend_device_irqs(void) +{ +	struct irq_desc *desc; +	int irq; + +	for_each_irq_desc(irq, desc) { +		unsigned long flags; + +		spin_lock_irqsave(&desc->lock, flags); +		__disable_irq(desc, irq, true); +		spin_unlock_irqrestore(&desc->lock, flags); +	} + +	for_each_irq_desc(irq, desc) +		if (desc->status & IRQ_SUSPENDED) +			synchronize_irq(irq); +} +EXPORT_SYMBOL_GPL(suspend_device_irqs); + +/** + * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() + * + * Enable all interrupt lines previously disabled by suspend_device_irqs() that + * have the IRQ_SUSPENDED flag set. + */ +void resume_device_irqs(void) +{ +	struct irq_desc *desc; +	int irq; + +	for_each_irq_desc(irq, desc) { +		unsigned long flags; + +		if (!(desc->status & IRQ_SUSPENDED)) +			continue; + +		spin_lock_irqsave(&desc->lock, flags); +		__enable_irq(desc, irq, true); +		spin_unlock_irqrestore(&desc->lock, flags); +	} +} +EXPORT_SYMBOL_GPL(resume_device_irqs); + +/** + * check_wakeup_irqs - check if any wake-up interrupts are pending + */ +int check_wakeup_irqs(void) +{ +	struct irq_desc *desc; +	int irq; + +	for_each_irq_desc(irq, desc) +		if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) +			return -EBUSY; + +	return 0; +} diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index aae3f742bce..692363dd591 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir;  static int irq_affinity_proc_show(struct seq_file *m, void *v)  {  	struct irq_desc *desc = irq_to_desc((long)m->private); -	const struct cpumask *mask = &desc->affinity; +	const struct cpumask *mask = desc->affinity;  #ifdef CONFIG_GENERIC_PENDING_IRQ  	if (desc->status & IRQ_MOVE_PENDING) -		mask = &desc->pending_mask; +		mask = desc->pending_mask;  #endif  	seq_cpumask(m, mask);  	seq_putc(m, '\n'); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dd364c11e56..4d568294de3 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -104,7 +104,7 @@ static int misrouted_irq(int irq)  	return ok;  } -static void poll_spurious_irqs(unsigned long dummy) +static void poll_all_shared_irqs(void)  {  	struct irq_desc *desc;  	int i; @@ -123,11 +123,23 @@ static void poll_spurious_irqs(unsigned long dummy)  		try_one_irq(i, desc);  	} +} + +static void poll_spurious_irqs(unsigned long dummy) +{ +	poll_all_shared_irqs();  	mod_timer(&poll_spurious_irq_timer,  		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);  } +#ifdef CONFIG_DEBUG_SHIRQ +void debug_poll_all_shared_irqs(void) +{ +	poll_all_shared_irqs(); +} +#endif +  /*   * If 99,900 of the previous 100,000 interrupts have not been handled   * then assume that the IRQ is stuck in some manner. 
Drop a diagnostic diff --git a/kernel/kexec.c b/kernel/kexec.c index 48389957825..5a758c6e495 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -42,7 +42,7 @@  note_buf_t* crash_notes;  /* vmcoreinfo stuff */ -unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];  u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];  size_t vmcoreinfo_size;  size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); @@ -1130,7 +1130,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)  		return;  	memset(&prstatus, 0, sizeof(prstatus));  	prstatus.pr_pid = current->pid; -	elf_core_copy_regs(&prstatus.pr_reg, regs); +	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);  	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,  		      	      &prstatus, sizeof(prstatus));  	final_note(buf); @@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void)  	VMCOREINFO_OFFSET(list_head, prev);  	VMCOREINFO_OFFSET(vm_struct, addr);  	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); +	log_buf_kexec_setup();  	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);  	VMCOREINFO_NUMBER(NR_FREE_PAGES);  	VMCOREINFO_NUMBER(PG_lru); @@ -1450,11 +1451,7 @@ int kernel_kexec(void)  		error = device_suspend(PMSG_FREEZE);  		if (error)  			goto Resume_console; -		error = disable_nonboot_cpus(); -		if (error) -			goto Resume_devices;  		device_pm_lock(); -		local_irq_disable();  		/* At this point, device_suspend() has been called,  		 * but *not* device_power_down(). We *must*  		 * device_power_down() now.  Otherwise, drivers for @@ -1464,12 +1461,15 @@ int kernel_kexec(void)  		 */  		error = device_power_down(PMSG_FREEZE);  		if (error) -			goto Enable_irqs; - +			goto Resume_devices; +		error = disable_nonboot_cpus(); +		if (error) +			goto Enable_cpus; +		local_irq_disable();  		/* Suspend system devices */  		error = sysdev_suspend(PMSG_FREEZE);  		if (error) -			goto Power_up_devices; +			goto Enable_irqs;  	} else  #endif  	{ @@ -1483,13 +1483,13 @@ int kernel_kexec(void)  #ifdef CONFIG_KEXEC_JUMP  	if (kexec_image->preserve_context) {  		sysdev_resume(); - Power_up_devices: -		device_power_up(PMSG_RESTORE);   Enable_irqs:  		local_irq_enable(); -		device_pm_unlock(); + Enable_cpus:  		enable_nonboot_cpus(); +		device_power_up(PMSG_RESTORE);   Resume_devices: +		device_pm_unlock();  		device_resume(PMSG_RESTORE);   Resume_console:  		resume_console(); diff --git a/kernel/kmod.c b/kernel/kmod.c index a27a5f64443..f0c8f545180 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -167,7 +167,7 @@ static int ____call_usermodehelper(void *data)  	}  	/* We can run anywhere, unlike our parent keventd(). */ -	set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); +	set_cpus_allowed_ptr(current, cpu_all_mask);  	/*  	 * Our parent is keventd, which runs with elevated scheduling priority. 
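kmod.c above and kthread.c just below replace the old CPU_MASK_ALL_PTR macro with cpu_all_mask, the const struct cpumask pointer that supersedes it. A minimal, self-contained illustration of the idiom, as a kernel thread that unbinds itself from its creator's CPU restrictions (the thread function itself is hypothetical):

#include <linux/cpumask.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int unbound_thread_fn(void *unused)
{
	/* Allow this thread to run on any online CPU. */
	set_cpus_allowed_ptr(current, cpu_all_mask);

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}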
diff --git a/kernel/kthread.c b/kernel/kthread.c index 4fbc456f393..84bbadd4d02 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -110,7 +110,7 @@ static void create_kthread(struct kthread_create_info *create)  		 */  		sched_setscheduler(create->result, SCHED_NORMAL, ¶m);  		set_user_nice(create->result, KTHREAD_NICE_LEVEL); -		set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR); +		set_cpus_allowed_ptr(create->result, cpu_all_mask);  	}  	complete(&create->done);  } @@ -240,7 +240,7 @@ int kthreadd(void *unused)  	set_task_comm(tsk, "kthreadd");  	ignore_signals(tsk);  	set_user_nice(tsk, KTHREAD_NICE_LEVEL); -	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR); +	set_cpus_allowed_ptr(tsk, cpu_all_mask);  	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 449db466bdb..ca07c5c0c91 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -9,6 +9,44 @@   * as published by the Free Software Foundation; version 2   * of the License.   */ + +/* + * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is + * used by the "latencytop" userspace tool. The latency that is tracked is not + * the 'traditional' interrupt latency (which is primarily caused by something + * else consuming CPU), but instead, it is the latency an application encounters + * because the kernel sleeps on its behalf for various reasons. + * + * This code tracks 2 levels of statistics: + * 1) System level latency + * 2) Per process latency + * + * The latency is stored in fixed sized data structures in an accumulated form; + * if the "same" latency cause is hit twice, this will be tracked as one entry + * in the data structure. Both the count, total accumulated latency and maximum + * latency are tracked in this data structure. When the fixed size structure is + * full, no new causes are tracked until the buffer is flushed by writing to + * the /proc file; the userspace tool does this on a regular basis. + * + * A latency cause is identified by a stringified backtrace at the point that + * the scheduler gets invoked. The userland tool will use this string to + * identify the cause of the latency in human readable form. + * + * The information is exported via /proc/latency_stats and /proc/<pid>/latency. 
+ * These files look like this: + * + * Latency Top version : v0.1 + * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl + * |    |    |    | + * |    |    |    +----> the stringified backtrace + * |    |    +---------> The maximum latency for this entry in microseconds + * |    +--------------> The accumulated latency for this entry (microseconds) + * +-------------------> The number of times this entry is hit + * + * (note: the average latency is the accumulated latency divided by the number + * of times) + */ +  #include <linux/latencytop.h>  #include <linux/kallsyms.h>  #include <linux/seq_file.h> @@ -72,7 +110,7 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record  				firstnonnull = i;  			continue;  		} -		for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { +		for (q = 0; q < LT_BACKTRACEDEPTH; q++) {  			unsigned long record = lat->backtrace[q];  			if (latency_record[i].backtrace[q] != record) { @@ -101,31 +139,52 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record  	memcpy(&latency_record[i], lat, sizeof(struct latency_record));  } -static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) +/* + * Iterator to store a backtrace into a latency record entry + */ +static inline void store_stacktrace(struct task_struct *tsk, +					struct latency_record *lat)  {  	struct stack_trace trace;  	memset(&trace, 0, sizeof(trace));  	trace.max_entries = LT_BACKTRACEDEPTH;  	trace.entries = &lat->backtrace[0]; -	trace.skip = 0;  	save_stack_trace_tsk(tsk, &trace);  } +/** + * __account_scheduler_latency - record an occured latency + * @tsk - the task struct of the task hitting the latency + * @usecs - the duration of the latency in microseconds + * @inter - 1 if the sleep was interruptible, 0 if uninterruptible + * + * This function is the main entry point for recording latency entries + * as called by the scheduler. + * + * This function has a few special cases to deal with normal 'non-latency' + * sleeps: specifically, interruptible sleep longer than 5 msec is skipped + * since this usually is caused by waiting for events via select() and co. + * + * Negative latencies (caused by time going backwards) are also explicitly + * skipped. + */  void __sched -account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) +__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)  {  	unsigned long flags;  	int i, q;  	struct latency_record lat; -	if (!latencytop_enabled) -		return; -  	/* Long interruptible waits are generally user requested... 
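For reference, the /proc/latency_stats record format documented in the comment block at the top of this file (count, accumulated microseconds, maximum microseconds, then the stringified backtrace) can be consumed with something as small as the sketch below. This is an illustrative userspace reader, not the latencytop tool itself:

#include <stdio.h>

int main(void)
{
	char line[4096], trace[4096];
	unsigned long count, total, max;
	FILE *f = fopen("/proc/latency_stats", "r");

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f)) {
		/* Skip the "Latency Top version" header; data lines match this. */
		if (sscanf(line, "%lu %lu %lu %4095[^\n]",
			   &count, &total, &max, trace) == 4)
			printf("avg %lu us, max %lu us: %s\n",
			       count ? total / count : 0, max, trace);
	}
	fclose(f);
	return 0;
}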
*/  	if (inter && usecs > 5000)  		return; +	/* Negative sleeps are time going backwards */ +	/* Zero-time sleeps are non-interesting */ +	if (usecs <= 0) +		return; +  	memset(&lat, 0, sizeof(lat));  	lat.count = 1;  	lat.time = usecs; @@ -143,12 +202,12 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)  	if (tsk->latency_record_count >= LT_SAVECOUNT)  		goto out_unlock; -	for (i = 0; i < LT_SAVECOUNT ; i++) { +	for (i = 0; i < LT_SAVECOUNT; i++) {  		struct latency_record *mylat;  		int same = 1;  		mylat = &tsk->latency_record[i]; -		for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { +		for (q = 0; q < LT_BACKTRACEDEPTH; q++) {  			unsigned long record = lat.backtrace[q];  			if (mylat->backtrace[q] != record) { @@ -186,7 +245,7 @@ static int lstats_show(struct seq_file *m, void *v)  	for (i = 0; i < MAXLR; i++) {  		if (latency_record[i].backtrace[0]) {  			int q; -			seq_printf(m, "%i %li %li ", +			seq_printf(m, "%i %lu %lu ",  				latency_record[i].count,  				latency_record[i].time,  				latency_record[i].max); @@ -223,7 +282,7 @@ static int lstats_open(struct inode *inode, struct file *filp)  	return single_open(filp, lstats_show, NULL);  } -static struct file_operations lstats_fops = { +static const struct file_operations lstats_fops = {  	.open		= lstats_open,  	.read		= seq_read,  	.write		= lstats_write, @@ -236,4 +295,4 @@ static int __init init_lstats_procfs(void)  	proc_create("latency_stats", 0644, NULL, &lstats_fops);  	return 0;  } -__initcall(init_lstats_procfs); +device_initcall(init_lstats_procfs); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 06b0c3568f0..981cd485428 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -41,6 +41,7 @@  #include <linux/utsname.h>  #include <linux/hash.h>  #include <linux/ftrace.h> +#include <linux/stringify.h>  #include <asm/sections.h> @@ -310,12 +311,14 @@ EXPORT_SYMBOL(lockdep_on);  #if VERBOSE  # define HARDIRQ_VERBOSE	1  # define SOFTIRQ_VERBOSE	1 +# define RECLAIM_VERBOSE	1  #else  # define HARDIRQ_VERBOSE	0  # define SOFTIRQ_VERBOSE	0 +# define RECLAIM_VERBOSE	0  #endif -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE  /*   * Quick filtering for interesting events:   */ @@ -430,30 +433,24 @@ atomic_t nr_find_usage_forwards_checks;  atomic_t nr_find_usage_forwards_recursions;  atomic_t nr_find_usage_backwards_checks;  atomic_t nr_find_usage_backwards_recursions; -# define debug_atomic_inc(ptr)		atomic_inc(ptr) -# define debug_atomic_dec(ptr)		atomic_dec(ptr) -# define debug_atomic_read(ptr)		atomic_read(ptr) -#else -# define debug_atomic_inc(ptr)		do { } while (0) -# define debug_atomic_dec(ptr)		do { } while (0) -# define debug_atomic_read(ptr)		0  #endif  /*   * Locking printouts:   */ +#define __USAGE(__STATE)						\ +	[LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W",	\ +	[LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W",		\ +	[LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\ +	[LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R", +  static const char *usage_str[] =  { -	[LOCK_USED] =			"initial-use ", -	[LOCK_USED_IN_HARDIRQ] =	"in-hardirq-W", -	[LOCK_USED_IN_SOFTIRQ] =	"in-softirq-W", -	[LOCK_ENABLED_SOFTIRQS] =	"softirq-on-W", -	[LOCK_ENABLED_HARDIRQS] =	"hardirq-on-W", -	[LOCK_USED_IN_HARDIRQ_READ] =	"in-hardirq-R", -	[LOCK_USED_IN_SOFTIRQ_READ] =	"in-softirq-R", -	[LOCK_ENABLED_SOFTIRQS_READ] =	"softirq-on-R", -	[LOCK_ENABLED_HARDIRQS_READ] =	"hardirq-on-R", +#define 
LOCKDEP_STATE(__STATE) __USAGE(__STATE) +#include "lockdep_states.h" +#undef LOCKDEP_STATE +	[LOCK_USED] = "INITIAL USE",  };  const char * __get_key_name(struct lockdep_subclass_key *key, char *str) @@ -461,46 +458,45 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str)  	return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);  } -void -get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4) +static inline unsigned long lock_flag(enum lock_usage_bit bit)  { -	*c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; - -	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) -		*c1 = '+'; -	else -		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) -			*c1 = '-'; +	return 1UL << bit; +} -	if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) -		*c2 = '+'; -	else -		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) -			*c2 = '-'; +static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) +{ +	char c = '.'; -	if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) -		*c3 = '-'; -	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { -		*c3 = '+'; -		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) -			*c3 = '?'; +	if (class->usage_mask & lock_flag(bit + 2)) +		c = '+'; +	if (class->usage_mask & lock_flag(bit)) { +		c = '-'; +		if (class->usage_mask & lock_flag(bit + 2)) +			c = '?';  	} -	if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) -		*c4 = '-'; -	if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { -		*c4 = '+'; -		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) -			*c4 = '?'; -	} +	return c; +} + +void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) +{ +	int i = 0; + +#define LOCKDEP_STATE(__STATE) 						\ +	usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE);	\ +	usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ); +#include "lockdep_states.h" +#undef LOCKDEP_STATE + +	usage[i] = '\0';  }  static void print_lock_name(struct lock_class *class)  { -	char str[KSYM_NAME_LEN], c1, c2, c3, c4; +	char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];  	const char *name; -	get_usage_chars(class, &c1, &c2, &c3, &c4); +	get_usage_chars(class, usage);  	name = class->name;  	if (!name) { @@ -513,7 +509,7 @@ static void print_lock_name(struct lock_class *class)  		if (class->subclass)  			printk("/%d", class->subclass);  	} -	printk("){%c%c%c%c}", c1, c2, c3, c4); +	printk("){%s}", usage);  }  static void print_lockdep_cache(struct lockdep_map *lock) @@ -1263,9 +1259,49 @@ check_usage(struct task_struct *curr, struct held_lock *prev,  			bit_backwards, bit_forwards, irqclass);  } -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, -		struct held_lock *next) +static const char *state_names[] = { +#define LOCKDEP_STATE(__STATE) \ +	__stringify(__STATE), +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static const char *state_rnames[] = { +#define LOCKDEP_STATE(__STATE) \ +	__stringify(__STATE)"-READ", +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static inline const char *state_name(enum lock_usage_bit bit) +{ +	return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; +} + +static int exclusive_bit(int new_bit) +{ +	/* +	 * USED_IN +	 * USED_IN_READ +	 * ENABLED +	 * ENABLED_READ +	 * +	 * bit 0 - write/read +	 * bit 1 - used_in/enabled +	 * bit 2+  state +	 */ + +	int state = new_bit & ~3; +	int dir = new_bit & 2; + +	/* +	 * keep state, bit flip the direction and strip read. 
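Plugging concrete values into the layout described in this comment makes the return expression easier to verify. The enum values below assume the generation order USED_IN, USED_IN_READ, ENABLED, ENABLED_READ per state, which is what the arithmetic elsewhere in this patch (mark_held_locks() and friends) relies on:

/*
 * With HARDIRQ as state 0:
 *   LOCK_USED_IN_HARDIRQ      = 0 -> exclusive_bit() = 0 | (0 ^ 2) = 2 = LOCK_ENABLED_HARDIRQ
 *   LOCK_USED_IN_HARDIRQ_READ = 1 -> exclusive_bit() = 0 | (0 ^ 2) = 2 = LOCK_ENABLED_HARDIRQ
 *   LOCK_ENABLED_HARDIRQ      = 2 -> exclusive_bit() = 0 | (2 ^ 2) = 0 = LOCK_USED_IN_HARDIRQ
 *   LOCK_ENABLED_HARDIRQ_READ = 3 -> exclusive_bit() = 0 | (2 ^ 2) = 0 = LOCK_USED_IN_HARDIRQ
 * i.e. the read bit is dropped, the USED_IN/ENABLED direction is flipped,
 * and the state bits (2 and up) pass through unchanged.
 */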
+	 */ +	return state | (dir ^ 2); +} + +static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, +			   struct held_lock *next, enum lock_usage_bit bit)  {  	/*  	 * Prove that the new dependency does not connect a hardirq-safe @@ -1273,38 +1309,34 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,  	 * the backwards-subgraph starting at <prev>, and the  	 * forwards-subgraph starting at <next>:  	 */ -	if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, -					LOCK_ENABLED_HARDIRQS, "hard")) +	if (!check_usage(curr, prev, next, bit, +			   exclusive_bit(bit), state_name(bit)))  		return 0; +	bit++; /* _READ */ +  	/*  	 * Prove that the new dependency does not connect a hardirq-safe-read  	 * lock with a hardirq-unsafe lock - to achieve this we search  	 * the backwards-subgraph starting at <prev>, and the  	 * forwards-subgraph starting at <next>:  	 */ -	if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, -					LOCK_ENABLED_HARDIRQS, "hard-read")) +	if (!check_usage(curr, prev, next, bit, +			   exclusive_bit(bit), state_name(bit)))  		return 0; -	/* -	 * Prove that the new dependency does not connect a softirq-safe -	 * lock with a softirq-unsafe lock - to achieve this we search -	 * the backwards-subgraph starting at <prev>, and the -	 * forwards-subgraph starting at <next>: -	 */ -	if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, -					LOCK_ENABLED_SOFTIRQS, "soft")) -		return 0; -	/* -	 * Prove that the new dependency does not connect a softirq-safe-read -	 * lock with a softirq-unsafe lock - to achieve this we search -	 * the backwards-subgraph starting at <prev>, and the -	 * forwards-subgraph starting at <next>: -	 */ -	if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, -					LOCK_ENABLED_SOFTIRQS, "soft")) +	return 1; +} + +static int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, +		struct held_lock *next) +{ +#define LOCKDEP_STATE(__STATE)						\ +	if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE))	\  		return 0; +#include "lockdep_states.h" +#undef LOCKDEP_STATE  	return 1;  } @@ -1861,9 +1893,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,  		curr->comm, task_pid_nr(curr));  	print_lock(this);  	if (forwards) -		printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); +		printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);  	else -		printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); +		printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);  	print_lock_name(other);  	printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); @@ -1933,7 +1965,7 @@ void print_irqtrace_events(struct task_struct *curr)  	print_ip_sym(curr->softirq_disable_ip);  } -static int hardirq_verbose(struct lock_class *class) +static int HARDIRQ_verbose(struct lock_class *class)  {  #if HARDIRQ_VERBOSE  	return class_filter(class); @@ -1941,7 +1973,7 @@ static int hardirq_verbose(struct lock_class *class)  	return 0;  } -static int softirq_verbose(struct lock_class *class) +static int SOFTIRQ_verbose(struct lock_class *class)  {  #if SOFTIRQ_VERBOSE  	return class_filter(class); @@ -1949,185 +1981,95 @@ static int softirq_verbose(struct lock_class *class)  	return 0;  } +static int RECLAIM_FS_verbose(struct lock_class *class) +{ +#if RECLAIM_VERBOSE +	return class_filter(class); +#endif +	return 0; +} +  #define 
STRICT_READ_CHECKS	1 -static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, +static int (*state_verbose_f[])(struct lock_class *class) = { +#define LOCKDEP_STATE(__STATE) \ +	__STATE##_verbose, +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static inline int state_verbose(enum lock_usage_bit bit, +				struct lock_class *class) +{ +	return state_verbose_f[bit >> 2](class); +} + +typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, +			     enum lock_usage_bit bit, const char *name); + +static int +mark_lock_irq(struct task_struct *curr, struct held_lock *this,  		enum lock_usage_bit new_bit)  { -	int ret = 1; +	int excl_bit = exclusive_bit(new_bit); +	int read = new_bit & 1; +	int dir = new_bit & 2; -	switch(new_bit) { -	case LOCK_USED_IN_HARDIRQ: -		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) -			return 0; -		if (!valid_state(curr, this, new_bit, -				 LOCK_ENABLED_HARDIRQS_READ)) -			return 0; -		/* -		 * just marked it hardirq-safe, check that this lock -		 * took no hardirq-unsafe lock in the past: -		 */ -		if (!check_usage_forwards(curr, this, -					  LOCK_ENABLED_HARDIRQS, "hard")) -			return 0; -#if STRICT_READ_CHECKS -		/* -		 * just marked it hardirq-safe, check that this lock -		 * took no hardirq-unsafe-read lock in the past: -		 */ -		if (!check_usage_forwards(curr, this, -				LOCK_ENABLED_HARDIRQS_READ, "hard-read")) -			return 0; -#endif -		if (hardirq_verbose(hlock_class(this))) -			ret = 2; -		break; -	case LOCK_USED_IN_SOFTIRQ: -		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) -			return 0; -		if (!valid_state(curr, this, new_bit, -				 LOCK_ENABLED_SOFTIRQS_READ)) -			return 0; -		/* -		 * just marked it softirq-safe, check that this lock -		 * took no softirq-unsafe lock in the past: -		 */ -		if (!check_usage_forwards(curr, this, -					  LOCK_ENABLED_SOFTIRQS, "soft")) -			return 0; -#if STRICT_READ_CHECKS -		/* -		 * just marked it softirq-safe, check that this lock -		 * took no softirq-unsafe-read lock in the past: -		 */ -		if (!check_usage_forwards(curr, this, -				LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) -			return 0; -#endif -		if (softirq_verbose(hlock_class(this))) -			ret = 2; -		break; -	case LOCK_USED_IN_HARDIRQ_READ: -		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) -			return 0; -		/* -		 * just marked it hardirq-read-safe, check that this lock -		 * took no hardirq-unsafe lock in the past: -		 */ -		if (!check_usage_forwards(curr, this, -					  LOCK_ENABLED_HARDIRQS, "hard")) -			return 0; -		if (hardirq_verbose(hlock_class(this))) -			ret = 2; -		break; -	case LOCK_USED_IN_SOFTIRQ_READ: -		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) -			return 0; -		/* -		 * just marked it softirq-read-safe, check that this lock -		 * took no softirq-unsafe lock in the past: -		 */ -		if (!check_usage_forwards(curr, this, -					  LOCK_ENABLED_SOFTIRQS, "soft")) -			return 0; -		if (softirq_verbose(hlock_class(this))) -			ret = 2; -		break; -	case LOCK_ENABLED_HARDIRQS: -		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) -			return 0; -		if (!valid_state(curr, this, new_bit, -				 LOCK_USED_IN_HARDIRQ_READ)) -			return 0; -		/* -		 * just marked it hardirq-unsafe, check that no hardirq-safe -		 * lock in the system ever took it in the past: -		 */ -		if (!check_usage_backwards(curr, this, -					   LOCK_USED_IN_HARDIRQ, "hard")) -			return 0; -#if STRICT_READ_CHECKS -		/* -		 * just marked it hardirq-unsafe, check that no -		 * hardirq-safe-read 
lock in the system ever took -		 * it in the past: -		 */ -		if (!check_usage_backwards(curr, this, -				   LOCK_USED_IN_HARDIRQ_READ, "hard-read")) -			return 0; -#endif -		if (hardirq_verbose(hlock_class(this))) -			ret = 2; -		break; -	case LOCK_ENABLED_SOFTIRQS: -		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) -			return 0; -		if (!valid_state(curr, this, new_bit, -				 LOCK_USED_IN_SOFTIRQ_READ)) -			return 0; -		/* -		 * just marked it softirq-unsafe, check that no softirq-safe -		 * lock in the system ever took it in the past: -		 */ -		if (!check_usage_backwards(curr, this, -					   LOCK_USED_IN_SOFTIRQ, "soft")) -			return 0; -#if STRICT_READ_CHECKS -		/* -		 * just marked it softirq-unsafe, check that no -		 * softirq-safe-read lock in the system ever took -		 * it in the past: -		 */ -		if (!check_usage_backwards(curr, this, -				   LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) -			return 0; -#endif -		if (softirq_verbose(hlock_class(this))) -			ret = 2; -		break; -	case LOCK_ENABLED_HARDIRQS_READ: -		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) -			return 0; -#if STRICT_READ_CHECKS -		/* -		 * just marked it hardirq-read-unsafe, check that no -		 * hardirq-safe lock in the system ever took it in the past: -		 */ -		if (!check_usage_backwards(curr, this, -					   LOCK_USED_IN_HARDIRQ, "hard")) -			return 0; -#endif -		if (hardirq_verbose(hlock_class(this))) -			ret = 2; -		break; -	case LOCK_ENABLED_SOFTIRQS_READ: -		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) +	/* +	 * mark USED_IN has to look forwards -- to ensure no dependency +	 * has ENABLED state, which would allow recursion deadlocks. +	 * +	 * mark ENABLED has to look backwards -- to ensure no dependee +	 * has USED_IN state, which, again, would allow  recursion deadlocks. +	 */ +	check_usage_f usage = dir ? +		check_usage_backwards : check_usage_forwards; + +	/* +	 * Validate that this particular lock does not have conflicting +	 * usage states. +	 */ +	if (!valid_state(curr, this, new_bit, excl_bit)) +		return 0; + +	/* +	 * Validate that the lock dependencies don't have conflicting usage +	 * states. 
+	 */ +	if ((!read || !dir || STRICT_READ_CHECKS) && +			!usage(curr, this, excl_bit, state_name(new_bit & ~1))) +		return 0; + +	/* +	 * Check for read in write conflicts +	 */ +	if (!read) { +		if (!valid_state(curr, this, new_bit, excl_bit + 1))  			return 0; -#if STRICT_READ_CHECKS -		/* -		 * just marked it softirq-read-unsafe, check that no -		 * softirq-safe lock in the system ever took it in the past: -		 */ -		if (!check_usage_backwards(curr, this, -					   LOCK_USED_IN_SOFTIRQ, "soft")) + +		if (STRICT_READ_CHECKS && +			!usage(curr, this, excl_bit + 1, +				state_name(new_bit + 1)))  			return 0; -#endif -		if (softirq_verbose(hlock_class(this))) -			ret = 2; -		break; -	default: -		WARN_ON(1); -		break;  	} -	return ret; +	if (state_verbose(new_bit, hlock_class(this))) +		return 2; + +	return 1;  } +enum mark_type { +#define LOCKDEP_STATE(__STATE)	__STATE, +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; +  /*   * Mark all held locks with a usage bit:   */  static int -mark_held_locks(struct task_struct *curr, int hardirq) +mark_held_locks(struct task_struct *curr, enum mark_type mark)  {  	enum lock_usage_bit usage_bit;  	struct held_lock *hlock; @@ -2136,17 +2078,12 @@ mark_held_locks(struct task_struct *curr, int hardirq)  	for (i = 0; i < curr->lockdep_depth; i++) {  		hlock = curr->held_locks + i; -		if (hardirq) { -			if (hlock->read) -				usage_bit = LOCK_ENABLED_HARDIRQS_READ; -			else -				usage_bit = LOCK_ENABLED_HARDIRQS; -		} else { -			if (hlock->read) -				usage_bit = LOCK_ENABLED_SOFTIRQS_READ; -			else -				usage_bit = LOCK_ENABLED_SOFTIRQS; -		} +		usage_bit = 2 + (mark << 2); /* ENABLED */ +		if (hlock->read) +			usage_bit += 1; /* READ */ + +		BUG_ON(usage_bit >= LOCK_USAGE_STATES); +  		if (!mark_lock(curr, hlock, usage_bit))  			return 0;  	} @@ -2200,7 +2137,7 @@ void trace_hardirqs_on_caller(unsigned long ip)  	 * We are going to turn hardirqs on, so set the  	 * usage bit for all held locks:  	 */ -	if (!mark_held_locks(curr, 1)) +	if (!mark_held_locks(curr, HARDIRQ))  		return;  	/*  	 * If we have softirqs enabled, then set the usage @@ -2208,7 +2145,7 @@ void trace_hardirqs_on_caller(unsigned long ip)  	 * this bit from being set before)  	 */  	if (curr->softirqs_enabled) -		if (!mark_held_locks(curr, 0)) +		if (!mark_held_locks(curr, SOFTIRQ))  			return;  	curr->hardirq_enable_ip = ip; @@ -2288,7 +2225,7 @@ void trace_softirqs_on(unsigned long ip)  	 * enabled too:  	 */  	if (curr->hardirqs_enabled) -		mark_held_locks(curr, 0); +		mark_held_locks(curr, SOFTIRQ);  }  /* @@ -2317,6 +2254,48 @@ void trace_softirqs_off(unsigned long ip)  		debug_atomic_inc(&redundant_softirqs_off);  } +static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) +{ +	struct task_struct *curr = current; + +	if (unlikely(!debug_locks)) +		return; + +	/* no reclaim without waiting on it */ +	if (!(gfp_mask & __GFP_WAIT)) +		return; + +	/* this guy won't enter reclaim */ +	if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) +		return; + +	/* We're only interested __GFP_FS allocations for now */ +	if (!(gfp_mask & __GFP_FS)) +		return; + +	if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) +		return; + +	mark_held_locks(curr, RECLAIM_FS); +} + +static void check_flags(unsigned long flags); + +void lockdep_trace_alloc(gfp_t gfp_mask) +{ +	unsigned long flags; + +	if (unlikely(current->lockdep_recursion)) +		return; + +	raw_local_irq_save(flags); +	check_flags(flags); +	current->lockdep_recursion = 1; +	__lockdep_trace_alloc(gfp_mask, flags); 
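Illustrative sketch, not part of the patch: the rewritten mark_lock_irq() and mark_held_locks() above rely on the new usage-bit layout, where bit 0 selects the _READ variant, bit 1 selects ENABLED vs. USED_IN, and the remaining bits index the state from lockdep_states.h. The helper and macro names below are invented for illustration; only the arithmetic mirrors the code above (enum lock_usage_bit comes from lockdep_internals.h).

#define LOCK_USAGE_READ_MASK	1
#define LOCK_USAGE_DIR_MASK	2

static inline int usage_read(enum lock_usage_bit bit)
{
	return bit & LOCK_USAGE_READ_MASK;	/* _READ variants are the odd values */
}

static inline int usage_dir(enum lock_usage_bit bit)
{
	return bit & LOCK_USAGE_DIR_MASK;	/* 2 = ENABLED_*, 0 = USED_IN_* */
}

static inline int usage_state(enum lock_usage_bit bit)
{
	return bit >> 2;			/* 0 = HARDIRQ, 1 = SOFTIRQ, 2 = RECLAIM_FS */
}

/*
 * Example: LOCK_ENABLED_SOFTIRQ_READ == 7, so usage_read() == 1,
 * usage_dir() == 2 and usage_state() == 1 (SOFTIRQ), which is how
 * state_verbose() indexes its table and how mark_held_locks() builds
 * its "2 + (mark << 2)" ENABLED bits.
 */

The new RECLAIM_FS state is fed by lockdep_trace_alloc() and by the lockdep_set_current_reclaim_state() hooks added later in this file; the call sites live in the allocator and reclaim paths, outside this diff. A rough, hypothetical picture of the intended usage (function names invented):

/* hypothetical call sites; the real ones are in mm/, not in this diff */
static void *example_alloc(size_t size, gfp_t gfp)
{
	lockdep_trace_alloc(gfp);	/* held locks become RECLAIM_FS-unsafe */
	return kmalloc(size, gfp);
}

static void example_reclaim(void)
{
	lockdep_set_current_reclaim_state(GFP_KERNEL);
	/* locks taken here are marked LOCK_USED_IN_RECLAIM_FS by mark_irqflags() */
	lockdep_clear_current_reclaim_state();
}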
+	current->lockdep_recursion = 0; +	raw_local_irq_restore(flags); +} +  static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)  {  	/* @@ -2345,19 +2324,35 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)  	if (!hlock->hardirqs_off) {  		if (hlock->read) {  			if (!mark_lock(curr, hlock, -					LOCK_ENABLED_HARDIRQS_READ)) +					LOCK_ENABLED_HARDIRQ_READ))  				return 0;  			if (curr->softirqs_enabled)  				if (!mark_lock(curr, hlock, -						LOCK_ENABLED_SOFTIRQS_READ)) +						LOCK_ENABLED_SOFTIRQ_READ))  					return 0;  		} else {  			if (!mark_lock(curr, hlock, -					LOCK_ENABLED_HARDIRQS)) +					LOCK_ENABLED_HARDIRQ))  				return 0;  			if (curr->softirqs_enabled)  				if (!mark_lock(curr, hlock, -						LOCK_ENABLED_SOFTIRQS)) +						LOCK_ENABLED_SOFTIRQ)) +					return 0; +		} +	} + +	/* +	 * We reuse the irq context infrastructure more broadly as a general +	 * context checking code. This tests GFP_FS recursion (a lock taken +	 * during reclaim for a GFP_FS allocation is held over a GFP_FS +	 * allocation). +	 */ +	if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { +		if (hlock->read) { +			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) +					return 0; +		} else { +			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))  					return 0;  		}  	} @@ -2412,6 +2407,10 @@ static inline int separate_irq_context(struct task_struct *curr,  	return 0;  } +void lockdep_trace_alloc(gfp_t gfp_mask) +{ +} +  #endif  /* @@ -2445,14 +2444,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,  		return 0;  	switch (new_bit) { -	case LOCK_USED_IN_HARDIRQ: -	case LOCK_USED_IN_SOFTIRQ: -	case LOCK_USED_IN_HARDIRQ_READ: -	case LOCK_USED_IN_SOFTIRQ_READ: -	case LOCK_ENABLED_HARDIRQS: -	case LOCK_ENABLED_SOFTIRQS: -	case LOCK_ENABLED_HARDIRQS_READ: -	case LOCK_ENABLED_SOFTIRQS_READ: +#define LOCKDEP_STATE(__STATE)			\ +	case LOCK_USED_IN_##__STATE:		\ +	case LOCK_USED_IN_##__STATE##_READ:	\ +	case LOCK_ENABLED_##__STATE:		\ +	case LOCK_ENABLED_##__STATE##_READ: +#include "lockdep_states.h" +#undef LOCKDEP_STATE  		ret = mark_lock_irq(curr, this, new_bit);  		if (!ret)  			return 0; @@ -2966,6 +2964,16 @@ void lock_release(struct lockdep_map *lock, int nested,  }  EXPORT_SYMBOL_GPL(lock_release); +void lockdep_set_current_reclaim_state(gfp_t gfp_mask) +{ +	current->lockdep_reclaim_gfp = gfp_mask; +} + +void lockdep_clear_current_reclaim_state(void) +{ +	current->lockdep_reclaim_gfp = 0; +} +  #ifdef CONFIG_LOCK_STAT  static int  print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 56b196932c0..a2cc7e9a6e8 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -7,6 +7,45 @@   */  /* + * Lock-class usage-state bits: + */ +enum lock_usage_bit { +#define LOCKDEP_STATE(__STATE)		\ +	LOCK_USED_IN_##__STATE,		\ +	LOCK_USED_IN_##__STATE##_READ,	\ +	LOCK_ENABLED_##__STATE,		\ +	LOCK_ENABLED_##__STATE##_READ, +#include "lockdep_states.h" +#undef LOCKDEP_STATE +	LOCK_USED, +	LOCK_USAGE_STATES +}; + +/* + * Usage-state bitmasks: + */ +#define __LOCKF(__STATE)	LOCKF_##__STATE = (1 << LOCK_##__STATE), + +enum { +#define LOCKDEP_STATE(__STATE)						\ +	__LOCKF(USED_IN_##__STATE)					\ +	__LOCKF(USED_IN_##__STATE##_READ)				\ +	__LOCKF(ENABLED_##__STATE)					\ +	__LOCKF(ENABLED_##__STATE##_READ) +#include "lockdep_states.h" +#undef LOCKDEP_STATE +	__LOCKF(USED) +}; + +#define LOCKF_ENABLED_IRQ 
(LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) +#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) + +#define LOCKF_ENABLED_IRQ_READ \ +		(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) +#define LOCKF_USED_IN_IRQ_READ \ +		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) + +/*   * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies   * we track.   * @@ -31,8 +70,10 @@  extern struct list_head all_lock_classes;  extern struct lock_chain lock_chains[]; -extern void -get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); +#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) + +extern void get_usage_chars(struct lock_class *class, +			    char usage[LOCK_USAGE_CHARS]);  extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 13716b81389..d7135aa2d2c 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v)  {  	struct lock_class *class = v;  	struct lock_list *entry; -	char c1, c2, c3, c4; +	char usage[LOCK_USAGE_CHARS];  	if (v == SEQ_START_TOKEN) {  		seq_printf(m, "all lock classes:\n"); @@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v)  	seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));  #endif -	get_usage_chars(class, &c1, &c2, &c3, &c4); -	seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); +	get_usage_chars(class, usage); +	seq_printf(m, " %s", usage);  	seq_printf(m, ": ");  	print_name(m, class); @@ -300,27 +300,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v)  			nr_uncategorized++;  		if (class->usage_mask & LOCKF_USED_IN_IRQ)  			nr_irq_safe++; -		if (class->usage_mask & LOCKF_ENABLED_IRQS) +		if (class->usage_mask & LOCKF_ENABLED_IRQ)  			nr_irq_unsafe++;  		if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)  			nr_softirq_safe++; -		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) +		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)  			nr_softirq_unsafe++;  		if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)  			nr_hardirq_safe++; -		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) +		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)  			nr_hardirq_unsafe++;  		if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)  			nr_irq_read_safe++; -		if (class->usage_mask & LOCKF_ENABLED_IRQS_READ) +		if (class->usage_mask & LOCKF_ENABLED_IRQ_READ)  			nr_irq_read_unsafe++;  		if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)  			nr_softirq_read_safe++; -		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) +		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)  			nr_softirq_read_unsafe++;  		if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)  			nr_hardirq_read_safe++; -		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) +		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)  			nr_hardirq_read_unsafe++;  #ifdef CONFIG_PROVE_LOCKING @@ -601,6 +601,10 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)  static void seq_header(struct seq_file *m)  {  	seq_printf(m, "lock_stat version 0.3\n"); + +	if (unlikely(!debug_locks)) +		seq_printf(m, "*WARNING* lock debugging disabled!! 
- possibly due to a lockdep warning\n"); +  	seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));  	seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "  			"%14s %14s\n", diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h new file mode 100644 index 00000000000..995b0cc2b84 --- /dev/null +++ b/kernel/lockdep_states.h @@ -0,0 +1,9 @@ +/* + * Lockdep states, + * + * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever + * you add one, or come up with a nice dynamic solution. + */ +LOCKDEP_STATE(HARDIRQ) +LOCKDEP_STATE(SOFTIRQ) +LOCKDEP_STATE(RECLAIM_FS) diff --git a/kernel/module.c b/kernel/module.c index ba22484a987..f77ac320d0b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -51,6 +51,7 @@  #include <linux/tracepoint.h>  #include <linux/ftrace.h>  #include <linux/async.h> +#include <linux/percpu.h>  #if 0  #define DEBUGP printk @@ -366,6 +367,34 @@ static struct module *find_module(const char *name)  }  #ifdef CONFIG_SMP + +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + +static void *percpu_modalloc(unsigned long size, unsigned long align, +			     const char *name) +{ +	void *ptr; + +	if (align > PAGE_SIZE) { +		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", +		       name, align, PAGE_SIZE); +		align = PAGE_SIZE; +	} + +	ptr = __alloc_reserved_percpu(size, align); +	if (!ptr) +		printk(KERN_WARNING +		       "Could not allocate %lu bytes percpu data\n", size); +	return ptr; +} + +static void percpu_modfree(void *freeme) +{ +	free_percpu(freeme); +} + +#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +  /* Number of blocks used and allocated. */  static unsigned int pcpu_num_used, pcpu_num_allocated;  /* Size of each block.  -ve means used. */ @@ -480,21 +509,6 @@ static void percpu_modfree(void *freeme)  	}  } -static unsigned int find_pcpusec(Elf_Ehdr *hdr, -				 Elf_Shdr *sechdrs, -				 const char *secstrings) -{ -	return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); -} - -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -{ -	int cpu; - -	for_each_possible_cpu(cpu) -		memcpy(pcpudest + per_cpu_offset(cpu), from, size); -} -  static int percpu_modinit(void)  {  	pcpu_num_used = 2; @@ -513,7 +527,26 @@ static int percpu_modinit(void)  	return 0;  }  __initcall(percpu_modinit); + +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + +static unsigned int find_pcpusec(Elf_Ehdr *hdr, +				 Elf_Shdr *sechdrs, +				 const char *secstrings) +{ +	return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); +} + +static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +{ +	int cpu; + +	for_each_possible_cpu(cpu) +		memcpy(pcpudest + per_cpu_offset(cpu), from, size); +} +  #else /* ... !CONFIG_SMP */ +  static inline void *percpu_modalloc(unsigned long size, unsigned long align,  				    const char *name)  { @@ -535,6 +568,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,  	/* pcpusec should be 0, and size of that section should be 0. 
*/  	BUG_ON(size != 0);  } +  #endif /* CONFIG_SMP */  #define MODINFO_ATTR(field)	\ @@ -822,7 +856,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,  	mutex_lock(&module_mutex);  	/* Store the name of the last unloaded module for diagnostic purposes */  	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); -	unregister_dynamic_debug_module(mod->name); +	ddebug_remove_module(mod->name);  	free_module(mod);   out: @@ -1827,19 +1861,13 @@ static inline void add_kallsyms(struct module *mod,  }  #endif /* CONFIG_KALLSYMS */ -static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num) +static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)  { -#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG -	unsigned int i; - -	for (i = 0; i < num; i++) { -		register_dynamic_debug_module(debug[i].modname, -					      debug[i].type, -					      debug[i].logical_modname, -					      debug[i].flag_names, -					      debug[i].hash, debug[i].hash2); -	} -#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ +#ifdef CONFIG_DYNAMIC_DEBUG +	if (ddebug_add_module(debug, num, debug->modname)) +		printk(KERN_ERR "dynamic debug error adding module: %s\n", +					debug->modname); +#endif  }  static void *module_alloc_update_bounds(unsigned long size) @@ -2015,14 +2043,6 @@ static noinline struct module *load_module(void __user *umod,  	if (err < 0)  		goto free_mod; -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) -	mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), -				      mod->name); -	if (!mod->refptr) { -		err = -ENOMEM; -		goto free_mod; -	} -#endif  	if (pcpuindex) {  		/* We have a special allocation for this section. */  		percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, @@ -2030,7 +2050,7 @@ static noinline struct module *load_module(void __user *umod,  					 mod->name);  		if (!percpu) {  			err = -ENOMEM; -			goto free_percpu; +			goto free_mod;  		}  		sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;  		mod->percpu = percpu; @@ -2082,6 +2102,14 @@ static noinline struct module *load_module(void __user *umod,  	/* Module has been moved. */  	mod = (void *)sechdrs[modindex].sh_addr; +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) +	mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), +				      mod->name); +	if (!mod->refptr) { +		err = -ENOMEM; +		goto free_init; +	} +#endif  	/* Now we've moved module, initialize linked lists, etc. */  	module_unload_init(mod); @@ -2213,12 +2241,13 @@ static noinline struct module *load_module(void __user *umod,  	add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);  	if (!mod->taints) { -		struct mod_debug *debug; +		struct _ddebug *debug;  		unsigned int num_debug;  		debug = section_objs(hdr, sechdrs, secstrings, "__verbose",  				     sizeof(*debug), &num_debug); -		dynamic_printk_setup(debug, num_debug); +		if (debug) +			dynamic_debug_setup(debug, num_debug);  	}  	/* sechdrs[0].sh_size is always zero */ @@ -2288,15 +2317,17 @@ static noinline struct module *load_module(void __user *umod,  	ftrace_release(mod->module_core, mod->core_size);   free_unload:  	module_unload_free(mod); + free_init: +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) +	percpu_modfree(mod->refptr); +#endif  	module_free(mod, mod->module_init);   free_core:  	module_free(mod, mod->module_core); +	/* mod will be freed with core. Don't access it beyond this line! 
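A small illustration of the constraint behind the comment above (a userspace analogy with invented names, not kernel code): the module struct lives inside the core allocation, so anything reached through it, such as mod->refptr, must be released before the core itself, which is why percpu_modfree(mod->refptr) moves up into the new free_init label.

#include <stdlib.h>

struct container {		/* stands in for the module core region */
	void *sub;		/* separately allocated, like mod->refptr */
};

static void teardown(struct container *c)
{
	free(c->sub);		/* must come first: 'c' itself lives in the core block */
	free(c);		/* past this point neither 'c' nor c->sub may be touched */
}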
*/   free_percpu:  	if (percpu)  		percpu_modfree(percpu); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) -	percpu_modfree(mod->refptr); -#endif   free_mod:  	kfree(args);   free_hdr: diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 1d94160eb53..50d022e5a56 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -26,11 +26,6 @@  /*   * Must be called with lock->wait_lock held.   */ -void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) -{ -	lock->owner = new_owner; -} -  void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)  {  	memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); @@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,  	/* Mark the current thread as blocked on the lock: */  	ti->task->blocked_on = waiter; -	waiter->lock = lock;  }  void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, @@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock)  	DEBUG_LOCKS_WARN_ON(lock->magic != lock);  	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());  	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); -	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); +	mutex_clear_owner(lock);  }  void debug_mutex_init(struct mutex *lock, const char *name, @@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name,  	debug_check_no_locks_freed((void *)lock, sizeof(*lock));  	lockdep_init_map(&lock->dep_map, name, key, 0);  #endif -	lock->owner = NULL;  	lock->magic = lock;  } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index babfbdfc534..6b2d735846a 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -13,14 +13,6 @@  /*   * This must be called with lock->wait_lock held.   */ -extern void -debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); - -static inline void debug_mutex_clear_owner(struct mutex *lock) -{ -	lock->owner = NULL; -} -  extern void debug_mutex_lock_common(struct mutex *lock,  				    struct mutex_waiter *waiter);  extern void debug_mutex_wake_waiter(struct mutex *lock, @@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock);  extern void debug_mutex_init(struct mutex *lock, const char *name,  			     struct lock_class_key *key); +static inline void mutex_set_owner(struct mutex *lock) +{ +	lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +	lock->owner = NULL; +} +  #define spin_lock_mutex(lock, flags)			\  	do {						\  		struct mutex *l = container_of(lock, struct mutex, wait_lock); \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 4f45d4b658e..5d79781394a 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -10,6 +10,11 @@   * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and   * David Howells for suggestions and improvements.   * + *  - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline + *    from the -rt tree, where it was originally implemented for rtmutexes + *    by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + *    and Sven Dietrich. + *   * Also see Documentation/mutex-design.txt.   
*/  #include <linux/mutex.h> @@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)  	atomic_set(&lock->count, 1);  	spin_lock_init(&lock->wait_lock);  	INIT_LIST_HEAD(&lock->wait_list); +	mutex_clear_owner(lock);  	debug_mutex_init(lock, name, key);  } @@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock)  	 * 'unlocked' into 'locked' state.  	 */  	__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); +	mutex_set_owner(lock);  }  EXPORT_SYMBOL(mutex_lock); @@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock)  	 * The unlocking fastpath is the 0->1 transition from 'locked'  	 * into 'unlocked' state:  	 */ +#ifndef CONFIG_DEBUG_MUTEXES +	/* +	 * When debugging is enabled we must not clear the owner before time, +	 * the slow path will always be taken, and that clears the owner field +	 * after verifying that it was indeed current. +	 */ +	mutex_clear_owner(lock); +#endif  	__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);  } @@ -129,21 +144,75 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  {  	struct task_struct *task = current;  	struct mutex_waiter waiter; -	unsigned int old_val;  	unsigned long flags; +	preempt_disable(); +	mutex_acquire(&lock->dep_map, subclass, 0, ip); +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) +	/* +	 * Optimistic spinning. +	 * +	 * We try to spin for acquisition when we find that there are no +	 * pending waiters and the lock owner is currently running on a +	 * (different) CPU. +	 * +	 * The rationale is that if the lock owner is running, it is likely to +	 * release the lock soon. +	 * +	 * Since this needs the lock owner, and this mutex implementation +	 * doesn't track the owner atomically in the lock field, we need to +	 * track it non-atomically. +	 * +	 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock +	 * to serialize everything. +	 */ + +	for (;;) { +		struct thread_info *owner; + +		/* +		 * If there's an owner, wait for it to either +		 * release the lock or go to sleep. +		 */ +		owner = ACCESS_ONCE(lock->owner); +		if (owner && !mutex_spin_on_owner(lock, owner)) +			break; + +		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { +			lock_acquired(&lock->dep_map, ip); +			mutex_set_owner(lock); +			preempt_enable(); +			return 0; +		} + +		/* +		 * When there's no owner, we might have preempted between the +		 * owner acquiring the lock and setting the owner field. If +		 * we're an RT task that will live-lock because we won't let +		 * the owner complete. +		 */ +		if (!owner && (need_resched() || rt_task(task))) +			break; + +		/* +		 * The cpu_relax() call is a compiler barrier which forces +		 * everything in this loop to be re-loaded. We don't need +		 * memory barriers as we'll eventually observe the right +		 * values at the cost of a few extra spins. 
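The spin loop above depends on mutex_spin_on_owner(), which is not part of this hunk (it is added to the scheduler code elsewhere in this series). A rough sketch of the behaviour the loop expects, with owner_running() standing in for the real "is the owner currently on a CPU" test:

#include <linux/mutex.h>
#include <linux/sched.h>

static int owner_running(struct thread_info *owner);	/* assumed helper */

static int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
{
	while (ACCESS_ONCE(lock->owner) == owner) {
		/* Stop spinning if the owner sleeps or we should reschedule. */
		if (!owner_running(owner) || need_resched())
			return 0;
		cpu_relax();
	}
	/* The owner released the lock (or it changed hands): retry the cmpxchg. */
	return 1;
}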
+		 */ +		cpu_relax(); +	} +#endif  	spin_lock_mutex(&lock->wait_lock, flags);  	debug_mutex_lock_common(lock, &waiter); -	mutex_acquire(&lock->dep_map, subclass, 0, ip);  	debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));  	/* add waiting tasks to the end of the waitqueue (FIFO): */  	list_add_tail(&waiter.list, &lock->wait_list);  	waiter.task = task; -	old_val = atomic_xchg(&lock->count, -1); -	if (old_val == 1) +	if (atomic_xchg(&lock->count, -1) == 1)  		goto done;  	lock_contended(&lock->dep_map, ip); @@ -158,8 +227,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		 * that when we release the lock, we properly wake up the  		 * other waiters:  		 */ -		old_val = atomic_xchg(&lock->count, -1); -		if (old_val == 1) +		if (atomic_xchg(&lock->count, -1) == 1)  			break;  		/* @@ -173,21 +241,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  			spin_unlock_mutex(&lock->wait_lock, flags);  			debug_mutex_free_waiter(&waiter); +			preempt_enable();  			return -EINTR;  		}  		__set_task_state(task, state);  		/* didnt get the lock, go to sleep: */  		spin_unlock_mutex(&lock->wait_lock, flags); -		schedule(); +		__schedule();  		spin_lock_mutex(&lock->wait_lock, flags);  	}  done:  	lock_acquired(&lock->dep_map, ip);  	/* got the lock - rejoice! */ -	mutex_remove_waiter(lock, &waiter, task_thread_info(task)); -	debug_mutex_set_owner(lock, task_thread_info(task)); +	mutex_remove_waiter(lock, &waiter, current_thread_info()); +	mutex_set_owner(lock);  	/* set it to 0 if there are no waiters left: */  	if (likely(list_empty(&lock->wait_list))) @@ -196,6 +265,7 @@ done:  	spin_unlock_mutex(&lock->wait_lock, flags);  	debug_mutex_free_waiter(&waiter); +	preempt_enable();  	return 0;  } @@ -222,7 +292,8 @@ int __sched  mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)  {  	might_sleep(); -	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); +	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, +				   subclass, _RET_IP_);  }  EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -260,8 +331,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)  		wake_up_process(waiter->task);  	} -	debug_mutex_clear_owner(lock); -  	spin_unlock_mutex(&lock->wait_lock, flags);  } @@ -298,18 +367,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);   */  int __sched mutex_lock_interruptible(struct mutex *lock)  { +	int ret; +  	might_sleep(); -	return __mutex_fastpath_lock_retval +	ret =  __mutex_fastpath_lock_retval  			(&lock->count, __mutex_lock_interruptible_slowpath); +	if (!ret) +		mutex_set_owner(lock); + +	return ret;  }  EXPORT_SYMBOL(mutex_lock_interruptible);  int __sched mutex_lock_killable(struct mutex *lock)  { +	int ret; +  	might_sleep(); -	return __mutex_fastpath_lock_retval +	ret = __mutex_fastpath_lock_retval  			(&lock->count, __mutex_lock_killable_slowpath); +	if (!ret) +		mutex_set_owner(lock); + +	return ret;  }  EXPORT_SYMBOL(mutex_lock_killable); @@ -352,9 +433,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)  	prev = atomic_xchg(&lock->count, -1);  	if (likely(prev == 1)) { -		debug_mutex_set_owner(lock, current_thread_info()); +		mutex_set_owner(lock);  		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);  	} +  	/* Set it back to 0 if there are no waiters: */  	if (likely(list_empty(&lock->wait_list)))  		atomic_set(&lock->count, 0); @@ -380,8 +462,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)   
*/  int __sched mutex_trylock(struct mutex *lock)  { -	return __mutex_fastpath_trylock(&lock->count, -					__mutex_trylock_slowpath); +	int ret; + +	ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); +	if (ret) +		mutex_set_owner(lock); + +	return ret;  }  EXPORT_SYMBOL(mutex_trylock); diff --git a/kernel/mutex.h b/kernel/mutex.h index a075dafbb29..67578ca48f9 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -16,8 +16,26 @@  #define mutex_remove_waiter(lock, waiter, ti) \  		__list_del((waiter)->list.prev, (waiter)->list.next) -#define debug_mutex_set_owner(lock, new_owner)		do { } while (0) -#define debug_mutex_clear_owner(lock)			do { } while (0) +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ +	lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +	lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif +  #define debug_mutex_wake_waiter(lock, waiter)		do { } while (0)  #define debug_mutex_free_waiter(waiter)			do { } while (0)  #define debug_mutex_add_waiter(lock, waiter, ti)	do { } while (0) diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 78bc3fdac0d..5aa854f9e5a 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)  /*   * Rules: - *   1. you can only enter a cgroup which is a child of your current + *   1. you can only enter a cgroup which is a descendant of your current   *     cgroup   *   2. you can only place another process into a cgroup if   *     a. you have CAP_SYS_ADMIN @@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)  static int ns_can_attach(struct cgroup_subsys *ss,  		struct cgroup *new_cgroup, struct task_struct *task)  { -	struct cgroup *orig; -  	if (current != task) {  		if (!capable(CAP_SYS_ADMIN))  			return -EPERM; -		if (!cgroup_is_descendant(new_cgroup)) +		if (!cgroup_is_descendant(new_cgroup, current))  			return -EPERM;  	} -	if (atomic_read(&new_cgroup->count) != 0) -		return -EPERM; - -	orig = task_cgroup(task, ns_subsys_id); -	if (orig && orig != new_cgroup->parent) +	if (!cgroup_is_descendant(new_cgroup, task))  		return -EPERM;  	return 0; @@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,  	if (!capable(CAP_SYS_ADMIN))  		return ERR_PTR(-EPERM); -	if (!cgroup_is_descendant(cgroup)) +	if (!cgroup_is_descendant(cgroup, current))  		return ERR_PTR(-EPERM);  	ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); diff --git a/kernel/panic.c b/kernel/panic.c index 2a2ff36ff44..3fd8c5bf8b3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -8,19 +8,19 @@   * This function is used through-out the kernel (including mm and fs)   * to indicate a major problem.   
*/ +#include <linux/debug_locks.h> +#include <linux/interrupt.h> +#include <linux/kallsyms.h> +#include <linux/notifier.h>  #include <linux/module.h> -#include <linux/sched.h> -#include <linux/delay.h> +#include <linux/random.h>  #include <linux/reboot.h> -#include <linux/notifier.h> -#include <linux/init.h> +#include <linux/delay.h> +#include <linux/kexec.h> +#include <linux/sched.h>  #include <linux/sysrq.h> -#include <linux/interrupt.h> +#include <linux/init.h>  #include <linux/nmi.h> -#include <linux/kexec.h> -#include <linux/debug_locks.h> -#include <linux/random.h> -#include <linux/kallsyms.h>  #include <linux/dmi.h>  int panic_on_oops; @@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink);   *   *	This function never returns.   */ -  NORET_TYPE void panic(const char * fmt, ...)  { -	long i;  	static char buf[1024];  	va_list args; -#if defined(CONFIG_S390) -	unsigned long caller = (unsigned long) __builtin_return_address(0); -#endif +	long i;  	/* -	 * It's possible to come here directly from a panic-assertion and not -	 * have preempt disabled. Some functions called from here want +	 * It's possible to come here directly from a panic-assertion and +	 * not have preempt disabled. Some functions called from here want  	 * preempt to be disabled. No point enabling it later though...  	 */  	preempt_disable(); @@ -74,7 +70,9 @@ NORET_TYPE void panic(const char * fmt, ...)  	vsnprintf(buf, sizeof(buf), fmt, args);  	va_end(args);  	printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); -	bust_spinlocks(0); +#ifdef CONFIG_DEBUG_BUGVERBOSE +	dump_stack(); +#endif  	/*  	 * If we have crashed and we have a crash kernel loaded let it handle @@ -83,14 +81,12 @@ NORET_TYPE void panic(const char * fmt, ...)  	 */  	crash_kexec(NULL); -#ifdef CONFIG_SMP  	/*  	 * Note smp_send_stop is the usual smp shutdown function, which  	 * unfortunately means it may not be hardened to work in a panic  	 * situation.  	 */  	smp_send_stop(); -#endif  	atomic_notifier_call_chain(&panic_notifier_list, 0, buf); @@ -99,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...)  	if (panic_timeout > 0) {  		/* -	 	 * Delay timeout seconds before rebooting the machine.  -		 * We can't use the "normal" timers since we just panicked.. -	 	 */ -		printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); +		 * Delay timeout seconds before rebooting the machine. +		 * We can't use the "normal" timers since we just panicked. +		 */ +		printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); +  		for (i = 0; i < panic_timeout*1000; ) {  			touch_nmi_watchdog();  			i += panic_blink(i);  			mdelay(1);  			i++;  		} -		/*	This will not be a clean reboot, with everything -		 *	shutting down.  But if there is a chance of -		 *	rebooting the system it will be rebooted. +		/* +		 * This will not be a clean reboot, with everything +		 * shutting down.  But if there is a chance of +		 * rebooting the system it will be rebooted.  		 */  		emergency_restart();  	} @@ -124,38 +122,44 @@ NORET_TYPE void panic(const char * fmt, ...)  	
}  #endif  #if defined(CONFIG_S390) -	disabled_wait(caller); +	{ +		unsigned long caller; + +		caller = (unsigned long)__builtin_return_address(0); +		disabled_wait(caller); +	}  #endif  	local_irq_enable(); -	for (i = 0;;) { +	for (i = 0; ; ) {  		touch_softlockup_watchdog();  		i += panic_blink(i);  		mdelay(1);  		i++;  	} +	bust_spinlocks(0);  }  EXPORT_SYMBOL(panic);  struct tnt { -	u8 bit; -	char true; -	char false; +	u8	bit; +	char	true; +	char	false;  };  static const struct tnt tnts[] = { -	{ TAINT_PROPRIETARY_MODULE, 'P', 'G' }, -	{ TAINT_FORCED_MODULE, 'F', ' ' }, -	{ TAINT_UNSAFE_SMP, 'S', ' ' }, -	{ TAINT_FORCED_RMMOD, 'R', ' ' }, -	{ TAINT_MACHINE_CHECK, 'M', ' ' }, -	{ TAINT_BAD_PAGE, 'B', ' ' }, -	{ TAINT_USER, 'U', ' ' }, -	{ TAINT_DIE, 'D', ' ' }, -	{ TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, -	{ TAINT_WARN, 'W', ' ' }, -	{ TAINT_CRAP, 'C', ' ' }, +	{ TAINT_PROPRIETARY_MODULE,	'P', 'G' }, +	{ TAINT_FORCED_MODULE,		'F', ' ' }, +	{ TAINT_UNSAFE_SMP,		'S', ' ' }, +	{ TAINT_FORCED_RMMOD,		'R', ' ' }, +	{ TAINT_MACHINE_CHECK,		'M', ' ' }, +	{ TAINT_BAD_PAGE,		'B', ' ' }, +	{ TAINT_USER,			'U', ' ' }, +	{ TAINT_DIE,			'D', ' ' }, +	{ TAINT_OVERRIDDEN_ACPI_TABLE,	'A', ' ' }, +	{ TAINT_WARN,			'W', ' ' }, +	{ TAINT_CRAP,			'C', ' ' },  };  /** @@ -192,7 +196,8 @@ const char *print_tainted(void)  		*s = 0;  	} else  		snprintf(buf, sizeof(buf), "Not tainted"); -	return(buf); + +	return buf;  }  int test_taint(unsigned flag) @@ -208,7 +213,8 @@ unsigned long get_taint(void)  void add_taint(unsigned flag)  { -	debug_locks = 0; /* can't trust the integrity of the kernel anymore */ +	/* can't trust the integrity of the kernel anymore: */ +	debug_locks = 0;  	set_bit(flag, &tainted_mask);  }  EXPORT_SYMBOL(add_taint); @@ -263,8 +269,8 @@ static void do_oops_enter_exit(void)  }  /* - * Return true if the calling CPU is allowed to print oops-related info.  This - * is a bit racy.. + * Return true if the calling CPU is allowed to print oops-related info. + * This is a bit racy..   */  int oops_may_print(void)  { @@ -273,20 +279,22 @@ int oops_may_print(void)  /*   * Called when the architecture enters its oops handler, before it prints - * anything.  If this is the first CPU to oops, and it's oopsing the first time - * then let it proceed. + * anything.  If this is the first CPU to oops, and it's oopsing the first + * time then let it proceed.   * - * This is all enabled by the pause_on_oops kernel boot option.  We do all this - * to ensure that oopses don't scroll off the screen.  It has the side-effect - * of preventing later-oopsing CPUs from mucking up the display, too. + * This is all enabled by the pause_on_oops kernel boot option.  We do all + * this to ensure that oopses don't scroll off the screen.  It has the + * side-effect of preventing later-oopsing CPUs from mucking up the display, + * too.   * - * It turns out that the CPU which is allowed to print ends up pausing for the - * right duration, whereas all the other CPUs pause for twice as long: once in - * oops_enter(), once in oops_exit(). + * It turns out that the CPU which is allowed to print ends up pausing for + * the right duration, whereas all the other CPUs pause for twice as long: + * once in oops_enter(), once in oops_exit().   
*/  void oops_enter(void)  { -	debug_locks_off(); /* can't trust the integrity of the kernel anymore */ +	/* can't trust the integrity of the kernel anymore: */ +	debug_locks_off();  	do_oops_enter_exit();  } @@ -355,15 +363,18 @@ EXPORT_SYMBOL(warn_slowpath);  #endif  #ifdef CONFIG_CC_STACKPROTECTOR +  /*   * Called when gcc's -fstack-protector feature is used, and   * gcc detects corruption of the on-stack canary value   */  void __stack_chk_fail(void)  { -	panic("stack-protector: Kernel stack is corrupted"); +	panic("stack-protector: Kernel stack is corrupted in: %p\n", +		__builtin_return_address(0));  }  EXPORT_SYMBOL(__stack_chk_fail); +  #endif  core_param(panic, panic_timeout, int, 0644); diff --git a/kernel/pid.c b/kernel/pid.c index 1b3586fe753..b2e5f78fd28 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -403,6 +403,8 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)  {  	struct pid *pid;  	rcu_read_lock(); +	if (type != PIDTYPE_PID) +		task = task->group_leader;  	pid = get_pid(task->pids[type].pid);  	rcu_read_unlock();  	return pid; @@ -450,11 +452,24 @@ pid_t pid_vnr(struct pid *pid)  }  EXPORT_SYMBOL_GPL(pid_vnr); -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, +			struct pid_namespace *ns)  { -	return pid_nr_ns(task_pid(tsk), ns); +	pid_t nr = 0; + +	rcu_read_lock(); +	if (!ns) +		ns = current->nsproxy->pid_ns; +	if (likely(pid_alive(task))) { +		if (type != PIDTYPE_PID) +			task = task->group_leader; +		nr = pid_nr_ns(task->pids[type].pid, ns); +	} +	rcu_read_unlock(); + +	return nr;  } -EXPORT_SYMBOL(task_pid_nr_ns); +EXPORT_SYMBOL(__task_pid_nr_ns);  pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)  { @@ -462,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)  }  EXPORT_SYMBOL(task_tgid_nr_ns); -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ -	return pid_nr_ns(task_pgrp(tsk), ns); -} -EXPORT_SYMBOL(task_pgrp_nr_ns); - -pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ -	return pid_nr_ns(task_session(tsk), ns); -} -EXPORT_SYMBOL(task_session_nr_ns); -  struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)  {  	return ns_of_pid(task_pid(tsk)); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index fab8ea86fac..2d1001b4858 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -152,6 +152,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  {  	int nr;  	int rc; +	struct task_struct *task;  	/*  	 * The last thread in the cgroup-init thread group is terminating. 
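The task_pgrp_nr_ns()/task_session_nr_ns() exports removed above are presumably replaced by inline wrappers around the new __task_pid_nr_ns() in include/linux/sched.h (not part of this diff), along these lines:

#include <linux/pid.h>
#include <linux/sched.h>

static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
				    struct pid_namespace *ns)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
}

static inline pid_t task_session_nr_ns(struct task_struct *tsk,
				       struct pid_namespace *ns)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
}

This is also why __task_pid_nr_ns() switches to the group leader for any type other than PIDTYPE_PID: process-group and session pids hang off the thread group leader's pid links.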
@@ -169,7 +170,19 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  	read_lock(&tasklist_lock);  	nr = next_pidmap(pid_ns, 1);  	while (nr > 0) { -		kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); +		rcu_read_lock(); + +		/* +		 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring +		 * any nested-container's init processes don't ignore the +		 * signal +		 */ +		task = pid_task(find_vpid(nr), PIDTYPE_PID); +		if (task) +			force_sig(SIGKILL, task); + +		rcu_read_unlock(); +  		nr = next_pidmap(pid_ns, nr);  	}  	read_unlock(&tasklist_lock); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e976e505648..8e5d9a68b02 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1370,7 +1370,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk)  		if (task_cputime_expired(&group_sample, &sig->cputime_expires))  			return 1;  	} -	return 0; + +	return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY;  }  /* diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 4a4a206b197..5f21ab2bbcd 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,6 +22,7 @@  #include <linux/console.h>  #include <linux/cpu.h>  #include <linux/freezer.h> +#include <asm/suspend.h>  #include "power.h" @@ -214,7 +215,7 @@ static int create_image(int platform_mode)  		return error;  	device_pm_lock(); -	local_irq_disable(); +  	/* At this point, device_suspend() has been called, but *not*  	 * device_power_down(). We *must* call device_power_down() now.  	 * Otherwise, drivers for some devices (e.g. interrupt controllers) @@ -225,13 +226,25 @@ static int create_image(int platform_mode)  	if (error) {  		printk(KERN_ERR "PM: Some devices failed to power down, "  			"aborting hibernation\n"); -		goto Enable_irqs; +		goto Unlock;  	} + +	error = platform_pre_snapshot(platform_mode); +	if (error || hibernation_test(TEST_PLATFORM)) +		goto Platform_finish; + +	error = disable_nonboot_cpus(); +	if (error || hibernation_test(TEST_CPUS) +	    || hibernation_testmode(HIBERNATION_TEST)) +		goto Enable_cpus; + +	local_irq_disable(); +  	sysdev_suspend(PMSG_FREEZE);  	if (error) {  		printk(KERN_ERR "PM: Some devices failed to power down, "  			"aborting hibernation\n"); -		goto Power_up_devices; +		goto Enable_irqs;  	}  	if (hibernation_test(TEST_CORE)) @@ -247,17 +260,28 @@ static int create_image(int platform_mode)  	restore_processor_state();  	if (!in_suspend)  		platform_leave(platform_mode); +   Power_up:  	sysdev_resume();  	/* NOTE:  device_power_up() is just a resume() for devices  	 * that suspended with irqs off ... no overall powerup.  	 */ - Power_up_devices: -	device_power_up(in_suspend ? -		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); +   Enable_irqs:  	local_irq_enable(); + + Enable_cpus: +	enable_nonboot_cpus(); + + Platform_finish: +	platform_finish(platform_mode); + +	device_power_up(in_suspend ? +		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + + Unlock:  	device_pm_unlock(); +  	return error;  } @@ -265,7 +289,7 @@ static int create_image(int platform_mode)   *	hibernation_snapshot - quiesce devices and create the hibernation   *	snapshot image.   *	@platform_mode - if set, use the platform driver, if available, to - *			 prepare the platform frimware for the power transition. + *			 prepare the platform firmware for the power transition.   
*   *	Must be called with pm_mutex held   */ @@ -291,25 +315,9 @@ int hibernation_snapshot(int platform_mode)  	if (hibernation_test(TEST_DEVICES))  		goto Recover_platform; -	error = platform_pre_snapshot(platform_mode); -	if (error || hibernation_test(TEST_PLATFORM)) -		goto Finish; - -	error = disable_nonboot_cpus(); -	if (!error) { -		if (hibernation_test(TEST_CPUS)) -			goto Enable_cpus; - -		if (hibernation_testmode(HIBERNATION_TEST)) -			goto Enable_cpus; +	error = create_image(platform_mode); +	/* Control returns here after successful restore */ -		error = create_image(platform_mode); -		/* Control returns here after successful restore */ -	} - Enable_cpus: -	enable_nonboot_cpus(); - Finish: -	platform_finish(platform_mode);   Resume_devices:  	device_resume(in_suspend ?  		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); @@ -331,19 +339,33 @@ int hibernation_snapshot(int platform_mode)   *	kernel.   */ -static int resume_target_kernel(void) +static int resume_target_kernel(bool platform_mode)  {  	int error;  	device_pm_lock(); -	local_irq_disable(); +  	error = device_power_down(PMSG_QUIESCE);  	if (error) {  		printk(KERN_ERR "PM: Some devices failed to power down, "  			"aborting resume\n"); -		goto Enable_irqs; +		goto Unlock;  	} -	sysdev_suspend(PMSG_QUIESCE); + +	error = platform_pre_restore(platform_mode); +	if (error) +		goto Cleanup; + +	error = disable_nonboot_cpus(); +	if (error) +		goto Enable_cpus; + +	local_irq_disable(); + +	error = sysdev_suspend(PMSG_QUIESCE); +	if (error) +		goto Enable_irqs; +  	/* We'll ignore saved state, but this gets preempt count (etc) right */  	save_processor_state();  	error = restore_highmem(); @@ -366,11 +388,23 @@ static int resume_target_kernel(void)  	swsusp_free();  	restore_processor_state();  	touch_softlockup_watchdog(); +  	sysdev_resume(); -	device_power_up(PMSG_RECOVER); +   Enable_irqs:  	local_irq_enable(); + + Enable_cpus: +	enable_nonboot_cpus(); + + Cleanup: +	platform_restore_cleanup(platform_mode); + +	device_power_up(PMSG_RECOVER); + + Unlock:  	device_pm_unlock(); +  	return error;  } @@ -378,7 +412,7 @@ static int resume_target_kernel(void)   *	hibernation_restore - quiesce devices and restore the hibernation   *	snapshot image.  If successful, control returns in hibernation_snaphot()   *	@platform_mode - if set, use the platform driver, if available, to - *			 prepare the platform frimware for the transition. + *			 prepare the platform firmware for the transition.   
*   *	Must be called with pm_mutex held   */ @@ -390,19 +424,10 @@ int hibernation_restore(int platform_mode)  	pm_prepare_console();  	suspend_console();  	error = device_suspend(PMSG_QUIESCE); -	if (error) -		goto Finish; - -	error = platform_pre_restore(platform_mode);  	if (!error) { -		error = disable_nonboot_cpus(); -		if (!error) -			error = resume_target_kernel(); -		enable_nonboot_cpus(); +		error = resume_target_kernel(platform_mode); +		device_resume(PMSG_RECOVER);  	} -	platform_restore_cleanup(platform_mode); -	device_resume(PMSG_RECOVER); - Finish:  	resume_console();  	pm_restore_console();  	return error; @@ -438,38 +463,46 @@ int hibernation_platform_enter(void)  		goto Resume_devices;  	} +	device_pm_lock(); + +	error = device_power_down(PMSG_HIBERNATE); +	if (error) +		goto Unlock; +  	error = hibernation_ops->prepare();  	if (error) -		goto Resume_devices; +		goto Platofrm_finish;  	error = disable_nonboot_cpus();  	if (error) -		goto Finish; +		goto Platofrm_finish; -	device_pm_lock();  	local_irq_disable(); -	error = device_power_down(PMSG_HIBERNATE); -	if (!error) { -		sysdev_suspend(PMSG_HIBERNATE); -		hibernation_ops->enter(); -		/* We should never get here */ -		while (1); -	} -	local_irq_enable(); -	device_pm_unlock(); +	sysdev_suspend(PMSG_HIBERNATE); +	hibernation_ops->enter(); +	/* We should never get here */ +	while (1);  	/*  	 * We don't need to reenable the nonboot CPUs or resume consoles, since  	 * the system is going to be halted anyway.  	 */ - Finish: + Platofrm_finish:  	hibernation_ops->finish(); + +	device_power_up(PMSG_RESTORE); + + Unlock: +	device_pm_unlock(); +   Resume_devices:  	entering_platform_hibernation = false;  	device_resume(PMSG_RESTORE);  	resume_console(); +   Close:  	hibernation_ops->end(); +  	return error;  } diff --git a/kernel/power/main.c b/kernel/power/main.c index c9632f841f6..f172f41858b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -287,17 +287,32 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)   */  static int suspend_enter(suspend_state_t state)  { -	int error = 0; +	int error;  	device_pm_lock(); -	arch_suspend_disable_irqs(); -	BUG_ON(!irqs_disabled()); -	if ((error = device_power_down(PMSG_SUSPEND))) { +	error = device_power_down(PMSG_SUSPEND); +	if (error) {  		printk(KERN_ERR "PM: Some devices failed to power down\n");  		goto Done;  	} +	if (suspend_ops->prepare) { +		error = suspend_ops->prepare(); +		if (error) +			goto Power_up_devices; +	} + +	if (suspend_test(TEST_PLATFORM)) +		goto Platfrom_finish; + +	error = disable_nonboot_cpus(); +	if (error || suspend_test(TEST_CPUS)) +		goto Enable_cpus; + +	arch_suspend_disable_irqs(); +	BUG_ON(!irqs_disabled()); +  	error = sysdev_suspend(PMSG_SUSPEND);  	if (!error) {  		if (!suspend_test(TEST_CORE)) @@ -305,11 +320,22 @@ static int suspend_enter(suspend_state_t state)  		sysdev_resume();  	} -	device_power_up(PMSG_RESUME); - Done:  	arch_suspend_enable_irqs();  	BUG_ON(irqs_disabled()); + + Enable_cpus: +	enable_nonboot_cpus(); + + Platfrom_finish: +	if (suspend_ops->finish) +		suspend_ops->finish(); + + Power_up_devices: +	device_power_up(PMSG_RESUME); + + Done:  	device_pm_unlock(); +  	return error;  } @@ -341,23 +367,8 @@ int suspend_devices_and_enter(suspend_state_t state)  	if (suspend_test(TEST_DEVICES))  		goto Recover_platform; -	if (suspend_ops->prepare) { -		error = suspend_ops->prepare(); -		if (error) -			goto Resume_devices; -	} - -	if (suspend_test(TEST_PLATFORM)) -		goto Finish; - -	error = disable_nonboot_cpus(); -	if 
(!error && !suspend_test(TEST_CPUS)) -		suspend_enter(state); +	suspend_enter(state); -	enable_nonboot_cpus(); - Finish: -	if (suspend_ops->finish) -		suspend_ops->finish();   Resume_devices:  	suspend_test_start();  	device_resume(PMSG_RESUME); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f5fc2d7680f..33e2e4a819f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)  	INIT_LIST_HEAD(list); -	for_each_zone(zone) { +	for_each_populated_zone(zone) {  		unsigned long zone_start, zone_end;  		struct mem_extent *ext, *cur, *aux; -		if (!populated_zone(zone)) -			continue; -  		zone_start = zone->zone_start_pfn;  		zone_end = zone->zone_start_pfn + zone->spanned_pages; @@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void)  	struct zone *zone;  	unsigned int cnt = 0; -	for_each_zone(zone) -		if (populated_zone(zone) && is_highmem(zone)) +	for_each_populated_zone(zone) +		if (is_highmem(zone))  			cnt += zone_page_state(zone, NR_FREE_PAGES);  	return cnt; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index a92c9145155..78c35047586 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -51,6 +51,7 @@  #include <linux/highmem.h>  #include <linux/time.h>  #include <linux/rbtree.h> +#include <linux/io.h>  #include "power.h" @@ -229,17 +230,16 @@ int swsusp_shrink_memory(void)  		size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;  		tmp = size;  		size += highmem_size; -		for_each_zone (zone) -			if (populated_zone(zone)) { -				tmp += snapshot_additional_pages(zone); -				if (is_highmem(zone)) { -					highmem_size -= +		for_each_populated_zone(zone) { +			tmp += snapshot_additional_pages(zone); +			if (is_highmem(zone)) { +				highmem_size -=  					zone_page_state(zone, NR_FREE_PAGES); -				} else { -					tmp -= zone_page_state(zone, NR_FREE_PAGES); -					tmp += zone->lowmem_reserve[ZONE_NORMAL]; -				} +			} else { +				tmp -= zone_page_state(zone, NR_FREE_PAGES); +				tmp += zone->lowmem_reserve[ZONE_NORMAL];  			} +		}  		if (highmem_size < 0)  			highmem_size = 0; diff --git a/kernel/printk.c b/kernel/printk.c index e3602d0755b..5052b5497c6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -32,6 +32,7 @@  #include <linux/security.h>  #include <linux/bootmem.h>  #include <linux/syscalls.h> +#include <linux/kexec.h>  #include <asm/uaccess.h> @@ -135,6 +136,24 @@ static char *log_buf = __log_buf;  static int log_buf_len = __LOG_BUF_LEN;  static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ +#ifdef CONFIG_KEXEC +/* + * This appends the listed symbols to /proc/vmcoreinfo + * + * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to + * obtain access to symbols that are otherwise very difficult to locate.  These + * symbols are specifically used so that utilities can access and extract the + * dmesg log from a vmcore file after a crash. 
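The new hook is presumably wired up from the vmcoreinfo initialisation in kernel/kexec.c, which is not part of this hunk; roughly as follows, with the existing VMCOREINFO entries elided:

static int __init crash_save_vmcoreinfo_init(void)
{
	/* ... existing VMCOREINFO_OSRELEASE()/VMCOREINFO_SYMBOL() entries ... */
	log_buf_kexec_setup();	/* append log_buf, log_end, log_buf_len, logged_chars */
	/* ... */
	return 0;
}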
+ */ +void log_buf_kexec_setup(void) +{ +	VMCOREINFO_SYMBOL(log_buf); +	VMCOREINFO_SYMBOL(log_end); +	VMCOREINFO_SYMBOL(log_buf_len); +	VMCOREINFO_SYMBOL(logged_chars); +} +#endif +  static int __init log_buf_len_setup(char *str)  {  	unsigned size = memparse(str, &str); @@ -1292,8 +1311,11 @@ EXPORT_SYMBOL(printk_ratelimit);  bool printk_timed_ratelimit(unsigned long *caller_jiffies,  			unsigned int interval_msecs)  { -	if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { -		*caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); +	if (*caller_jiffies == 0 +			|| !time_in_range(jiffies, *caller_jiffies, +					*caller_jiffies +					+ msecs_to_jiffies(interval_msecs))) { +		*caller_jiffies = jiffies;  		return true;  	}  	return false; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c9cf48b21f0..aaad0ec3419 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -60,11 +60,15 @@ static void ptrace_untrace(struct task_struct *child)  {  	spin_lock(&child->sighand->siglock);  	if (task_is_traced(child)) { -		if (child->signal->flags & SIGNAL_STOP_STOPPED) { +		/* +		 * If the group stop is completed or in progress, +		 * this thread was already counted as stopped. +		 */ +		if (child->signal->flags & SIGNAL_STOP_STOPPED || +		    child->signal->group_stop_count)  			__set_task_state(child, TASK_STOPPED); -		} else { +		else  			signal_wake_up(child, 1); -		}  	}  	spin_unlock(&child->sighand->siglock);  } @@ -235,18 +239,58 @@ out:  	return retval;  } -static inline void __ptrace_detach(struct task_struct *child, unsigned int data) +/* + * Called with irqs disabled, returns true if childs should reap themselves. + */ +static int ignoring_children(struct sighand_struct *sigh)  { -	child->exit_code = data; -	/* .. re-parent .. */ -	__ptrace_unlink(child); -	/* .. and wake it up. */ -	if (child->exit_state != EXIT_ZOMBIE) -		wake_up_process(child); +	int ret; +	spin_lock(&sigh->siglock); +	ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || +	      (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); +	spin_unlock(&sigh->siglock); +	return ret; +} + +/* + * Called with tasklist_lock held for writing. + * Unlink a traced task, and clean it up if it was a traced zombie. + * Return true if it needs to be reaped with release_task(). + * (We can't call release_task() here because we already hold tasklist_lock.) + * + * If it's a zombie, our attachedness prevented normal parent notification + * or self-reaping.  Do notification now if it would have happened earlier. + * If it should reap itself, return true. + * + * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. + */ +static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) +{ +	__ptrace_unlink(p); + +	if (p->exit_state == EXIT_ZOMBIE) { +		if (!task_detached(p) && thread_group_empty(p)) { +			if (!same_thread_group(p->real_parent, tracer)) +				do_notify_parent(p, p->exit_signal); +			else if (ignoring_children(tracer->sighand)) +				p->exit_signal = -1; +		} +		if (task_detached(p)) { +			/* Mark it as in the process of being reaped. 
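The exit_ptrace() helper introduced just below is presumably called from the tracer's exit path in kernel/exit.c (not shown in this diff), roughly:

static void forget_original_parent(struct task_struct *father)
{
	/*
	 * Detach every task we were tracing; traced zombies that need
	 * reaping are released inside exit_ptrace() itself.
	 */
	exit_ptrace(father);

	/* ... then reparent father's remaining children as before ... */
}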
*/ +			p->exit_state = EXIT_DEAD; +			return true; +		} +	} + +	return false;  }  int ptrace_detach(struct task_struct *child, unsigned int data)  { +	bool dead = false; +  	if (!valid_signal(data))  		return -EIO; @@ -255,14 +299,45 @@ int ptrace_detach(struct task_struct *child, unsigned int data)  	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);  	write_lock_irq(&tasklist_lock); -	/* protect against de_thread()->release_task() */ -	if (child->ptrace) -		__ptrace_detach(child, data); +	/* +	 * This child can be already killed. Make sure de_thread() or +	 * our sub-thread doing do_wait() didn't do release_task() yet. +	 */ +	if (child->ptrace) { +		child->exit_code = data; +		dead = __ptrace_detach(current, child); +	}  	write_unlock_irq(&tasklist_lock); +	if (unlikely(dead)) +		release_task(child); +  	return 0;  } +/* + * Detach all tasks we were using ptrace on. + */ +void exit_ptrace(struct task_struct *tracer) +{ +	struct task_struct *p, *n; +	LIST_HEAD(ptrace_dead); + +	write_lock_irq(&tasklist_lock); +	list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { +		if (__ptrace_detach(tracer, p)) +			list_add(&p->ptrace_entry, &ptrace_dead); +	} +	write_unlock_irq(&tasklist_lock); + +	BUG_ON(!list_empty(&tracer->ptraced)); + +	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { +		list_del_init(&p->ptrace_entry); +		release_task(p); +	} +} +  int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)  {  	int copied = 0; @@ -612,8 +687,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)  		goto out_put_task_struct;  	ret = arch_ptrace(child, request, addr, data); -	if (ret < 0) -		goto out_put_task_struct;   out_put_task_struct:  	put_task_struct(child); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index bd5a9003497..654c640a6b9 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu)  void rcu_check_callbacks(int cpu, int user)  {  	if (user || -	    (idle_cpu(cpu) && !in_softirq() && -				hardirq_count() <= (1 << HARDIRQ_SHIFT))) { +	    (idle_cpu(cpu) && rcu_scheduler_active && +	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {  		/*  		 * Get here if this CPU took its interrupt from user diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index d92a76a881a..2c7b8457d0d 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,6 +44,7 @@  #include <linux/cpu.h>  #include <linux/mutex.h>  #include <linux/module.h> +#include <linux/kernel_stat.h>  enum rcu_barrier {  	RCU_BARRIER_STD, @@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};  static atomic_t rcu_barrier_cpu_count;  static DEFINE_MUTEX(rcu_barrier_mutex);  static struct completion rcu_barrier_completion; +int rcu_scheduler_active __read_mostly;  /*   * Awaken the corresponding synchronize_rcu() instance now that a @@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head  *head)  void synchronize_rcu(void)  {  	struct rcu_synchronize rcu; + +	if (rcu_blocking_is_gp()) +		return; +  	init_completion(&rcu.completion);  	/* Will wake me after RCU finished. */  	call_rcu(&rcu.head, wakeme_after_rcu); @@ -116,6 +122,8 @@ static void rcu_barrier_func(void *type)  	}  } +static inline void wait_migrated_callbacks(void); +  /*   * Orchestrate the specified type of RCU barrier, waiting for all   * RCU callbacks of the specified type to complete. 
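The early return added to synchronize_rcu() above depends on rcu_blocking_is_gp(), defined in a header outside this diff. Given the matching single-CPU test added to __synchronize_sched() further down, it is presumably just:

static inline int rcu_blocking_is_gp(void)
{
	/* With one online CPU, any blocking context is already a grace period. */
	return num_online_cpus() == 1;
}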
@@ -141,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrier type)  		complete(&rcu_barrier_completion);  	wait_for_completion(&rcu_barrier_completion);  	mutex_unlock(&rcu_barrier_mutex); +	wait_migrated_callbacks();  }  /** @@ -170,8 +179,55 @@ void rcu_barrier_sched(void)  }  EXPORT_SYMBOL_GPL(rcu_barrier_sched); +static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); +static struct rcu_head rcu_migrate_head[3]; +static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); + +static void rcu_migrate_callback(struct rcu_head *notused) +{ +	if (atomic_dec_and_test(&rcu_migrate_type_count)) +		wake_up(&rcu_migrate_wq); +} + +static inline void wait_migrated_callbacks(void) +{ +	wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); +} + +static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, +		unsigned long action, void *hcpu) +{ +	if (action == CPU_DYING) { +		/* +		 * preempt_disable() in on_each_cpu() prevents stop_machine(), +		 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" +		 * returns, all online cpus have queued rcu_barrier_func(), +		 * and the dead cpu(if it exist) queues rcu_migrate_callback()s. +		 * +		 * These callbacks ensure _rcu_barrier() waits for all +		 * RCU callbacks of the specified type to complete. +		 */ +		atomic_set(&rcu_migrate_type_count, 3); +		call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); +		call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); +		call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); +	} else if (action == CPU_POST_DEAD) { +		/* rcu_migrate_head is protected by cpu_add_remove_lock */ +		wait_migrated_callbacks(); +	} + +	return NOTIFY_OK; +} +  void __init rcu_init(void)  {  	__rcu_init(); +	hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);  } +void rcu_scheduler_starting(void) +{ +	WARN_ON(num_online_cpus() != 1); +	WARN_ON(nr_context_switches() > 0); +	rcu_scheduler_active = 1; +} diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 33cfc50781f..5d59e850fb7 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1181,6 +1181,9 @@ void __synchronize_sched(void)  {  	struct rcu_synchronize rcu; +	if (num_online_cpus() == 1) +		return;  /* blocking is gp if only one CPU! */ +  	init_completion(&rcu.completion);  	/* Will wake me after RCU finished. 
*/  	call_rcu_sched(&rcu.head, wakeme_after_rcu); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 7c4142a79f0..9b4a975a4b4 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -126,6 +126,7 @@ static atomic_t n_rcu_torture_mberror;  static atomic_t n_rcu_torture_error;  static long n_rcu_torture_timers = 0;  static struct list_head rcu_torture_removed; +static cpumask_var_t shuffle_tmp_mask;  static int stutter_pause_test = 0; @@ -889,10 +890,9 @@ static int rcu_idle_cpu;	/* Force all torture tasks off this CPU */   */  static void rcu_torture_shuffle_tasks(void)  { -	cpumask_t tmp_mask;  	int i; -	cpus_setall(tmp_mask); +	cpumask_setall(shuffle_tmp_mask);  	get_online_cpus();  	/* No point in shuffling if there is only one online CPU (ex: UP) */ @@ -902,29 +902,29 @@ static void rcu_torture_shuffle_tasks(void)  	}  	if (rcu_idle_cpu != -1) -		cpu_clear(rcu_idle_cpu, tmp_mask); +		cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); -	set_cpus_allowed_ptr(current, &tmp_mask); +	set_cpus_allowed_ptr(current, shuffle_tmp_mask);  	if (reader_tasks) {  		for (i = 0; i < nrealreaders; i++)  			if (reader_tasks[i])  				set_cpus_allowed_ptr(reader_tasks[i], -						     &tmp_mask); +						     shuffle_tmp_mask);  	}  	if (fakewriter_tasks) {  		for (i = 0; i < nfakewriters; i++)  			if (fakewriter_tasks[i])  				set_cpus_allowed_ptr(fakewriter_tasks[i], -						     &tmp_mask); +						     shuffle_tmp_mask);  	}  	if (writer_task) -		set_cpus_allowed_ptr(writer_task, &tmp_mask); +		set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);  	if (stats_task) -		set_cpus_allowed_ptr(stats_task, &tmp_mask); +		set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);  	if (rcu_idle_cpu == -1)  		rcu_idle_cpu = num_online_cpus() - 1; @@ -1012,6 +1012,7 @@ rcu_torture_cleanup(void)  	if (shuffler_task) {  		VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");  		kthread_stop(shuffler_task); +		free_cpumask_var(shuffle_tmp_mask);  	}  	shuffler_task = NULL; @@ -1190,10 +1191,18 @@ rcu_torture_init(void)  	}  	if (test_no_idle_hz) {  		rcu_idle_cpu = num_online_cpus() - 1; + +		if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { +			firsterr = -ENOMEM; +			VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); +			goto unwind; +		} +  		/* Create the shuffler thread */  		shuffler_task = kthread_run(rcu_torture_shuffle, NULL,  					  "rcu_torture_shuffle");  		if (IS_ERR(shuffler_task)) { +			free_cpumask_var(shuffle_tmp_mask);  			firsterr = PTR_ERR(shuffler_task);  			VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");  			shuffler_task = NULL; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b2fd602a6f6..97ce31579ec 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp)  void rcu_check_callbacks(int cpu, int user)  {  	if (user || -	    (idle_cpu(cpu) && !in_softirq() && -				hardirq_count() <= (1 << HARDIRQ_SHIFT))) { +	    (idle_cpu(cpu) && rcu_scheduler_active && +	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {  		/*  		 * Get here if this CPU took its interrupt from user diff --git a/kernel/relay.c b/kernel/relay.c index 9d79b7854fa..e92db8c06ac 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -750,7 +750,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)  			 * from the scheduler (trying to re-grab  			 * rq->lock), so defer it.  			 
*/ -			__mod_timer(&buf->timer, jiffies + 1); +			mod_timer(&buf->timer, jiffies + 1);  	}  	old = buf->data; @@ -797,13 +797,15 @@ void relay_subbufs_consumed(struct rchan *chan,  	if (!chan)  		return; -	if (cpu >= NR_CPUS || !chan->buf[cpu]) +	if (cpu >= NR_CPUS || !chan->buf[cpu] || +					subbufs_consumed > chan->n_subbufs)  		return;  	buf = chan->buf[cpu]; -	buf->subbufs_consumed += subbufs_consumed; -	if (buf->subbufs_consumed > buf->subbufs_produced) +	if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)  		buf->subbufs_consumed = buf->subbufs_produced; +	else +		buf->subbufs_consumed += subbufs_consumed;  }  EXPORT_SYMBOL_GPL(relay_subbufs_consumed); diff --git a/kernel/sched.c b/kernel/sched.c index 410eec40413..2325db2be31 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)  {  	ktime_t now; -	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) +	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)  		return;  	if (hrtimer_active(&rt_b->rt_period_timer)) @@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;   */  static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ +	return list_empty(&root_task_group.children); +} +#endif +  #ifdef CONFIG_FAIR_GROUP_SCHED  #ifdef CONFIG_USER_SCHED  # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD) @@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)  #else +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ +	return 1; +} +#endif +  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }  static inline struct task_group *task_group(struct task_struct *p)  { @@ -467,11 +481,17 @@ struct rt_rq {  	struct rt_prio_array active;  	unsigned long rt_nr_running;  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED -	int highest_prio; /* highest queued rt task prio */ +	struct { +		int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP +		int next; /* next highest */ +#endif +	} highest_prio;  #endif  #ifdef CONFIG_SMP  	unsigned long rt_nr_migratory;  	int overloaded; +	struct plist_head pushable_tasks;  #endif  	int rt_throttled;  	u64 rt_time; @@ -549,7 +569,6 @@ struct rq {  	unsigned long nr_running;  	#define CPU_LOAD_IDX_MAX 5  	unsigned long cpu_load[CPU_LOAD_IDX_MAX]; -	unsigned char idle_at_tick;  #ifdef CONFIG_NO_HZ  	unsigned long last_tick_seen;  	unsigned char in_nohz_recently; @@ -590,6 +609,7 @@ struct rq {  	struct root_domain *rd;  	struct sched_domain *sd; +	unsigned char idle_at_tick;  	/* For active balancing */  	int active_balance;  	int push_cpu; @@ -618,9 +638,6 @@ struct rq {  	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/  	/* sys_sched_yield() stats */ -	unsigned int yld_exp_empty; -	unsigned int yld_act_empty; -	unsigned int yld_both_empty;  	unsigned int yld_count;  	/* schedule() stats */ @@ -1093,7 +1110,7 @@ static void hrtick_start(struct rq *rq, u64 delay)  	if (rq == this_rq()) {  		hrtimer_restart(timer);  	} else if (!rq->hrtick_csd_pending) { -		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); +		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);  		rq->hrtick_csd_pending = 1;  	}  } @@ -1183,10 +1200,10 @@ static void resched_task(struct task_struct *p)  	assert_spin_locked(&task_rq(p)->lock); -	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) +	if (test_tsk_need_resched(p))  		return; -	set_tsk_thread_flag(p, TIF_NEED_RESCHED); +	set_tsk_need_resched(p);  	cpu = task_cpu(p);  	if (cpu == smp_processor_id()) @@ -1242,7 +1259,7 @@ void wake_up_idle_cpu(int cpu)  	 * lockless. The worst case is that the other CPU runs the  	 * idle task through an additional NOOP schedule()  	 */ -	set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); +	set_tsk_need_resched(rq->idle);  	/* NEED_RESCHED must be visible before we test polling */  	smp_mb(); @@ -1610,21 +1627,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)  #endif +#ifdef CONFIG_PREEMPT +  /* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations.  This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below.  However, it + * also adds more overhead and therefore may reduce throughput.   */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) +	__releases(this_rq->lock) +	__acquires(busiest->lock) +	__acquires(this_rq->lock) +{ +	spin_unlock(&this_rq->lock); +	double_rq_lock(this_rq, busiest); + +	return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry.  This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)  	__releases(this_rq->lock)  	__acquires(busiest->lock)  	__acquires(this_rq->lock)  {  	int ret = 0; -	if (unlikely(!irqs_disabled())) { -		/* printk() doesn't work good under rq->lock */ -		spin_unlock(&this_rq->lock); -		BUG_ON(1); -	}  	if (unlikely(!spin_trylock(&busiest->lock))) {  		if (busiest < this_rq) {  			spin_unlock(&this_rq->lock); @@ -1637,6 +1675,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)  	return ret;  } +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
+ */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ +	if (unlikely(!irqs_disabled())) { +		/* printk() doesn't work good under rq->lock */ +		spin_unlock(&this_rq->lock); +		BUG_ON(1); +	} + +	return _double_lock_balance(this_rq, busiest); +} +  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)  	__releases(busiest->lock)  { @@ -1705,6 +1759,9 @@ static void update_avg(u64 *avg, u64 sample)  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)  { +	if (wakeup) +		p->se.start_runtime = p->se.sum_exec_runtime; +  	sched_info_queued(p);  	p->sched_class->enqueue_task(rq, p, wakeup);  	p->se.on_rq = 1; @@ -1712,10 +1769,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)  { -	if (sleep && p->se.last_wakeup) { -		update_avg(&p->se.avg_overlap, -			   p->se.sum_exec_runtime - p->se.last_wakeup); -		p->se.last_wakeup = 0; +	if (sleep) { +		if (p->se.last_wakeup) { +			update_avg(&p->se.avg_overlap, +				p->se.sum_exec_runtime - p->se.last_wakeup); +			p->se.last_wakeup = 0; +		} else { +			update_avg(&p->se.avg_wakeup, +				sysctl_sched_wakeup_granularity); +		}  	}  	sched_info_dequeued(p); @@ -2017,7 +2079,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)  		 * it must be off the runqueue _entirely_, and not  		 * preempted!  		 * -		 * So if it wa still runnable (but just not actively +		 * So if it was still runnable (but just not actively  		 * running right now), it's preempted, and we should  		 * yield - it could be a while.  		 */ @@ -2267,7 +2329,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)  		sync = 0;  #ifdef CONFIG_SMP -	if (sched_feat(LB_WAKEUP_UPDATE)) { +	if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {  		struct sched_domain *sd;  		this_cpu = raw_smp_processor_id(); @@ -2345,6 +2407,22 @@ out_activate:  	activate_task(rq, p, 1);  	success = 1; +	/* +	 * Only attribute actual wakeups done by this task. +	 */ +	if (!in_interrupt()) { +		struct sched_entity *se = ¤t->se; +		u64 sample = se->sum_exec_runtime; + +		if (se->last_wakeup) +			sample -= se->last_wakeup; +		else +			sample -= se->start_runtime; +		update_avg(&se->avg_wakeup, sample); + +		se->last_wakeup = se->sum_exec_runtime; +	} +  out_running:  	trace_sched_wakeup(rq, p, success);  	check_preempt_curr(rq, p, sync); @@ -2355,8 +2433,6 @@ out_running:  		p->sched_class->task_wake_up(rq, p);  #endif  out: -	current->se.last_wakeup = current->se.sum_exec_runtime; -  	task_rq_unlock(rq, &flags);  	return success; @@ -2386,6 +2462,8 @@ static void __sched_fork(struct task_struct *p)  	p->se.prev_sum_exec_runtime	= 0;  	p->se.last_wakeup		= 0;  	p->se.avg_overlap		= 0; +	p->se.start_runtime		= 0; +	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity;  #ifdef CONFIG_SCHEDSTATS  	p->se.wait_start		= 0; @@ -2448,6 +2526,8 @@ void sched_fork(struct task_struct *p, int clone_flags)  	/* Want to start with kernel preemption disabled. 
*/  	task_thread_info(p)->preempt_count = 1;  #endif +	plist_node_init(&p->pushable_tasks, MAX_PRIO); +  	put_cpu();  } @@ -2491,7 +2571,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)  #ifdef CONFIG_PREEMPT_NOTIFIERS  /** - * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * preempt_notifier_register - tell me when current is being preempted & rescheduled   * @notifier: notifier struct to register   */  void preempt_notifier_register(struct preempt_notifier *notifier) @@ -2588,6 +2668,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  {  	struct mm_struct *mm = rq->prev_mm;  	long prev_state; +#ifdef CONFIG_SMP +	int post_schedule = 0; + +	if (current->sched_class->needs_post_schedule) +		post_schedule = current->sched_class->needs_post_schedule(rq); +#endif  	rq->prev_mm = NULL; @@ -2606,7 +2692,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	finish_arch_switch(prev);  	finish_lock_switch(rq, prev);  #ifdef CONFIG_SMP -	if (current->sched_class->post_schedule) +	if (post_schedule)  		current->sched_class->post_schedule(rq);  #endif @@ -2913,6 +2999,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,  		     struct sched_domain *sd, enum cpu_idle_type idle,  		     int *all_pinned)  { +	int tsk_cache_hot = 0;  	/*  	 * We do not migrate tasks that are:  	 * 1) running (obviously), or @@ -2936,10 +3023,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,  	 * 2) too many balance attempts have failed.  	 */ -	if (!task_hot(p, rq->clock, sd) || -			sd->nr_balance_failed > sd->cache_nice_tries) { +	tsk_cache_hot = task_hot(p, rq->clock, sd); +	if (!tsk_cache_hot || +		sd->nr_balance_failed > sd->cache_nice_tries) {  #ifdef CONFIG_SCHEDSTATS -		if (task_hot(p, rq->clock, sd)) { +		if (tsk_cache_hot) {  			schedstat_inc(sd, lb_hot_gained[idle]);  			schedstat_inc(p, se.nr_forced_migrations);  		} @@ -2947,7 +3035,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,  		return 1;  	} -	if (task_hot(p, rq->clock, sd)) { +	if (tsk_cache_hot) {  		schedstat_inc(p, se.nr_failed_migrations_hot);  		return 0;  	} @@ -2987,6 +3075,16 @@ next:  	pulled++;  	rem_load_move -= p->se.load.weight; +#ifdef CONFIG_PREEMPT +	/* +	 * NEWIDLE balancing is a source of latency, so preemptible kernels +	 * will stop after the first task is pulled to minimize the critical +	 * section. +	 */ +	if (idle == CPU_NEWLY_IDLE) +		goto out; +#endif +  	/*  	 * We only want to steal up to the prescribed amount of weighted load.  	 */ @@ -3033,9 +3131,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,  				sd, idle, all_pinned, &this_best_prio);  		class = class->next; +#ifdef CONFIG_PREEMPT +		/* +		 * NEWIDLE balancing is a source of latency, so preemptible +		 * kernels will stop after the first task is pulled to minimize +		 * the critical section. +		 */  		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)  			break; - +#endif  	} while (class && max_load_move > total_load_moved);  	return total_load_moved > 0; @@ -3085,246 +3189,480 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,  	return 0;  } - +/********** Helpers for find_busiest_group ************************/  /* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. 
It calculates and returns the amount of weighted load which - * should be moved to restore balance via the imbalance parameter. + * sd_lb_stats - Structure to store the statistics of a sched_domain + * 		during load balancing.   */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, -		   unsigned long *imbalance, enum cpu_idle_type idle, -		   int *sd_idle, const struct cpumask *cpus, int *balance) -{ -	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; -	unsigned long max_load, avg_load, total_load, this_load, total_pwr; -	unsigned long max_pull; -	unsigned long busiest_load_per_task, busiest_nr_running; -	unsigned long this_load_per_task, this_nr_running; -	int load_idx, group_imb = 0; +struct sd_lb_stats { +	struct sched_group *busiest; /* Busiest group in this sd */ +	struct sched_group *this;  /* Local group in this sd */ +	unsigned long total_load;  /* Total load of all groups in sd */ +	unsigned long total_pwr;   /*	Total power of all groups in sd */ +	unsigned long avg_load;	   /* Average load across all groups in sd */ + +	/** Statistics of this group */ +	unsigned long this_load; +	unsigned long this_load_per_task; +	unsigned long this_nr_running; + +	/* Statistics of the busiest group */ +	unsigned long max_load; +	unsigned long busiest_load_per_task; +	unsigned long busiest_nr_running; + +	int group_imb; /* Is there imbalance in this sd */  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -	int power_savings_balance = 1; -	unsigned long leader_nr_running = 0, min_load_per_task = 0; -	unsigned long min_nr_running = ULONG_MAX; -	struct sched_group *group_min = NULL, *group_leader = NULL; +	int power_savings_balance; /* Is powersave balance needed for this sd */ +	struct sched_group *group_min; /* Least loaded group in sd */ +	struct sched_group *group_leader; /* Group which relieves group_min */ +	unsigned long min_load_per_task; /* load_per_task in group_min */ +	unsigned long leader_nr_running; /* Nr running of group_leader */ +	unsigned long min_nr_running; /* Nr running of group_min */  #endif +}; -	max_load = this_load = total_load = total_pwr = 0; -	busiest_load_per_task = busiest_nr_running = 0; -	this_load_per_task = this_nr_running = 0; +/* + * sg_lb_stats - stats of a sched_group required for load_balancing + */ +struct sg_lb_stats { +	unsigned long avg_load; /*Avg load across the CPUs of the group */ +	unsigned long group_load; /* Total load over the CPUs of the group */ +	unsigned long sum_nr_running; /* Nr tasks running in the group */ +	unsigned long sum_weighted_load; /* Weighted load of group's tasks */ +	unsigned long group_capacity; +	int group_imb; /* Is there an imbalance in the group ? */ +}; + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ +	return cpumask_first(sched_group_cpus(group)); +} + +/** + * get_sd_load_idx - Obtain the load index for a given sched domain. + * @sd: The sched_domain whose load_idx is to be obtained. + * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 
+ */ +static inline int get_sd_load_idx(struct sched_domain *sd, +					enum cpu_idle_type idle) +{ +	int load_idx; -	if (idle == CPU_NOT_IDLE) +	switch (idle) { +	case CPU_NOT_IDLE:  		load_idx = sd->busy_idx; -	else if (idle == CPU_NEWLY_IDLE) +		break; + +	case CPU_NEWLY_IDLE:  		load_idx = sd->newidle_idx; -	else +		break; +	default:  		load_idx = sd->idle_idx; +		break; +	} -	do { -		unsigned long load, group_capacity, max_cpu_load, min_cpu_load; -		int local_group; -		int i; -		int __group_imb = 0; -		unsigned int balance_cpu = -1, first_idle_cpu = 0; -		unsigned long sum_nr_running, sum_weighted_load; -		unsigned long sum_avg_load_per_task; -		unsigned long avg_load_per_task; +	return load_idx; +} -		local_group = cpumask_test_cpu(this_cpu, -					       sched_group_cpus(group)); -		if (local_group) -			balance_cpu = cpumask_first(sched_group_cpus(group)); +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * init_sd_power_savings_stats - Initialize power savings statistics for + * the given sched_domain, during load balancing. + * + * @sd: Sched domain whose power-savings statistics are to be initialized. + * @sds: Variable containing the statistics for sd. + * @idle: Idle status of the CPU at which we're performing load-balancing. + */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, +	struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ +	/* +	 * Busy processors will not participate in power savings +	 * balance. +	 */ +	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) +		sds->power_savings_balance = 0; +	else { +		sds->power_savings_balance = 1; +		sds->min_nr_running = ULONG_MAX; +		sds->leader_nr_running = 0; +	} +} -		/* Tally up the load of all CPUs in the group */ -		sum_weighted_load = sum_nr_running = avg_load = 0; -		sum_avg_load_per_task = avg_load_per_task = 0; +/** + * update_sd_power_savings_stats - Update the power saving stats for a + * sched_domain while performing load balancing. + * + * @group: sched_group belonging to the sched_domain under consideration. + * @sds: Variable containing the statistics of the sched_domain + * @local_group: Does group contain the CPU for which we're performing + * 		load balancing ? + * @sgs: Variable containing the statistics of the group. + */ +static inline void update_sd_power_savings_stats(struct sched_group *group, +	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ -		max_cpu_load = 0; -		min_cpu_load = ~0UL; +	if (!sds->power_savings_balance) +		return; -		for_each_cpu_and(i, sched_group_cpus(group), cpus) { -			struct rq *rq = cpu_rq(i); +	/* +	 * If the local group is idle or completely loaded +	 * no need to do power savings balance at this domain +	 */ +	if (local_group && (sds->this_nr_running >= sgs->group_capacity || +				!sds->this_nr_running)) +		sds->power_savings_balance = 0; -			if (*sd_idle && rq->nr_running) -				*sd_idle = 0; +	/* +	 * If a group is already running at full capacity or idle, +	 * don't include that group in power savings calculations +	 */ +	if (!sds->power_savings_balance || +		sgs->sum_nr_running >= sgs->group_capacity || +		!sgs->sum_nr_running) +		return; -			/* Bias balancing toward cpus of our domain */ -			if (local_group) { -				if (idle_cpu(i) && !first_idle_cpu) { -					first_idle_cpu = 1; -					balance_cpu = i; -				} +	/* +	 * Calculate the group which has the least non-idle load. 
+	 * This is the group from where we need to pick up the load +	 * for saving power +	 */ +	if ((sgs->sum_nr_running < sds->min_nr_running) || +	    (sgs->sum_nr_running == sds->min_nr_running && +	     group_first_cpu(group) > group_first_cpu(sds->group_min))) { +		sds->group_min = group; +		sds->min_nr_running = sgs->sum_nr_running; +		sds->min_load_per_task = sgs->sum_weighted_load / +						sgs->sum_nr_running; +	} -				load = target_load(i, load_idx); -			} else { -				load = source_load(i, load_idx); -				if (load > max_cpu_load) -					max_cpu_load = load; -				if (min_cpu_load > load) -					min_cpu_load = load; -			} +	/* +	 * Calculate the group which is almost near its +	 * capacity but still has some space to pick up some load +	 * from other group and save more power +	 */ +	if (sgs->sum_nr_running > sgs->group_capacity - 1) +		return; -			avg_load += load; -			sum_nr_running += rq->nr_running; -			sum_weighted_load += weighted_cpuload(i); +	if (sgs->sum_nr_running > sds->leader_nr_running || +	    (sgs->sum_nr_running == sds->leader_nr_running && +	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) { +		sds->group_leader = group; +		sds->leader_nr_running = sgs->sum_nr_running; +	} +} -			sum_avg_load_per_task += cpu_avg_load_per_task(i); -		} +/** + * check_power_save_busiest_group - see if there is potential for some power-savings balance + * @sds: Variable containing the statistics of the sched_domain + *	under consideration. + * @this_cpu: Cpu at which we're currently performing load-balancing. + * @imbalance: Variable to store the imbalance. + * + * Description: + * Check if we have potential to perform some power-savings balance. + * If yes, set the busiest group to be the least loaded group in the + * sched_domain, so that it's CPUs can be put to idle. + * + * Returns 1 if there is potential to perform power-savings balance. + * Else returns 0. + */ +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, +					int this_cpu, unsigned long *imbalance) +{ +	if (!sds->power_savings_balance) +		return 0; -		/* -		 * First idle cpu or the first cpu(busiest) in this sched group -		 * is eligible for doing load balancing at this and above -		 * domains. In the newly idle case, we will allow all the cpu's -		 * to do the newly idle load balance. 
-		 */ -		if (idle != CPU_NEWLY_IDLE && local_group && -		    balance_cpu != this_cpu && balance) { -			*balance = 0; -			goto ret; -		} +	if (sds->this != sds->group_leader || +			sds->group_leader == sds->group_min) +		return 0; -		total_load += avg_load; -		total_pwr += group->__cpu_power; +	*imbalance = sds->min_load_per_task; +	sds->busiest = sds->group_min; -		/* Adjust by relative CPU power of the group */ -		avg_load = sg_div_cpu_power(group, -				avg_load * SCHED_LOAD_SCALE); +	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { +		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = +			group_first_cpu(sds->group_leader); +	} +	return 1; + +} +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, +	struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ +	return; +} + +static inline void update_sd_power_savings_stats(struct sched_group *group, +	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ +	return; +} + +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, +					int this_cpu, unsigned long *imbalance) +{ +	return 0; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -		/* -		 * Consider the group unbalanced when the imbalance is larger -		 * than the average weight of two tasks. -		 * -		 * APZ: with cgroup the avg task weight can vary wildly and -		 *      might not be a suitable number - should we keep a -		 *      normalized nr_running number somewhere that negates -		 *      the hierarchy? -		 */ -		avg_load_per_task = sg_div_cpu_power(group, -				sum_avg_load_per_task * SCHED_LOAD_SCALE); -		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) -			__group_imb = 1; +/** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @group: sched_group whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @load_idx: Load index of sched_domain of this_cpu for load calc. + * @sd_idle: Idle status of the sched_domain containing group. + * @local_group: Does group contain this_cpu. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sgs: variable to hold the statistics for this group. 
+ */ +static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, +			enum cpu_idle_type idle, int load_idx, int *sd_idle, +			int local_group, const struct cpumask *cpus, +			int *balance, struct sg_lb_stats *sgs) +{ +	unsigned long load, max_cpu_load, min_cpu_load; +	int i; +	unsigned int balance_cpu = -1, first_idle_cpu = 0; +	unsigned long sum_avg_load_per_task; +	unsigned long avg_load_per_task; -		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; +	if (local_group) +		balance_cpu = group_first_cpu(group); +	/* Tally up the load of all CPUs in the group */ +	sum_avg_load_per_task = avg_load_per_task = 0; +	max_cpu_load = 0; +	min_cpu_load = ~0UL; + +	for_each_cpu_and(i, sched_group_cpus(group), cpus) { +		struct rq *rq = cpu_rq(i); + +		if (*sd_idle && rq->nr_running) +			*sd_idle = 0; + +		/* Bias balancing toward cpus of our domain */  		if (local_group) { -			this_load = avg_load; -			this = group; -			this_nr_running = sum_nr_running; -			this_load_per_task = sum_weighted_load; -		} else if (avg_load > max_load && -			   (sum_nr_running > group_capacity || __group_imb)) { -			max_load = avg_load; -			busiest = group; -			busiest_nr_running = sum_nr_running; -			busiest_load_per_task = sum_weighted_load; -			group_imb = __group_imb; +			if (idle_cpu(i) && !first_idle_cpu) { +				first_idle_cpu = 1; +				balance_cpu = i; +			} + +			load = target_load(i, load_idx); +		} else { +			load = source_load(i, load_idx); +			if (load > max_cpu_load) +				max_cpu_load = load; +			if (min_cpu_load > load) +				min_cpu_load = load;  		} -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -		/* -		 * Busy processors will not participate in power savings -		 * balance. -		 */ -		if (idle == CPU_NOT_IDLE || -				!(sd->flags & SD_POWERSAVINGS_BALANCE)) -			goto group_next; +		sgs->group_load += load; +		sgs->sum_nr_running += rq->nr_running; +		sgs->sum_weighted_load += weighted_cpuload(i); -		/* -		 * If the local group is idle or completely loaded -		 * no need to do power savings balance at this domain -		 */ -		if (local_group && (this_nr_running >= group_capacity || -				    !this_nr_running)) -			power_savings_balance = 0; +		sum_avg_load_per_task += cpu_avg_load_per_task(i); +	} -		/* -		 * If a group is already running at full capacity or idle, -		 * don't include that group in power savings calculations -		 */ -		if (!power_savings_balance || sum_nr_running >= group_capacity -		    || !sum_nr_running) -			goto group_next; +	/* +	 * First idle cpu or the first cpu(busiest) in this sched group +	 * is eligible for doing load balancing at this and above +	 * domains. In the newly idle case, we will allow all the cpu's +	 * to do the newly idle load balance. +	 */ +	if (idle != CPU_NEWLY_IDLE && local_group && +	    balance_cpu != this_cpu && balance) { +		*balance = 0; +		return; +	} -		/* -		 * Calculate the group which has the least non-idle load. 
-		 * This is the group from where we need to pick up the load -		 * for saving power -		 */ -		if ((sum_nr_running < min_nr_running) || -		    (sum_nr_running == min_nr_running && -		     cpumask_first(sched_group_cpus(group)) > -		     cpumask_first(sched_group_cpus(group_min)))) { -			group_min = group; -			min_nr_running = sum_nr_running; -			min_load_per_task = sum_weighted_load / -						sum_nr_running; -		} +	/* Adjust by relative CPU power of the group */ +	sgs->avg_load = sg_div_cpu_power(group, +			sgs->group_load * SCHED_LOAD_SCALE); -		/* -		 * Calculate the group which is almost near its -		 * capacity but still has some space to pick up some load -		 * from other group and save more power -		 */ -		if (sum_nr_running <= group_capacity - 1) { -			if (sum_nr_running > leader_nr_running || -			    (sum_nr_running == leader_nr_running && -			     cpumask_first(sched_group_cpus(group)) < -			     cpumask_first(sched_group_cpus(group_leader)))) { -				group_leader = group; -				leader_nr_running = sum_nr_running; -			} + +	/* +	 * Consider the group unbalanced when the imbalance is larger +	 * than the average weight of two tasks. +	 * +	 * APZ: with cgroup the avg task weight can vary wildly and +	 *      might not be a suitable number - should we keep a +	 *      normalized nr_running number somewhere that negates +	 *      the hierarchy? +	 */ +	avg_load_per_task = sg_div_cpu_power(group, +			sum_avg_load_per_task * SCHED_LOAD_SCALE); + +	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) +		sgs->group_imb = 1; + +	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; + +} + +/** + * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * @sd: sched_domain whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @sd_idle: Idle status of the sched_domain containing group. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sds: variable to hold the statistics for this sched_domain. 
+ */ +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, +			enum cpu_idle_type idle, int *sd_idle, +			const struct cpumask *cpus, int *balance, +			struct sd_lb_stats *sds) +{ +	struct sched_group *group = sd->groups; +	struct sg_lb_stats sgs; +	int load_idx; + +	init_sd_power_savings_stats(sd, sds, idle); +	load_idx = get_sd_load_idx(sd, idle); + +	do { +		int local_group; + +		local_group = cpumask_test_cpu(this_cpu, +					       sched_group_cpus(group)); +		memset(&sgs, 0, sizeof(sgs)); +		update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, +				local_group, cpus, balance, &sgs); + +		if (local_group && balance && !(*balance)) +			return; + +		sds->total_load += sgs.group_load; +		sds->total_pwr += group->__cpu_power; + +		if (local_group) { +			sds->this_load = sgs.avg_load; +			sds->this = group; +			sds->this_nr_running = sgs.sum_nr_running; +			sds->this_load_per_task = sgs.sum_weighted_load; +		} else if (sgs.avg_load > sds->max_load && +			   (sgs.sum_nr_running > sgs.group_capacity || +				sgs.group_imb)) { +			sds->max_load = sgs.avg_load; +			sds->busiest = group; +			sds->busiest_nr_running = sgs.sum_nr_running; +			sds->busiest_load_per_task = sgs.sum_weighted_load; +			sds->group_imb = sgs.group_imb;  		} -group_next: -#endif + +		update_sd_power_savings_stats(group, sds, local_group, &sgs);  		group = group->next;  	} while (group != sd->groups); -	if (!busiest || this_load >= max_load || busiest_nr_running == 0) -		goto out_balanced; +} -	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; +/** + * fix_small_imbalance - Calculate the minor imbalance that exists + *			amongst the groups of a sched_domain, during + *			load balancing. + * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: Variable to store the imbalance. + */ +static inline void fix_small_imbalance(struct sd_lb_stats *sds, +				int this_cpu, unsigned long *imbalance) +{ +	unsigned long tmp, pwr_now = 0, pwr_move = 0; +	unsigned int imbn = 2; -	if (this_load >= avg_load || -			100*max_load <= sd->imbalance_pct*this_load) -		goto out_balanced; +	if (sds->this_nr_running) { +		sds->this_load_per_task /= sds->this_nr_running; +		if (sds->busiest_load_per_task > +				sds->this_load_per_task) +			imbn = 1; +	} else +		sds->this_load_per_task = +			cpu_avg_load_per_task(this_cpu); -	busiest_load_per_task /= busiest_nr_running; -	if (group_imb) -		busiest_load_per_task = min(busiest_load_per_task, avg_load); +	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= +			sds->busiest_load_per_task * imbn) { +		*imbalance = sds->busiest_load_per_task; +		return; +	}  	/* -	 * We're trying to get all the cpus to the average_load, so we don't -	 * want to push ourselves above the average load, nor do we wish to -	 * reduce the max loaded cpu below the average load, as either of these -	 * actions would just result in more rebalancing later, and ping-pong -	 * tasks around. Thus we look for the minimum possible imbalance. -	 * Negative imbalances (*we* are more loaded than anyone else) will -	 * be counted as no imbalance for these purposes -- we can't fix that -	 * by pulling tasks to us. Be careful of negative numbers as they'll -	 * appear as very large values with unsigned longs. +	 * OK, we don't have enough imbalance to justify moving tasks, +	 * however we may be able to increase total CPU power used by +	 * moving them.  	 
*/ -	if (max_load <= busiest_load_per_task) -		goto out_balanced; +	pwr_now += sds->busiest->__cpu_power * +			min(sds->busiest_load_per_task, sds->max_load); +	pwr_now += sds->this->__cpu_power * +			min(sds->this_load_per_task, sds->this_load); +	pwr_now /= SCHED_LOAD_SCALE; + +	/* Amount of load we'd subtract */ +	tmp = sg_div_cpu_power(sds->busiest, +			sds->busiest_load_per_task * SCHED_LOAD_SCALE); +	if (sds->max_load > tmp) +		pwr_move += sds->busiest->__cpu_power * +			min(sds->busiest_load_per_task, sds->max_load - tmp); + +	/* Amount of load we'd add */ +	if (sds->max_load * sds->busiest->__cpu_power < +		sds->busiest_load_per_task * SCHED_LOAD_SCALE) +		tmp = sg_div_cpu_power(sds->this, +			sds->max_load * sds->busiest->__cpu_power); +	else +		tmp = sg_div_cpu_power(sds->this, +			sds->busiest_load_per_task * SCHED_LOAD_SCALE); +	pwr_move += sds->this->__cpu_power * +			min(sds->this_load_per_task, sds->this_load + tmp); +	pwr_move /= SCHED_LOAD_SCALE; + +	/* Move if we gain throughput */ +	if (pwr_move > pwr_now) +		*imbalance = sds->busiest_load_per_task; +} + +/** + * calculate_imbalance - Calculate the amount of imbalance present within the + *			 groups of a given sched_domain during load balance. + * @sds: statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: Cpu for which currently load balance is being performed. + * @imbalance: The variable to store the imbalance. + */ +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, +		unsigned long *imbalance) +{ +	unsigned long max_pull;  	/*  	 * In the presence of smp nice balancing, certain scenarios can have  	 * max load less than avg load(as we skip the groups at or below  	 * its cpu_power, while calculating max_load..)  	 */ -	if (max_load < avg_load) { +	if (sds->max_load < sds->avg_load) {  		*imbalance = 0; -		goto small_imbalance; +		return fix_small_imbalance(sds, this_cpu, imbalance);  	}  	/* Don't want to pull so many tasks that a group would go idle */ -	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); +	max_pull = min(sds->max_load - sds->avg_load, +			sds->max_load - sds->busiest_load_per_task);  	/* How much load to actually move to equalise the imbalance */ -	*imbalance = min(max_pull * busiest->__cpu_power, -				(avg_load - this_load) * this->__cpu_power) +	*imbalance = min(max_pull * sds->busiest->__cpu_power, +		(sds->avg_load - sds->this_load) * sds->this->__cpu_power)  			/ SCHED_LOAD_SCALE;  	/* @@ -3333,78 +3671,110 @@ group_next:  	 * a think about bumping its value to force at least one task to be  	 * moved  	 */ -	if (*imbalance < busiest_load_per_task) { -		unsigned long tmp, pwr_now, pwr_move; -		unsigned int imbn; +	if (*imbalance < sds->busiest_load_per_task) +		return fix_small_imbalance(sds, this_cpu, imbalance); -small_imbalance: -		pwr_move = pwr_now = 0; -		imbn = 2; -		if (this_nr_running) { -			this_load_per_task /= this_nr_running; -			if (busiest_load_per_task > this_load_per_task) -				imbn = 1; -		} else -			this_load_per_task = cpu_avg_load_per_task(this_cpu); +} +/******* find_busiest_group() helpers end here *********************/ -		if (max_load - this_load + busiest_load_per_task >= -					busiest_load_per_task * imbn) { -			*imbalance = busiest_load_per_task; -			return busiest; -		} +/** + * find_busiest_group - Returns the busiest group within the sched_domain + * if there is an imbalance. 
If there isn't an imbalance, and + * the user has opted for power-savings, it returns a group whose + * CPUs can be put to idle by rebalancing those tasks elsewhere, if + * such a group exists. + * + * Also calculates the amount of weighted load which should be moved + * to restore balance. + * + * @sd: The sched_domain whose busiest group is to be returned. + * @this_cpu: The cpu for which load balancing is currently being performed. + * @imbalance: Variable which stores amount of weighted load which should + *		be moved to restore balance/put a group to idle. + * @idle: The idle status of this_cpu. + * @sd_idle: The idleness of sd + * @cpus: The set of CPUs under consideration for load-balancing. + * @balance: Pointer to a variable indicating if this_cpu + *	is the appropriate cpu to perform load balancing at this_level. + * + * Returns:	- the busiest group if imbalance exists. + *		- If no imbalance and user has opted for power-savings balance, + *		   return the least loaded group whose CPUs can be + *		   put to idle by rebalancing its tasks onto our group. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, +		   unsigned long *imbalance, enum cpu_idle_type idle, +		   int *sd_idle, const struct cpumask *cpus, int *balance) +{ +	struct sd_lb_stats sds; -		/* -		 * OK, we don't have enough imbalance to justify moving tasks, -		 * however we may be able to increase total CPU power used by -		 * moving them. -		 */ +	memset(&sds, 0, sizeof(sds)); -		pwr_now += busiest->__cpu_power * -				min(busiest_load_per_task, max_load); -		pwr_now += this->__cpu_power * -				min(this_load_per_task, this_load); -		pwr_now /= SCHED_LOAD_SCALE; +	/* +	 * Compute the various statistics relavent for load balancing at +	 * this level. +	 */ +	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, +					balance, &sds); -		/* Amount of load we'd subtract */ -		tmp = sg_div_cpu_power(busiest, -				busiest_load_per_task * SCHED_LOAD_SCALE); -		if (max_load > tmp) -			pwr_move += busiest->__cpu_power * -				min(busiest_load_per_task, max_load - tmp); +	/* Cases where imbalance does not exist from POV of this_cpu */ +	/* 1) this_cpu is not the appropriate cpu to perform load balancing +	 *    at this level. +	 * 2) There is no busy sibling group to pull from. +	 * 3) This group is the busiest group. +	 * 4) This group is more busy than the avg busieness at this +	 *    sched_domain. +	 * 5) The imbalance is within the specified limit. 
+	 * 6) Any rebalance would lead to ping-pong +	 */ +	if (balance && !(*balance)) +		goto ret; -		/* Amount of load we'd add */ -		if (max_load * busiest->__cpu_power < -				busiest_load_per_task * SCHED_LOAD_SCALE) -			tmp = sg_div_cpu_power(this, -					max_load * busiest->__cpu_power); -		else -			tmp = sg_div_cpu_power(this, -				busiest_load_per_task * SCHED_LOAD_SCALE); -		pwr_move += this->__cpu_power * -				min(this_load_per_task, this_load + tmp); -		pwr_move /= SCHED_LOAD_SCALE; +	if (!sds.busiest || sds.busiest_nr_running == 0) +		goto out_balanced; -		/* Move if we gain throughput */ -		if (pwr_move > pwr_now) -			*imbalance = busiest_load_per_task; -	} +	if (sds.this_load >= sds.max_load) +		goto out_balanced; -	return busiest; +	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; -out_balanced: -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) -		goto ret; +	if (sds.this_load >= sds.avg_load) +		goto out_balanced; -	if (this == group_leader && group_leader != group_min) { -		*imbalance = min_load_per_task; -		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { -			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = -				cpumask_first(sched_group_cpus(group_leader)); -		} -		return group_min; -	} -#endif +	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) +		goto out_balanced; + +	sds.busiest_load_per_task /= sds.busiest_nr_running; +	if (sds.group_imb) +		sds.busiest_load_per_task = +			min(sds.busiest_load_per_task, sds.avg_load); + +	/* +	 * We're trying to get all the cpus to the average_load, so we don't +	 * want to push ourselves above the average load, nor do we wish to +	 * reduce the max loaded cpu below the average load, as either of these +	 * actions would just result in more rebalancing later, and ping-pong +	 * tasks around. Thus we look for the minimum possible imbalance. +	 * Negative imbalances (*we* are more loaded than anyone else) will +	 * be counted as no imbalance for these purposes -- we can't fix that +	 * by pulling tasks to us. Be careful of negative numbers as they'll +	 * appear as very large values with unsigned longs. +	 */ +	if (sds.max_load <= sds.busiest_load_per_task) +		goto out_balanced; + +	/* Looks like there is an imbalance. Compute it */ +	calculate_imbalance(&sds, this_cpu, imbalance); +	return sds.busiest; + +out_balanced: +	/* +	 * There is no obvious imbalance. But check if we can do some balancing +	 * to save power. +	 */ +	if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) +		return sds.busiest;  ret:  	*imbalance = 0;  	return NULL; @@ -4057,6 +4427,11 @@ static void run_rebalance_domains(struct softirq_action *h)  #endif  } +static inline int on_null_domain(int cpu) +{ +	return !rcu_dereference(cpu_rq(cpu)->sd); +} +  /*   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.   
* @@ -4114,7 +4489,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)  	    cpumask_test_cpu(cpu, nohz.cpu_mask))  		return;  #endif -	if (time_after_eq(jiffies, rq->next_balance)) +	/* Don't need to rebalance while attached to NULL domain */ +	if (time_after_eq(jiffies, rq->next_balance) && +	    likely(!on_null_domain(cpu)))  		raise_softirq(SCHED_SOFTIRQ);  } @@ -4508,11 +4885,33 @@ static inline void schedule_debug(struct task_struct *prev)  #endif  } +static void put_prev_task(struct rq *rq, struct task_struct *prev) +{ +	if (prev->state == TASK_RUNNING) { +		u64 runtime = prev->se.sum_exec_runtime; + +		runtime -= prev->se.prev_sum_exec_runtime; +		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + +		/* +		 * In order to avoid avg_overlap growing stale when we are +		 * indeed overlapping and hence not getting put to sleep, grow +		 * the avg_overlap on preemption. +		 * +		 * We use the average preemption runtime because that +		 * correlates to the amount of cache footprint a task can +		 * build up. +		 */ +		update_avg(&prev->se.avg_overlap, runtime); +	} +	prev->sched_class->put_prev_task(rq, prev); +} +  /*   * Pick up the highest-prio task:   */  static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq)  {  	const struct sched_class *class;  	struct task_struct *p; @@ -4543,15 +4942,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)  /*   * schedule() is the main scheduler function.   */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void)  {  	struct task_struct *prev, *next;  	unsigned long *switch_count;  	struct rq *rq;  	int cpu; -need_resched: -	preempt_disable();  	cpu = smp_processor_id();  	rq = cpu_rq(cpu);  	rcu_qsctr_inc(cpu); @@ -4586,8 +4983,8 @@ need_resched_nonpreemptible:  	if (unlikely(!rq->nr_running))  		idle_balance(cpu, rq); -	prev->sched_class->put_prev_task(rq, prev); -	next = pick_next_task(rq, prev); +	put_prev_task(rq, prev); +	next = pick_next_task(rq);  	if (likely(prev != next)) {  		sched_info_switch(prev, next); @@ -4608,13 +5005,80 @@ need_resched_nonpreemptible:  	if (unlikely(reacquire_kernel_lock(current) < 0))  		goto need_resched_nonpreemptible; +} +asmlinkage void __sched schedule(void) +{ +need_resched: +	preempt_disable(); +	__schedule();  	preempt_enable_no_resched();  	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))  		goto need_resched;  }  EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +{ +	unsigned int cpu; +	struct rq *rq; + +	if (!sched_feat(OWNER_SPIN)) +		return 0; + +#ifdef CONFIG_DEBUG_PAGEALLOC +	/* +	 * Need to access the cpu field knowing that +	 * DEBUG_PAGEALLOC could have unmapped it if +	 * the mutex owner just released it and exited. +	 */ +	if (probe_kernel_address(&owner->cpu, cpu)) +		goto out; +#else +	cpu = owner->cpu; +#endif + +	/* +	 * Even if the access succeeded (likely case), +	 * the cpu field may no longer be valid. +	 */ +	if (cpu >= nr_cpumask_bits) +		goto out; + +	/* +	 * We need to validate that we can do a +	 * get_cpu() and that we have the percpu area. +	 */ +	if (!cpu_online(cpu)) +		goto out; + +	rq = cpu_rq(cpu); + +	for (;;) { +		/* +		 * Owner changed, break to re-assess state. +		 */ +		if (lock->owner != owner) +			break; + +		/* +		 * Is that owner really running on that cpu? 
+		 */ +		if (task_thread_info(rq->curr) != owner || need_resched()) +			return 0; + +		cpu_relax(); +	} +out: +	return 1; +} +#endif +  #ifdef CONFIG_PREEMPT  /*   * this is the entry point to schedule() from in-kernel preemption @@ -4642,7 +5106,7 @@ asmlinkage void __sched preempt_schedule(void)  		 * between schedule and now.  		 */  		barrier(); -	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); +	} while (need_resched());  }  EXPORT_SYMBOL(preempt_schedule); @@ -4671,7 +5135,7 @@ asmlinkage void __sched preempt_schedule_irq(void)  		 * between schedule and now.  		 */  		barrier(); -	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); +	} while (need_resched());  }  #endif /* CONFIG_PREEMPT */ @@ -4732,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)  	__wake_up_common(q, mode, 1, 0, NULL);  } +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ +	__wake_up_common(q, mode, 1, 0, key); +} +  /** - * __wake_up_sync - wake up threads blocked on a waitqueue. + * __wake_up_sync_key - wake up threads blocked on a waitqueue.   * @q: the waitqueue   * @mode: which threads   * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets   *   * The sync wakeup differs that the waker knows that it will schedule   * away soon, so while the target thread will be woken up, it will not @@ -4745,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)   *   * On UP it can prevent extra preemption.   */ -void -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, void *key)  {  	unsigned long flags;  	int sync = 1; @@ -4758,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)  		sync = 0;  	spin_lock_irqsave(&q->lock, flags); -	__wake_up_common(q, mode, nr_exclusive, sync, NULL); +	__wake_up_common(q, mode, nr_exclusive, sync, key);  	spin_unlock_irqrestore(&q->lock, flags);  } +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ +	__wake_up_sync_key(q, mode, nr_exclusive, NULL); +}  EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */  /** @@ -5145,7 +5624,7 @@ SYSCALL_DEFINE1(nice, int, increment)  	if (increment > 40)  		increment = 40; -	nice = PRIO_TO_NICE(current->static_prio) + increment; +	nice = TASK_NICE(current) + increment;  	if (nice < -20)  		nice = -20;  	if (nice > 19) @@ -5944,12 +6423,7 @@ void sched_show_task(struct task_struct *p)  		printk(KERN_CONT " %016lx ", thread_saved_pc(p));  #endif  #ifdef CONFIG_DEBUG_STACK_USAGE -	{ -		unsigned long *n = end_of_stack(p); -		while (!*n) -			n++; -		free = (unsigned long)n - (unsigned long)end_of_stack(p); -	} +	free = stack_not_used(p);  #endif  	printk(KERN_CONT "%5lu %5d %6d\n", free,  		task_pid_nr(p), task_pid_nr(p->real_parent)); @@ -6423,7 +6897,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)  		if (!rq->nr_running)  			break;  		update_rq_clock(rq); -		next = pick_next_task(rq, rq->curr); +		next = pick_next_task(rq);  		if (!next)  			break;  		next->sched_class->put_prev_task(rq, next); @@ -8218,11 +8692,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)  	__set_bit(MAX_RT_PRIO, array->bitmap);  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED -	rt_rq->highest_prio = 
MAX_RT_PRIO; +	rt_rq->highest_prio.curr = MAX_RT_PRIO; +#ifdef CONFIG_SMP +	rt_rq->highest_prio.next = MAX_RT_PRIO; +#endif  #endif  #ifdef CONFIG_SMP  	rt_rq->rt_nr_migratory = 0;  	rt_rq->overloaded = 0; +	plist_head_init(&rq->rt.pushable_tasks, &rq->lock);  #endif  	rt_rq->rt_time = 0; @@ -9224,6 +9702,16 @@ static int sched_rt_global_constraints(void)  	return ret;  } + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ +	/* Don't accept realtime tasks when there is no way for them to run */ +	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) +		return 0; + +	return 1; +} +  #else /* !CONFIG_RT_GROUP_SCHED */  static int sched_rt_global_constraints(void)  { @@ -9317,8 +9805,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,  		      struct task_struct *tsk)  {  #ifdef CONFIG_RT_GROUP_SCHED -	/* Don't accept realtime tasks when there is no way for them to run */ -	if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) +	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))  		return -EINVAL;  #else  	/* We don't support RT-tasks being in separate groups */ @@ -9481,7 +9968,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)  static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)  { -	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); +	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);  	u64 data;  #ifndef CONFIG_64BIT @@ -9500,7 +9987,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)  static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)  { -	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); +	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);  #ifndef CONFIG_64BIT  	/* @@ -9589,14 +10076,14 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)  	struct cpuacct *ca;  	int cpu; -	if (!cpuacct_subsys.active) +	if (unlikely(!cpuacct_subsys.active))  		return;  	cpu = task_cpu(tsk);  	ca = task_ca(tsk);  	for (; ca; ca = ca->parent) { -		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); +		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);  		*cpuusage += cputime;  	}  } diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index a0b0852414c..390f33234bd 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -24,11 +24,11 @@   * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat   * consistent between cpus (never more than 2 jiffies difference).   */ -#include <linux/sched.h> -#include <linux/percpu.h>  #include <linux/spinlock.h> -#include <linux/ktime.h>  #include <linux/module.h> +#include <linux/percpu.h> +#include <linux/ktime.h> +#include <linux/sched.h>  /*   * Scheduler clock - returns current time in nanosec units. 
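The cpuacct hunks above switch the usage counters to per_cpu_ptr() and keep charging time up the ca->parent chain, so a child group's usage is visible in every ancestor at read time. The following self-contained sketch shows that hierarchical, per-CPU accumulation; it is plain C with a fixed array standing in for real percpu storage, and the group layout and names are made up for the example.

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

struct cpuacct {
	uint64_t cpuusage[NR_CPUS];   /* stand-in for the alloc_percpu(u64) counters */
	struct cpuacct *parent;
};

/* Mirrors the loop in cpuacct_charge(): add cputime to this group and every ancestor. */
static void charge(struct cpuacct *ca, int cpu, uint64_t cputime)
{
	for (; ca; ca = ca->parent)
		ca->cpuusage[cpu] += cputime;
}

/* Demo helper: sum the per-CPU counters of one group. */
static uint64_t total_usage(const struct cpuacct *ca)
{
	uint64_t sum = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += ca->cpuusage[cpu];
	return sum;
}

int main(void)
{
	struct cpuacct root  = { .parent = NULL };
	struct cpuacct child = { .parent = &root };

	charge(&child, 1, 1000);      /* 1000 units on CPU 1 */
	charge(&child, 3, 500);       /* 500 units on CPU 3 */

	printf("child=%llu root=%llu\n",
	       (unsigned long long)total_usage(&child),
	       (unsigned long long)total_usage(&root));
	return 0;
}

After the two charges both child and root report 1500: one walk up the parent chain at charge time replaces any recursive summing at read time, which is the behaviour the cpuacct_charge() loop above relies on.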
@@ -43,6 +43,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)  static __read_mostly int sched_clock_running;  #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__read_mostly int sched_clock_stable;  struct sched_clock_data {  	/* @@ -87,7 +88,7 @@ void sched_clock_init(void)  }  /* - * min,max except they take wrapping into account + * min, max except they take wrapping into account   */  static inline u64 wrap_min(u64 x, u64 y) @@ -111,15 +112,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)  	s64 delta = now - scd->tick_raw;  	u64 clock, min_clock, max_clock; -	WARN_ON_ONCE(!irqs_disabled()); -  	if (unlikely(delta < 0))  		delta = 0;  	/*  	 * scd->clock = clamp(scd->tick_gtod + delta, -	 * 		      max(scd->tick_gtod, scd->clock), -	 * 		      scd->tick_gtod + TICK_NSEC); +	 *		      max(scd->tick_gtod, scd->clock), +	 *		      scd->tick_gtod + TICK_NSEC);  	 */  	clock = scd->tick_gtod + delta; @@ -148,12 +147,13 @@ static void lock_double_clock(struct sched_clock_data *data1,  u64 sched_clock_cpu(int cpu)  { -	struct sched_clock_data *scd = cpu_sdc(cpu);  	u64 now, clock, this_clock, remote_clock; +	struct sched_clock_data *scd; -	if (unlikely(!sched_clock_running)) -		return 0ull; +	if (sched_clock_stable) +		return sched_clock(); +	scd = cpu_sdc(cpu);  	WARN_ON_ONCE(!irqs_disabled());  	now = sched_clock(); @@ -195,14 +195,18 @@ u64 sched_clock_cpu(int cpu)  void sched_clock_tick(void)  { -	struct sched_clock_data *scd = this_scd(); +	struct sched_clock_data *scd;  	u64 now, now_gtod; +	if (sched_clock_stable) +		return; +  	if (unlikely(!sched_clock_running))  		return;  	WARN_ON_ONCE(!irqs_disabled()); +	scd = this_scd();  	now_gtod = ktime_to_ns(ktime_get());  	now = sched_clock(); @@ -250,7 +254,7 @@ u64 sched_clock_cpu(int cpu)  	return sched_clock();  } -#endif +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */  unsigned long long cpu_clock(int cpu)  { diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 642a94ef8a0..9a7e859b8fb 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -25,7 +25,7 @@ struct cpupri {  #ifdef CONFIG_SMP  int  cpupri_find(struct cpupri *cp, -		 struct task_struct *p, cpumask_t *lowest_mask); +		 struct task_struct *p, struct cpumask *lowest_mask);  void cpupri_set(struct cpupri *cp, int cpu, int pri);  int cpupri_init(struct cpupri *cp, bool bootmem);  void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 16eeba4e416..467ca72f165 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -272,7 +272,6 @@ static void print_cpu(struct seq_file *m, int cpu)  	P(nr_switches);  	P(nr_load_updates);  	P(nr_uninterruptible); -	SEQ_printf(m, "  .%-30s: %lu\n", "jiffies", jiffies);  	PN(next_balance);  	P(curr->pid);  	PN(clock); @@ -287,9 +286,6 @@ static void print_cpu(struct seq_file *m, int cpu)  #ifdef CONFIG_SCHEDSTATS  #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n); -	P(yld_exp_empty); -	P(yld_act_empty); -	P(yld_both_empty);  	P(yld_count);  	P(sched_switch); @@ -314,7 +310,7 @@ static int sched_debug_show(struct seq_file *m, void *v)  	u64 now = ktime_to_ns(ktime_get());  	int cpu; -	SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n", +	SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",  		init_utsname()->release,  		(int)strcspn(init_utsname()->version, " "),  		init_utsname()->version); @@ -325,6 +321,7 @@ static int sched_debug_show(struct seq_file *m, void *v)  	SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))  #define 
PN(x) \  	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) +	P(jiffies);  	PN(sysctl_sched_latency);  	PN(sysctl_sched_min_granularity);  	PN(sysctl_sched_wakeup_granularity); @@ -397,6 +394,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  	PN(se.vruntime);  	PN(se.sum_exec_runtime);  	PN(se.avg_overlap); +	PN(se.avg_wakeup);  	nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0566f2a03c4..3816f217f11 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1314,16 +1314,63 @@ out:  }  #endif /* CONFIG_SMP */ -static unsigned long wakeup_gran(struct sched_entity *se) +/* + * Adaptive granularity + * + * se->avg_wakeup gives the average time a task runs until it does a wakeup, + * with the limit of wakeup_gran -- when it never does a wakeup. + * + * So the smaller avg_wakeup is the faster we want this task to preempt, + * but we don't want to treat the preemptee unfairly and therefore allow it + * to run for at least the amount of time we'd like to run. + * + * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one + * + * NOTE: we use *nr_running to scale with load, this nicely matches the + *       degrading latency on load. + */ +static unsigned long +adaptive_gran(struct sched_entity *curr, struct sched_entity *se) +{ +	u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; +	u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running; +	u64 gran = 0; + +	if (this_run < expected_wakeup) +		gran = expected_wakeup - this_run; + +	return min_t(s64, gran, sysctl_sched_wakeup_granularity); +} + +static unsigned long +wakeup_gran(struct sched_entity *curr, struct sched_entity *se)  {  	unsigned long gran = sysctl_sched_wakeup_granularity; +	if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN)) +		gran = adaptive_gran(curr, se); +  	/* -	 * More easily preempt - nice tasks, while not making it harder for -	 * + nice tasks. +	 * Since its curr running now, convert the gran from real-time +	 * to virtual-time in his units.  	 */ -	if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD) -		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); +	if (sched_feat(ASYM_GRAN)) { +		/* +		 * By using 'se' instead of 'curr' we penalize light tasks, so +		 * they get preempted easier. That is, if 'se' < 'curr' then +		 * the resulting gran will be larger, therefore penalizing the +		 * lighter, if otoh 'se' > 'curr' then the resulting gran will +		 * be smaller, again penalizing the lighter task. +		 * +		 * This is especially important for buddies when the leftmost +		 * task is higher priority than the buddy. 
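The ADAPTIVE_GRAN feature added above derives the preemption granularity from how soon the woken task usually performs its next wakeup: twice its avg_wakeup, scaled by the number of runnable tasks and capped at sysctl_sched_wakeup_granularity. A small self-contained model of just that arithmetic, with an assumed 5ms cap standing in for the sysctl:

#include <stdio.h>
#include <stdint.h>

#define WAKEUP_GRAN_NS 5000000ULL	/* assumed sysctl_sched_wakeup_granularity */

/* the shorter the woken task usually runs before doing its own wakeup
 * (small avg_wakeup), the sooner it may preempt; scaled by the number of
 * runnable tasks and capped at the sysctl limit */
static uint64_t adaptive_gran(uint64_t curr_this_run, uint64_t avg_wakeup,
			      unsigned int nr_running)
{
	uint64_t expected_wakeup = 2 * avg_wakeup * nr_running;
	uint64_t gran = 0;

	if (curr_this_run < expected_wakeup)
		gran = expected_wakeup - curr_this_run;

	return gran < WAKEUP_GRAN_NS ? gran : WAKEUP_GRAN_NS;
}

int main(void)
{
	/* current task has run 1ms this slice; the woken task typically runs
	 * 0.5ms before waking someone; 2 tasks runnable */
	printf("%llu ns\n", (unsigned long long)adaptive_gran(1000000, 500000, 2));
	return 0;
}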
+		 */ +		if (unlikely(se->load.weight != NICE_0_LOAD)) +			gran = calc_delta_fair(gran, se); +	} else { +		if (unlikely(curr->load.weight != NICE_0_LOAD)) +			gran = calc_delta_fair(gran, curr); +	}  	return gran;  } @@ -1350,7 +1397,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)  	if (vdiff <= 0)  		return -1; -	gran = wakeup_gran(curr); +	gran = wakeup_gran(curr, se);  	if (vdiff > gran)  		return 1; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index da5d93b5d2c..4569bfa7df9 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,5 +1,6 @@  SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) -SCHED_FEAT(NORMALIZED_SLEEPER, 1) +SCHED_FEAT(NORMALIZED_SLEEPER, 0) +SCHED_FEAT(ADAPTIVE_GRAN, 1)  SCHED_FEAT(WAKEUP_PREEMPT, 1)  SCHED_FEAT(START_DEBIT, 1)  SCHED_FEAT(AFFINE_WAKEUPS, 1) @@ -13,3 +14,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1)  SCHED_FEAT(ASYM_EFF_LOAD, 1)  SCHED_FEAT(WAKEUP_OVERLAP, 0)  SCHED_FEAT(LAST_BUDDY, 1) +SCHED_FEAT(OWNER_SPIN, 1) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bac1061cea2..299d012b439 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -3,6 +3,40 @@   * policies)   */ +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ +	return container_of(rt_se, struct task_struct, rt); +} + +#ifdef CONFIG_RT_GROUP_SCHED + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ +	return rt_rq->rq; +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ +	return rt_se->rt_rq; +} + +#else /* CONFIG_RT_GROUP_SCHED */ + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ +	return container_of(rt_rq, struct rq, rt); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ +	struct task_struct *p = rt_task_of(rt_se); +	struct rq *rq = task_rq(p); + +	return &rq->rt; +} + +#endif /* CONFIG_RT_GROUP_SCHED */ +  #ifdef CONFIG_SMP  static inline int rt_overloaded(struct rq *rq) @@ -37,25 +71,69 @@ static inline void rt_clear_overload(struct rq *rq)  	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);  } -static void update_rt_migration(struct rq *rq) +static void update_rt_migration(struct rt_rq *rt_rq)  { -	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { -		if (!rq->rt.overloaded) { -			rt_set_overload(rq); -			rq->rt.overloaded = 1; +	if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { +		if (!rt_rq->overloaded) { +			rt_set_overload(rq_of_rt_rq(rt_rq)); +			rt_rq->overloaded = 1;  		} -	} else if (rq->rt.overloaded) { -		rt_clear_overload(rq); -		rq->rt.overloaded = 0; +	} else if (rt_rq->overloaded) { +		rt_clear_overload(rq_of_rt_rq(rt_rq)); +		rt_rq->overloaded = 0;  	}  } -#endif /* CONFIG_SMP */ -static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	if (rt_se->nr_cpus_allowed > 1) +		rt_rq->rt_nr_migratory++; + +	update_rt_migration(rt_rq); +} + +static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	if (rt_se->nr_cpus_allowed > 1) +		rt_rq->rt_nr_migratory--; + +	update_rt_migration(rt_rq); +} + +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ +	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); +	plist_node_init(&p->pushable_tasks, p->prio); +	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ +	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + 
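The enqueue_pushable_task()/dequeue_pushable_task() helpers above maintain a priority-sorted plist of RT tasks that are allowed to migrate, so the push logic can grab the best candidate without scanning the whole runqueue; deleting, re-initialising the node with the task's current prio and re-adding it is how a priority change re-sorts the entry. A toy userspace model of that ordering, using a plain sorted list instead of the kernel plist API:

#include <stdio.h>

/* toy priority-sorted list standing in for the plist of pushable tasks:
 * lower prio value = higher priority, kept at the head */
struct ptask {
	int prio;
	struct ptask *next;
};

static void enqueue_pushable(struct ptask **head, struct ptask *p)
{
	struct ptask **pp = head;

	while (*pp && (*pp)->prio <= p->prio)	/* keep FIFO order within a prio */
		pp = &(*pp)->next;
	p->next = *pp;
	*pp = p;
}

static struct ptask *pick_next_pushable(struct ptask *head)
{
	return head;	/* highest-priority pushable task, or NULL */
}

int main(void)
{
	struct ptask a = { .prio = 10 }, b = { .prio = 5 };
	struct ptask *head = NULL;

	enqueue_pushable(&head, &a);
	enqueue_pushable(&head, &b);
	printf("next pushable prio = %d\n", pick_next_pushable(head)->prio); /* 5 */
	return 0;
}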
+#else + +static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)  { -	return container_of(rt_se, struct task_struct, rt);  } +static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +} + +static inline +void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +} + +#endif /* CONFIG_SMP */ +  static inline int on_rt_rq(struct sched_rt_entity *rt_se)  {  	return !list_empty(&rt_se->run_list); @@ -79,16 +157,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)  #define for_each_leaf_rt_rq(rt_rq, rq) \  	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) -{ -	return rt_rq->rq; -} - -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) -{ -	return rt_se->rt_rq; -} -  #define for_each_sched_rt_entity(rt_se) \  	for (; rt_se; rt_se = rt_se->parent) @@ -108,7 +176,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  	if (rt_rq->rt_nr_running) {  		if (rt_se && !on_rt_rq(rt_se))  			enqueue_rt_entity(rt_se); -		if (rt_rq->highest_prio < curr->prio) +		if (rt_rq->highest_prio.curr < curr->prio)  			resched_task(curr);  	}  } @@ -176,19 +244,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)  #define for_each_leaf_rt_rq(rt_rq, rq) \  	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) -{ -	return container_of(rt_rq, struct rq, rt); -} - -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) -{ -	struct task_struct *p = rt_task_of(rt_se); -	struct rq *rq = task_rq(p); - -	return &rq->rt; -} -  #define for_each_sched_rt_entity(rt_se) \  	for (; rt_se; rt_se = NULL) @@ -473,7 +528,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)  	struct rt_rq *rt_rq = group_rt_rq(rt_se);  	if (rt_rq) -		return rt_rq->highest_prio; +		return rt_rq->highest_prio.curr;  #endif  	return rt_task_of(rt_se)->prio; @@ -547,91 +602,174 @@ static void update_curr_rt(struct rq *rq)  	}  } -static inline -void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +#if defined CONFIG_SMP + +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); + +static inline int next_prio(struct rq *rq)  { -	WARN_ON(!rt_prio(rt_se_prio(rt_se))); -	rt_rq->rt_nr_running++; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED -	if (rt_se_prio(rt_se) < rt_rq->highest_prio) { -#ifdef CONFIG_SMP -		struct rq *rq = rq_of_rt_rq(rt_rq); -#endif +	struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); + +	if (next && rt_prio(next->prio)) +		return next->prio; +	else +		return MAX_RT_PRIO; +} + +static void +inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) +{ +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	if (prio < prev_prio) { + +		/* +		 * If the new task is higher in priority than anything on the +		 * run-queue, we know that the previous high becomes our +		 * next-highest. 
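inc_rt_prio()/inc_rt_prio_smp() above keep a two-level cache of runqueue priorities: highest_prio.curr is the best queued priority and highest_prio.next the runner-up, so pull decisions can be made without locking. A compressed model of the enqueue-side update; the recomputed_next parameter stands in for the full next_prio() rescan and MAX_RT_PRIO is assumed to be 100 as in the kernel:

#include <stdio.h>

#define MAX_RT_PRIO 100

/* cached top two RT priorities on a runqueue (lower value = higher prio) */
struct prio_cache {
	int curr;	/* best priority currently queued */
	int next;	/* best priority after that */
};

/* update the cache when a task of priority @prio is enqueued */
static void inc_prio(struct prio_cache *pc, int prio, int recomputed_next)
{
	int prev = pc->curr;

	if (prio < prev)
		pc->curr = prio;

	if (prio < prev)
		pc->next = prev;		/* old best becomes second best */
	else if (prio == pc->curr)
		pc->next = prio;		/* a twin of the current best */
	else if (prio < pc->next)
		pc->next = recomputed_next;	/* somewhere in between: rescan */
}

int main(void)
{
	struct prio_cache pc = { MAX_RT_PRIO, MAX_RT_PRIO };

	inc_prio(&pc, 50, MAX_RT_PRIO);
	inc_prio(&pc, 40, 50);
	printf("curr=%d next=%d\n", pc.curr, pc.next);	/* 40 50 */
	return 0;
}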
+		 */ +		rt_rq->highest_prio.next = prev_prio; -		rt_rq->highest_prio = rt_se_prio(rt_se); -#ifdef CONFIG_SMP  		if (rq->online) -			cpupri_set(&rq->rd->cpupri, rq->cpu, -				   rt_se_prio(rt_se)); -#endif -	} -#endif -#ifdef CONFIG_SMP -	if (rt_se->nr_cpus_allowed > 1) { -		struct rq *rq = rq_of_rt_rq(rt_rq); +			cpupri_set(&rq->rd->cpupri, rq->cpu, prio); -		rq->rt.rt_nr_migratory++; -	} +	} else if (prio == rt_rq->highest_prio.curr) +		/* +		 * If the next task is equal in priority to the highest on +		 * the run-queue, then we implicitly know that the next highest +		 * task cannot be any lower than current +		 */ +		rt_rq->highest_prio.next = prio; +	else if (prio < rt_rq->highest_prio.next) +		/* +		 * Otherwise, we need to recompute next-highest +		 */ +		rt_rq->highest_prio.next = next_prio(rq); +} -	update_rt_migration(rq_of_rt_rq(rt_rq)); -#endif -#ifdef CONFIG_RT_GROUP_SCHED -	if (rt_se_boosted(rt_se)) -		rt_rq->rt_nr_boosted++; +static void +dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) +{ +	struct rq *rq = rq_of_rt_rq(rt_rq); -	if (rt_rq->tg) -		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); -#else -	start_rt_bandwidth(&def_rt_bandwidth); -#endif +	if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) +		rt_rq->highest_prio.next = next_prio(rq); + +	if (rq->online && rt_rq->highest_prio.curr != prev_prio) +		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);  } +#else /* CONFIG_SMP */ +  static inline -void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -#ifdef CONFIG_SMP -	int highest_prio = rt_rq->highest_prio; -#endif +void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} +static inline +void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} + +#endif /* CONFIG_SMP */ -	WARN_ON(!rt_prio(rt_se_prio(rt_se))); -	WARN_ON(!rt_rq->rt_nr_running); -	rt_rq->rt_nr_running--;  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +static void +inc_rt_prio(struct rt_rq *rt_rq, int prio) +{ +	int prev_prio = rt_rq->highest_prio.curr; + +	if (prio < prev_prio) +		rt_rq->highest_prio.curr = prio; + +	inc_rt_prio_smp(rt_rq, prio, prev_prio); +} + +static void +dec_rt_prio(struct rt_rq *rt_rq, int prio) +{ +	int prev_prio = rt_rq->highest_prio.curr; +  	if (rt_rq->rt_nr_running) { -		struct rt_prio_array *array; -		WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); -		if (rt_se_prio(rt_se) == rt_rq->highest_prio) { -			/* recalculate */ -			array = &rt_rq->active; -			rt_rq->highest_prio = +		WARN_ON(prio < prev_prio); + +		/* +		 * This may have been our highest task, and therefore +		 * we may have some recomputation to do +		 */ +		if (prio == prev_prio) { +			struct rt_prio_array *array = &rt_rq->active; + +			rt_rq->highest_prio.curr =  				sched_find_first_bit(array->bitmap); -		} /* otherwise leave rq->highest prio alone */ +		} +  	} else -		rt_rq->highest_prio = MAX_RT_PRIO; -#endif -#ifdef CONFIG_SMP -	if (rt_se->nr_cpus_allowed > 1) { -		struct rq *rq = rq_of_rt_rq(rt_rq); -		rq->rt.rt_nr_migratory--; -	} +		rt_rq->highest_prio.curr = MAX_RT_PRIO; -	if (rt_rq->highest_prio != highest_prio) { -		struct rq *rq = rq_of_rt_rq(rt_rq); +	dec_rt_prio_smp(rt_rq, prio, prev_prio); +} -		if (rq->online) -			cpupri_set(&rq->rd->cpupri, rq->cpu, -				   rt_rq->highest_prio); -	} +#else + +static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} +static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} + +#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ -	
update_rt_migration(rq_of_rt_rq(rt_rq)); -#endif /* CONFIG_SMP */  #ifdef CONFIG_RT_GROUP_SCHED + +static void +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	if (rt_se_boosted(rt_se)) +		rt_rq->rt_nr_boosted++; + +	if (rt_rq->tg) +		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); +} + +static void +dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{  	if (rt_se_boosted(rt_se))  		rt_rq->rt_nr_boosted--;  	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); -#endif +} + +#else /* CONFIG_RT_GROUP_SCHED */ + +static void +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	start_rt_bandwidth(&def_rt_bandwidth); +} + +static inline +void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} + +#endif /* CONFIG_RT_GROUP_SCHED */ + +static inline +void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	int prio = rt_se_prio(rt_se); + +	WARN_ON(!rt_prio(prio)); +	rt_rq->rt_nr_running++; + +	inc_rt_prio(rt_rq, prio); +	inc_rt_migration(rt_se, rt_rq); +	inc_rt_group(rt_se, rt_rq); +} + +static inline +void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	WARN_ON(!rt_prio(rt_se_prio(rt_se))); +	WARN_ON(!rt_rq->rt_nr_running); +	rt_rq->rt_nr_running--; + +	dec_rt_prio(rt_rq, rt_se_prio(rt_se)); +	dec_rt_migration(rt_se, rt_rq); +	dec_rt_group(rt_se, rt_rq);  }  static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) @@ -718,6 +856,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)  	enqueue_rt_entity(rt_se); +	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) +		enqueue_pushable_task(rq, p); +  	inc_cpu_load(rq, p->se.load.weight);  } @@ -728,6 +869,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)  	update_curr_rt(rq);  	dequeue_rt_entity(rt_se); +	dequeue_pushable_task(rq, p); +  	dec_cpu_load(rq, p->se.load.weight);  } @@ -878,7 +1021,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,  	return next;  } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct *_pick_next_task_rt(struct rq *rq)  {  	struct sched_rt_entity *rt_se;  	struct task_struct *p; @@ -900,6 +1043,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)  	p = rt_task_of(rt_se);  	p->se.exec_start = rq->clock; + +	return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ +	struct task_struct *p = _pick_next_task_rt(rq); + +	/* The running task is never eligible for pushing */ +	if (p) +		dequeue_pushable_task(rq, p); +  	return p;  } @@ -907,6 +1062,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)  {  	update_curr_rt(rq);  	p->se.exec_start = 0; + +	/* +	 * The previous task needs to be made eligible for pushing +	 * if it is still active +	 */ +	if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) +		enqueue_pushable_task(rq, p);  }  #ifdef CONFIG_SMP @@ -960,12 +1122,13 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)  static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); -static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) +static inline int pick_optimal_cpu(int this_cpu, +				   const struct cpumask *mask)  {  	int first;  	/* "this_cpu" is cheaper to preempt than a remote processor */ -	if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) +	if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))  		return this_cpu;  	first = cpumask_first(mask); @@ -981,6 +1144,7 @@ static int 
find_lowest_rq(struct task_struct *task)  	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);  	int this_cpu = smp_processor_id();  	int cpu      = task_cpu(task); +	cpumask_var_t domain_mask;  	if (task->rt.nr_cpus_allowed == 1)  		return -1; /* No other targets possible */ @@ -1013,19 +1177,25 @@ static int find_lowest_rq(struct task_struct *task)  	if (this_cpu == cpu)  		this_cpu = -1; /* Skip this_cpu opt if the same */ -	for_each_domain(cpu, sd) { -		if (sd->flags & SD_WAKE_AFFINE) { -			cpumask_t domain_mask; -			int       best_cpu; +	if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { +		for_each_domain(cpu, sd) { +			if (sd->flags & SD_WAKE_AFFINE) { +				int best_cpu; + +				cpumask_and(domain_mask, +					    sched_domain_span(sd), +					    lowest_mask); -			cpumask_and(&domain_mask, sched_domain_span(sd), -				    lowest_mask); +				best_cpu = pick_optimal_cpu(this_cpu, +							    domain_mask); -			best_cpu = pick_optimal_cpu(this_cpu, -						    &domain_mask); -			if (best_cpu != -1) -				return best_cpu; +				if (best_cpu != -1) { +					free_cpumask_var(domain_mask); +					return best_cpu; +				} +			}  		} +		free_cpumask_var(domain_mask);  	}  	/* @@ -1072,7 +1242,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)  		}  		/* If this rq is still suitable use it. */ -		if (lowest_rq->rt.highest_prio > task->prio) +		if (lowest_rq->rt.highest_prio.curr > task->prio)  			break;  		/* try again */ @@ -1083,6 +1253,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)  	return lowest_rq;  } +static inline int has_pushable_tasks(struct rq *rq) +{ +	return !plist_head_empty(&rq->rt.pushable_tasks); +} + +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ +	struct task_struct *p; + +	if (!has_pushable_tasks(rq)) +		return NULL; + +	p = plist_first_entry(&rq->rt.pushable_tasks, +			      struct task_struct, pushable_tasks); + +	BUG_ON(rq->cpu != task_cpu(p)); +	BUG_ON(task_current(rq, p)); +	BUG_ON(p->rt.nr_cpus_allowed <= 1); + +	BUG_ON(!p->se.on_rq); +	BUG_ON(!rt_task(p)); + +	return p; +} +  /*   * If the current CPU has more than one RT task, see if the non   * running task can migrate over to a CPU that is running a task @@ -1092,13 +1287,11 @@ static int push_rt_task(struct rq *rq)  {  	struct task_struct *next_task;  	struct rq *lowest_rq; -	int ret = 0; -	int paranoid = RT_MAX_TRIES;  	if (!rq->rt.overloaded)  		return 0; -	next_task = pick_next_highest_task_rt(rq, -1); +	next_task = pick_next_pushable_task(rq);  	if (!next_task)  		return 0; @@ -1127,16 +1320,34 @@ static int push_rt_task(struct rq *rq)  		struct task_struct *task;  		/*  		 * find lock_lowest_rq releases rq->lock -		 * so it is possible that next_task has changed. -		 * If it has, then try again. +		 * so it is possible that next_task has migrated. +		 * +		 * We need to make sure that the task is still on the same +		 * run-queue and is also still the next task eligible for +		 * pushing.  		 */ -		task = pick_next_highest_task_rt(rq, -1); -		if (unlikely(task != next_task) && task && paranoid--) { -			put_task_struct(next_task); -			next_task = task; -			goto retry; +		task = pick_next_pushable_task(rq); +		if (task_cpu(next_task) == rq->cpu && task == next_task) { +			/* +			 * If we get here, the task hasnt moved at all, but +			 * it has failed to push.  We will not try again, +			 * since the other cpus will pull from us when they +			 * are ready. 
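find_lowest_rq() now allocates a scratch cpumask, intersects each wake-affine sched domain with the lowest-priority mask, and lets pick_optimal_cpu() prefer the local CPU when it qualifies, since preempting locally is cheaper than an IPI. A toy model of that preference using a plain bitmask rather than struct cpumask:

#include <stdio.h>

/* toy cpu mask: bit n set means cpu n is an acceptable (lowest-priority) target */
static int pick_optimal_cpu(int this_cpu, unsigned long mask)
{
	int cpu;

	/* preempting the local cpu is cheaper than sending an IPI elsewhere */
	if (this_cpu != -1 && (mask & (1UL << this_cpu)))
		return this_cpu;

	for (cpu = 0; cpu < (int)(8 * sizeof(mask)); cpu++)
		if (mask & (1UL << cpu))
			return cpu;

	return -1;	/* empty mask */
}

int main(void)
{
	printf("%d\n", pick_optimal_cpu(2, 0x0c));	/* 2: local cpu is eligible */
	printf("%d\n", pick_optimal_cpu(0, 0x0c));	/* 2: first eligible remote cpu */
	return 0;
}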
+			 */ +			dequeue_pushable_task(rq, next_task); +			goto out;  		} -		goto out; + +		if (!task) +			/* No more tasks, just exit */ +			goto out; + +		/* +		 * Something has shifted, try again. +		 */ +		put_task_struct(next_task); +		next_task = task; +		goto retry;  	}  	deactivate_task(rq, next_task, 0); @@ -1147,23 +1358,12 @@ static int push_rt_task(struct rq *rq)  	double_unlock_balance(rq, lowest_rq); -	ret = 1;  out:  	put_task_struct(next_task); -	return ret; +	return 1;  } -/* - * TODO: Currently we just use the second highest prio task on - *       the queue, and stop when it can't migrate (or there's - *       no more RT tasks).  There may be a case where a lower - *       priority RT task has a different affinity than the - *       higher RT task. In this case the lower RT task could - *       possibly be able to migrate where as the higher priority - *       RT task could not.  We currently ignore this issue. - *       Enhancements are welcome! - */  static void push_rt_tasks(struct rq *rq)  {  	/* push_rt_task will return true if it moved an RT */ @@ -1174,33 +1374,35 @@ static void push_rt_tasks(struct rq *rq)  static int pull_rt_task(struct rq *this_rq)  {  	int this_cpu = this_rq->cpu, ret = 0, cpu; -	struct task_struct *p, *next; +	struct task_struct *p;  	struct rq *src_rq;  	if (likely(!rt_overloaded(this_rq)))  		return 0; -	next = pick_next_task_rt(this_rq); -  	for_each_cpu(cpu, this_rq->rd->rto_mask) {  		if (this_cpu == cpu)  			continue;  		src_rq = cpu_rq(cpu); + +		/* +		 * Don't bother taking the src_rq->lock if the next highest +		 * task is known to be lower-priority than our current task. +		 * This may look racy, but if this value is about to go +		 * logically higher, the src_rq will push this task away. +		 * And if its going logically lower, we do not care +		 */ +		if (src_rq->rt.highest_prio.next >= +		    this_rq->rt.highest_prio.curr) +			continue; +  		/*  		 * We can potentially drop this_rq's lock in  		 * double_lock_balance, and another CPU could -		 * steal our next task - hence we must cause -		 * the caller to recalculate the next task -		 * in that case: +		 * alter this_rq  		 */ -		if (double_lock_balance(this_rq, src_rq)) { -			struct task_struct *old_next = next; - -			next = pick_next_task_rt(this_rq); -			if (next != old_next) -				ret = 1; -		} +		double_lock_balance(this_rq, src_rq);  		/*  		 * Are there still pullable RT tasks? @@ -1214,7 +1416,7 @@ static int pull_rt_task(struct rq *this_rq)  		 * Do we have an RT task that preempts  		 * the to-be-scheduled task?  		 */ -		if (p && (!next || (p->prio < next->prio))) { +		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {  			WARN_ON(p == src_rq->curr);  			WARN_ON(!p->se.on_rq); @@ -1224,12 +1426,9 @@ static int pull_rt_task(struct rq *this_rq)  			 * This is just that p is wakeing up and hasn't  			 * had a chance to schedule. We only pull  			 * p if it is lower in priority than the -			 * current task on the run queue or -			 * this_rq next task is lower in prio than -			 * the current task on that rq. +			 * current task on the run queue  			 */ -			if (p->prio < src_rq->curr->prio || -			    (next && next->prio < src_rq->curr->prio)) +			if (p->prio < src_rq->curr->prio)  				goto skip;  			ret = 1; @@ -1242,13 +1441,7 @@ static int pull_rt_task(struct rq *this_rq)  			 * case there's an even higher prio task  			 * in another runqueue. 
(low likelyhood  			 * but possible) -			 * -			 * Update next so that we won't pick a task -			 * on another cpu with a priority lower (or equal) -			 * than the one we just picked.  			 */ -			next = p; -  		}   skip:  		double_unlock_balance(this_rq, src_rq); @@ -1260,24 +1453,27 @@ static int pull_rt_task(struct rq *this_rq)  static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)  {  	/* Try to pull RT tasks here if we lower this rq's prio */ -	if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) +	if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)  		pull_rt_task(rq);  } +/* + * assumes rq->lock is held + */ +static int needs_post_schedule_rt(struct rq *rq) +{ +	return has_pushable_tasks(rq); +} +  static void post_schedule_rt(struct rq *rq)  {  	/* -	 * If we have more than one rt_task queued, then -	 * see if we can push the other rt_tasks off to other CPUS. -	 * Note we may release the rq lock, and since -	 * the lock was owned by prev, we need to release it -	 * first via finish_lock_switch and then reaquire it here. +	 * This is only called if needs_post_schedule_rt() indicates that +	 * we need to push tasks away  	 */ -	if (unlikely(rq->rt.overloaded)) { -		spin_lock_irq(&rq->lock); -		push_rt_tasks(rq); -		spin_unlock_irq(&rq->lock); -	} +	spin_lock_irq(&rq->lock); +	push_rt_tasks(rq); +	spin_unlock_irq(&rq->lock);  }  /* @@ -1288,7 +1484,8 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)  {  	if (!task_running(rq, p) &&  	    !test_tsk_need_resched(rq->curr) && -	    rq->rt.overloaded) +	    has_pushable_tasks(rq) && +	    p->rt.nr_cpus_allowed > 1)  		push_rt_tasks(rq);  } @@ -1324,6 +1521,24 @@ static void set_cpus_allowed_rt(struct task_struct *p,  	if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {  		struct rq *rq = task_rq(p); +		if (!task_current(rq, p)) { +			/* +			 * Make sure we dequeue this task from the pushable list +			 * before going further.  It will either remain off of +			 * the list because we are no longer pushable, or it +			 * will be requeued. +			 */ +			if (p->rt.nr_cpus_allowed > 1) +				dequeue_pushable_task(rq, p); + +			/* +			 * Requeue if our weight is changing and still > 1 +			 */ +			if (weight > 1) +				enqueue_pushable_task(rq, p); + +		} +  		if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {  			rq->rt.rt_nr_migratory++;  		} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { @@ -1331,7 +1546,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,  			rq->rt.rt_nr_migratory--;  		} -		update_rt_migration(rq); +		update_rt_migration(&rq->rt);  	}  	cpumask_copy(&p->cpus_allowed, new_mask); @@ -1346,7 +1561,7 @@ static void rq_online_rt(struct rq *rq)  	__enable_runtime(rq); -	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); +	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);  }  /* Assumes rq->lock is held */ @@ -1438,7 +1653,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,  		 * can release the rq lock and p could migrate.  		 * Only reschedule if p is still on the same runqueue.  		 
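pull_rt_task() above uses the cached highest_prio.next of the source runqueue to skip the expensive double_lock_balance() entirely when even the source's second-best task could not preempt what this runqueue is about to run (the source's best task stays where it is, so next is what could be pulled). A small model of that filter; lower numbers mean higher RT priority:

#include <stdio.h>
#include <stdbool.h>

/* lower value = higher RT priority */
struct rq_prio {
	int curr;	/* best queued priority */
	int next;	/* second-best queued priority */
};

/* Should we bother locking the source runqueue at all?  Only if its
 * second-best task could beat what we are about to run locally. */
static bool worth_pulling(const struct rq_prio *src, const struct rq_prio *dst)
{
	return src->next < dst->curr;
}

int main(void)
{
	struct rq_prio src = { .curr = 10, .next = 40 };
	struct rq_prio dst = { .curr = 30, .next = 90 };

	/* src's spare task (prio 40) cannot preempt our prio-30 task: skip it */
	printf("pull? %d\n", worth_pulling(&src, &dst));	/* 0 */
	return 0;
}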
*/ -		if (p->prio > rq->rt.highest_prio && rq->curr == p) +		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)  			resched_task(p);  #else  		/* For UP simply resched on drop of prio */ @@ -1509,6 +1724,9 @@ static void set_curr_task_rt(struct rq *rq)  	struct task_struct *p = rq->curr;  	p->se.exec_start = rq->clock; + +	/* The running task is never eligible for pushing */ +	dequeue_pushable_task(rq, p);  }  static const struct sched_class rt_sched_class = { @@ -1531,6 +1749,7 @@ static const struct sched_class rt_sched_class = {  	.rq_online              = rq_online_rt,  	.rq_offline             = rq_offline_rt,  	.pre_schedule		= pre_schedule_rt, +	.needs_post_schedule	= needs_post_schedule_rt,  	.post_schedule		= post_schedule_rt,  	.task_wake_up		= task_wake_up_rt,  	.switched_from		= switched_from_rt, diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index a8f93dd374e..32d2bd4061b 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -4,7 +4,7 @@   * bump this up when changing the output format or the meaning of an existing   * format, so that tools can adapt (or abort)   */ -#define SCHEDSTAT_VERSION 14 +#define SCHEDSTAT_VERSION 15  static int show_schedstat(struct seq_file *seq, void *v)  { @@ -26,9 +26,8 @@ static int show_schedstat(struct seq_file *seq, void *v)  		/* runqueue-specific stats */  		seq_printf(seq, -		    "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu", -		    cpu, rq->yld_both_empty, -		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, +		    "cpu%d %u %u %u %u %u %u %llu %llu %lu", +		    cpu, rq->yld_count,  		    rq->sched_switch, rq->sched_count, rq->sched_goidle,  		    rq->ttwu_count, rq->ttwu_local,  		    rq->rq_cpu_time, diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ad64fcb731f..57d4b13b631 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -8,6 +8,7 @@  #include <linux/seccomp.h>  #include <linux/sched.h> +#include <linux/compat.h>  /* #define SECCOMP_DEBUG 1 */  #define NR_SECCOMP_MODES 1 @@ -22,7 +23,7 @@ static int mode1_syscalls[] = {  	0, /* null terminated */  }; -#ifdef TIF_32BIT +#ifdef CONFIG_COMPAT  static int mode1_syscalls_32[] = {  	__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,  	0, /* null terminated */ @@ -37,8 +38,8 @@ void __secure_computing(int this_syscall)  	switch (mode) {  	case 1:  		syscall = mode1_syscalls; -#ifdef TIF_32BIT -		if (test_thread_flag(TIF_32BIT)) +#ifdef CONFIG_COMPAT +		if (is_compat_task())  			syscall = mode1_syscalls_32;  #endif  		do { diff --git a/kernel/signal.c b/kernel/signal.c index 2a74fe87c0d..d8034737db4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -55,10 +55,22 @@ static int sig_handler_ignored(void __user *handler, int sig)  		(handler == SIG_DFL && sig_kernel_ignore(sig));  } -static int sig_ignored(struct task_struct *t, int sig) +static int sig_task_ignored(struct task_struct *t, int sig, +		int from_ancestor_ns)  {  	void __user *handler; +	handler = sig_handler(t, sig); + +	if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && +			handler == SIG_DFL && !from_ancestor_ns) +		return 1; + +	return sig_handler_ignored(handler, sig); +} + +static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) +{  	/*  	 * Blocked signals are never ignored, since the  	 * signal handler may change by the time it is @@ -67,14 +79,13 @@ static int sig_ignored(struct task_struct *t, int sig)  	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))  		return 0; -	handler = 
sig_handler(t, sig); -	if (!sig_handler_ignored(handler, sig)) +	if (!sig_task_ignored(t, sig, from_ancestor_ns))  		return 0;  	/*  	 * Tracers may want to know about even ignored signals.  	 */ -	return !tracehook_consider_ignored_signal(t, sig, handler); +	return !tracehook_consider_ignored_signal(t, sig);  }  /* @@ -318,7 +329,7 @@ int unhandled_signal(struct task_struct *tsk, int sig)  		return 1;  	if (handler != SIG_IGN && handler != SIG_DFL)  		return 0; -	return !tracehook_consider_fatal_signal(tsk, sig, handler); +	return !tracehook_consider_fatal_signal(tsk, sig);  } @@ -624,7 +635,7 @@ static int check_kill_permission(int sig, struct siginfo *info,   * Returns true if the signal should be actually delivered, otherwise   * it should be dropped.   */ -static int prepare_signal(int sig, struct task_struct *p) +static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)  {  	struct signal_struct *signal = p->signal;  	struct task_struct *t; @@ -708,7 +719,7 @@ static int prepare_signal(int sig, struct task_struct *p)  		}  	} -	return !sig_ignored(p, sig); +	return !sig_ignored(p, sig, from_ancestor_ns);  }  /* @@ -777,7 +788,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)  	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&  	    !sigismember(&t->real_blocked, sig) &&  	    (sig == SIGKILL || -	     !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { +	     !tracehook_consider_fatal_signal(t, sig))) {  		/*  		 * This signal will be fatal to the whole group.  		 */ @@ -813,8 +824,8 @@ static inline int legacy_queue(struct sigpending *signals, int sig)  	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);  } -static int send_signal(int sig, struct siginfo *info, struct task_struct *t, -			int group) +static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, +			int group, int from_ancestor_ns)  {  	struct sigpending *pending;  	struct sigqueue *q; @@ -822,7 +833,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,  	trace_sched_signal_send(sig, t);  	assert_spin_locked(&t->sighand->siglock); -	if (!prepare_signal(sig, t)) + +	if (!prepare_signal(sig, t, from_ancestor_ns))  		return 0;  	pending = group ? 
&t->signal->shared_pending : &t->pending; @@ -871,6 +883,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,  			break;  		default:  			copy_siginfo(&q->info, info); +			if (from_ancestor_ns) +				q->info.si_pid = 0;  			break;  		}  	} else if (!is_si_special(info)) { @@ -889,6 +903,20 @@ out_set:  	return 0;  } +static int send_signal(int sig, struct siginfo *info, struct task_struct *t, +			int group) +{ +	int from_ancestor_ns = 0; + +#ifdef CONFIG_PID_NS +	if (!is_si_special(info) && SI_FROMUSER(info) && +			task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) +		from_ancestor_ns = 1; +#endif + +	return __send_signal(sig, info, t, group, from_ancestor_ns); +} +  int print_fatal_signals;  static void print_fatal_signal(struct pt_regs *regs, int signr) @@ -1133,7 +1161,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,  	if (sig && p->sighand) {  		unsigned long flags;  		spin_lock_irqsave(&p->sighand->siglock, flags); -		ret = __group_send_sig_info(sig, info, p); +		ret = __send_signal(sig, info, p, 1, 0);  		spin_unlock_irqrestore(&p->sighand->siglock, flags);  	}  out_unlock: @@ -1320,7 +1348,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)  		goto ret;  	ret = 1; /* the signal is ignored */ -	if (!prepare_signal(sig, t)) +	if (!prepare_signal(sig, t, 0))  		goto out;  	ret = 0; @@ -1575,7 +1603,15 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)  	read_lock(&tasklist_lock);  	if (may_ptrace_stop()) {  		do_notify_parent_cldstop(current, CLD_TRAPPED); +		/* +		 * Don't want to allow preemption here, because +		 * sys_ptrace() needs this task to be inactive. +		 * +		 * XXX: implement read_unlock_no_resched(). +		 */ +		preempt_disable();  		read_unlock(&tasklist_lock); +		preempt_enable_no_resched();  		schedule();  	} else {  		/* @@ -1836,9 +1872,16 @@ relock:  		/*  		 * Global init gets no signals it doesn't want. +		 * Container-init gets no signals it doesn't want from same +		 * container. +		 * +		 * Note that if global/container-init sees a sig_kernel_only() +		 * signal here, the signal must have been generated internally +		 * or must have come from an ancestor namespace. In either +		 * case, the signal cannot be dropped.  		 */  		if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && -		    !signal_group_exit(signal)) +				!sig_kernel_only(signr))  			continue;  		if (sig_kernel_stop(signr)) { diff --git a/kernel/slow-work.c b/kernel/slow-work.c new file mode 100644 index 00000000000..cf2bc01186e --- /dev/null +++ b/kernel/slow-work.c @@ -0,0 +1,640 @@ +/* Worker thread pool for slow items, such as filesystem lookups or mkdirs + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
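The signal.c changes above thread a from_ancestor_ns flag from send_signal() down to sig_ignored(), so a container init (SIGNAL_UNKILLABLE) drops default-action signals raised inside its own pid namespace but still receives them from an ancestor namespace. A compressed userspace model of that decision; the enum and boolean parameters are invented for the example and do not mirror the kernel API:

#include <stdio.h>
#include <stdbool.h>

enum handler { H_DFL, H_IGN, H_CUSTOM };

/* Decide whether a signal sent to an init-like task is simply discarded.
 * A container init (UNKILLABLE) ignores default-action signals, but only
 * when the sender lives in the same pid namespace; signals coming from an
 * ancestor namespace are still delivered. */
static bool task_ignores_signal(bool unkillable, enum handler h,
				bool default_is_ignore, bool from_ancestor_ns)
{
	if (unkillable && h == H_DFL && !from_ancestor_ns)
		return true;

	return h == H_IGN || (h == H_DFL && default_is_ignore);
}

int main(void)
{
	/* a default-action signal to a container init from inside its namespace */
	printf("%d\n", task_ignores_signal(true, H_DFL, false, false));	/* 1 */
	/* the same signal sent from the parent namespace gets through */
	printf("%d\n", task_ignores_signal(true, H_DFL, false, true));		/* 0 */
	return 0;
}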
+ * + * See Documentation/slow-work.txt + */ + +#include <linux/module.h> +#include <linux/slow-work.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/wait.h> + +#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of +					 * things to do */ +#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after +					 * OOM */ + +static void slow_work_cull_timeout(unsigned long); +static void slow_work_oom_timeout(unsigned long); + +#ifdef CONFIG_SYSCTL +static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, +					void __user *, size_t *, loff_t *); + +static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, +					void __user *, size_t *, loff_t *); +#endif + +/* + * The pool of threads has at least min threads in it as long as someone is + * using the facility, and may have as many as max. + * + * A portion of the pool may be processing very slow operations. + */ +static unsigned slow_work_min_threads = 2; +static unsigned slow_work_max_threads = 4; +static unsigned vslow_work_proportion = 50; /* % of threads that may process +					     * very slow work */ + +#ifdef CONFIG_SYSCTL +static const int slow_work_min_min_threads = 2; +static int slow_work_max_max_threads = 255; +static const int slow_work_min_vslow = 1; +static const int slow_work_max_vslow = 99; + +ctl_table slow_work_sysctls[] = { +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "min-threads", +		.data		= &slow_work_min_threads, +		.maxlen		= sizeof(unsigned), +		.mode		= 0644, +		.proc_handler	= slow_work_min_threads_sysctl, +		.extra1		= (void *) &slow_work_min_min_threads, +		.extra2		= &slow_work_max_threads, +	}, +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "max-threads", +		.data		= &slow_work_max_threads, +		.maxlen		= sizeof(unsigned), +		.mode		= 0644, +		.proc_handler	= slow_work_max_threads_sysctl, +		.extra1		= &slow_work_min_threads, +		.extra2		= (void *) &slow_work_max_max_threads, +	}, +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "vslow-percentage", +		.data		= &vslow_work_proportion, +		.maxlen		= sizeof(unsigned), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec_minmax, +		.extra1		= (void *) &slow_work_min_vslow, +		.extra2		= (void *) &slow_work_max_vslow, +	}, +	{ .ctl_name = 0 } +}; +#endif + +/* + * The active state of the thread pool + */ +static atomic_t slow_work_thread_count; +static atomic_t vslow_work_executing_count; + +static bool slow_work_may_not_start_new_thread; +static bool slow_work_cull; /* cull a thread due to lack of activity */ +static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); +static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); +static struct slow_work slow_work_new_thread; /* new thread starter */ + +/* + * The queues of work items and the lock governing access to them.  These are + * shared between all the CPUs.  It doesn't make sense to have per-CPU queues + * as the number of threads bears no relation to the number of CPUs. + * + * There are two queues of work items: one for slow work items, and one for + * very slow work items. + */ +static LIST_HEAD(slow_work_queue); +static LIST_HEAD(vslow_work_queue); +static DEFINE_SPINLOCK(slow_work_queue_lock); + +/* + * The thread controls.  A variable used to signal to the threads that they + * should exit when the queue is empty, a waitqueue used by the threads to wait + * for signals, and a completion set by the last thread to exit. 
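The vslow-percentage knob above bounds how much of the pool may be occupied by very slow items; slow_work_calc_vsmax() (further below) rounds the result up to at least one thread and keeps at least one thread free for ordinary slow work. A self-contained sketch of that calculation, assuming max_threads is at least 2:

#include <stdio.h>

/* How many pool threads may be tied up in "very slow" items: a percentage
 * of the current thread count, at least 1, and always at least one thread
 * short of the whole pool so ordinary slow work can still make progress. */
static unsigned calc_vsmax(unsigned thread_count, unsigned vslow_pct,
			   unsigned max_threads)
{
	unsigned vsmax = thread_count * vslow_pct / 100;

	if (vsmax < 1)
		vsmax = 1;
	if (vsmax > max_threads - 1)
		vsmax = max_threads - 1;
	return vsmax;
}

int main(void)
{
	printf("%u\n", calc_vsmax(4, 50, 4));	/* 2 of 4 threads may go very slow */
	printf("%u\n", calc_vsmax(2, 50, 4));	/* never rounds down to zero */
	return 0;
}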
+ */ +static bool slow_work_threads_should_exit; +static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); +static DECLARE_COMPLETION(slow_work_last_thread_exited); + +/* + * The number of users of the thread pool and its lock.  Whilst this is zero we + * have no threads hanging around, and when this reaches zero, we wait for all + * active or queued work items to complete and kill all the threads we do have. + */ +static int slow_work_user_count; +static DEFINE_MUTEX(slow_work_user_lock); + +/* + * Calculate the maximum number of active threads in the pool that are + * permitted to process very slow work items. + * + * The answer is rounded up to at least 1, but may not equal or exceed the + * maximum number of the threads in the pool.  This means we always have at + * least one thread that can process slow work items, and we always have at + * least one thread that won't get tied up doing so. + */ +static unsigned slow_work_calc_vsmax(void) +{ +	unsigned vsmax; + +	vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; +	vsmax /= 100; +	vsmax = max(vsmax, 1U); +	return min(vsmax, slow_work_max_threads - 1); +} + +/* + * Attempt to execute stuff queued on a slow thread.  Return true if we managed + * it, false if there was nothing to do. + */ +static bool slow_work_execute(void) +{ +	struct slow_work *work = NULL; +	unsigned vsmax; +	bool very_slow; + +	vsmax = slow_work_calc_vsmax(); + +	/* see if we can schedule a new thread to be started if we're not +	 * keeping up with the work */ +	if (!waitqueue_active(&slow_work_thread_wq) && +	    (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && +	    atomic_read(&slow_work_thread_count) < slow_work_max_threads && +	    !slow_work_may_not_start_new_thread) +		slow_work_enqueue(&slow_work_new_thread); + +	/* find something to execute */ +	spin_lock_irq(&slow_work_queue_lock); +	if (!list_empty(&vslow_work_queue) && +	    atomic_read(&vslow_work_executing_count) < vsmax) { +		work = list_entry(vslow_work_queue.next, +				  struct slow_work, link); +		if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) +			BUG(); +		list_del_init(&work->link); +		atomic_inc(&vslow_work_executing_count); +		very_slow = true; +	} else if (!list_empty(&slow_work_queue)) { +		work = list_entry(slow_work_queue.next, +				  struct slow_work, link); +		if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) +			BUG(); +		list_del_init(&work->link); +		very_slow = false; +	} else { +		very_slow = false; /* avoid the compiler warning */ +	} +	spin_unlock_irq(&slow_work_queue_lock); + +	if (!work) +		return false; + +	if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) +		BUG(); + +	work->ops->execute(work); + +	if (very_slow) +		atomic_dec(&vslow_work_executing_count); +	clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); + +	/* if someone tried to enqueue the item whilst we were executing it, +	 * then it'll be left unenqueued to avoid multiple threads trying to +	 * execute it simultaneously +	 * +	 * there is, however, a race between us testing the pending flag and +	 * getting the spinlock, and between the enqueuer setting the pending +	 * flag and getting the spinlock, so we use a deferral bit to tell us +	 * if the enqueuer got there first +	 */ +	if (test_bit(SLOW_WORK_PENDING, &work->flags)) { +		spin_lock_irq(&slow_work_queue_lock); + +		if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && +		    test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) +			goto auto_requeue; + +		
spin_unlock_irq(&slow_work_queue_lock); +	} + +	work->ops->put_ref(work); +	return true; + +auto_requeue: +	/* we must complete the enqueue operation +	 * - we transfer our ref on the item back to the appropriate queue +	 * - don't wake another thread up as we're awake already +	 */ +	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) +		list_add_tail(&work->link, &vslow_work_queue); +	else +		list_add_tail(&work->link, &slow_work_queue); +	spin_unlock_irq(&slow_work_queue_lock); +	return true; +} + +/** + * slow_work_enqueue - Schedule a slow work item for processing + * @work: The work item to queue + * + * Schedule a slow work item for processing.  If the item is already undergoing + * execution, this guarantees not to re-enter the execution routine until the + * first execution finishes. + * + * The item is pinned by this function as it retains a reference to it, managed + * through the item operations.  The item is unpinned once it has been + * executed. + * + * An item may hog the thread that is running it for a relatively large amount + * of time, sufficient, for example, to perform several lookup, mkdir, create + * and setxattr operations.  It may sleep on I/O and may sleep to obtain locks. + * + * Conversely, if a number of items are awaiting processing, it may take some + * time before any given item is given attention.  The number of threads in the + * pool may be increased to deal with demand, but only up to a limit. + * + * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in + * the very slow queue, from which only a portion of the threads will be + * allowed to pick items to execute.  This ensures that very slow items won't + * overly block ones that are just ordinarily slow. + * + * Returns 0 if successful, -EAGAIN if not. 
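The kerneldoc above promises that enqueueing an item already under execution will not re-enter it; the implementation that follows does this with three flag bits: PENDING merges duplicate enqueues, EXECUTING marks the running item, and ENQ_DEFERRED asks the executing thread to requeue the item when it finishes. A single-threaded model of that protocol (the real code uses atomic bitops and the queue spinlock):

#include <stdio.h>
#include <stdbool.h>

struct item {
	bool pending, executing, enq_deferred, queued;
};

static void enqueue(struct item *w)
{
	if (w->pending)
		return;			/* already on its way: merge the request */
	w->pending = true;
	if (w->executing)
		w->enq_deferred = true;	/* executor will queue it on completion */
	else
		w->queued = true;
}

static void execute_begin(struct item *w)
{
	w->queued = false;
	w->pending = false;
	w->executing = true;
	/* ... ops->execute(work) runs here ... */
}

static void execute_finish(struct item *w)
{
	w->executing = false;
	if (w->pending && w->enq_deferred) {	/* an enqueue raced with us */
		w->enq_deferred = false;
		w->queued = true;		/* put it straight back on the queue */
	}
}

int main(void)
{
	struct item w = { false, false, false, false };

	enqueue(&w);		/* goes on the queue */
	execute_begin(&w);	/* a worker picks it up */
	enqueue(&w);		/* second request arrives mid-execution */
	execute_finish(&w);
	printf("requeued: %d\n", w.queued);	/* 1: the deferred enqueue completed */
	return 0;
}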
+ */ +int slow_work_enqueue(struct slow_work *work) +{ +	unsigned long flags; + +	BUG_ON(slow_work_user_count <= 0); +	BUG_ON(!work); +	BUG_ON(!work->ops); +	BUG_ON(!work->ops->get_ref); + +	/* when honouring an enqueue request, we only promise that we will run +	 * the work function in the future; we do not promise to run it once +	 * per enqueue request +	 * +	 * we use the PENDING bit to merge together repeat requests without +	 * having to disable IRQs and take the spinlock, whilst still +	 * maintaining our promise +	 */ +	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { +		spin_lock_irqsave(&slow_work_queue_lock, flags); + +		/* we promise that we will not attempt to execute the work +		 * function in more than one thread simultaneously +		 * +		 * this, however, leaves us with a problem if we're asked to +		 * enqueue the work whilst someone is executing the work +		 * function as simply queueing the work immediately means that +		 * another thread may try executing it whilst it is already +		 * under execution +		 * +		 * to deal with this, we set the ENQ_DEFERRED bit instead of +		 * enqueueing, and the thread currently executing the work +		 * function will enqueue the work item when the work function +		 * returns and it has cleared the EXECUTING bit +		 */ +		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { +			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); +		} else { +			if (work->ops->get_ref(work) < 0) +				goto cant_get_ref; +			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) +				list_add_tail(&work->link, &vslow_work_queue); +			else +				list_add_tail(&work->link, &slow_work_queue); +			wake_up(&slow_work_thread_wq); +		} + +		spin_unlock_irqrestore(&slow_work_queue_lock, flags); +	} +	return 0; + +cant_get_ref: +	spin_unlock_irqrestore(&slow_work_queue_lock, flags); +	return -EAGAIN; +} +EXPORT_SYMBOL(slow_work_enqueue); + +/* + * Worker thread culling algorithm + */ +static bool slow_work_cull_thread(void) +{ +	unsigned long flags; +	bool do_cull = false; + +	spin_lock_irqsave(&slow_work_queue_lock, flags); + +	if (slow_work_cull) { +		slow_work_cull = false; + +		if (list_empty(&slow_work_queue) && +		    list_empty(&vslow_work_queue) && +		    atomic_read(&slow_work_thread_count) > +		    slow_work_min_threads) { +			mod_timer(&slow_work_cull_timer, +				  jiffies + SLOW_WORK_CULL_TIMEOUT); +			do_cull = true; +		} +	} + +	spin_unlock_irqrestore(&slow_work_queue_lock, flags); +	return do_cull; +} + +/* + * Determine if there is slow work available for dispatch + */ +static inline bool slow_work_available(int vsmax) +{ +	return !list_empty(&slow_work_queue) || +		(!list_empty(&vslow_work_queue) && +		 atomic_read(&vslow_work_executing_count) < vsmax); +} + +/* + * Worker thread dispatcher + */ +static int slow_work_thread(void *_data) +{ +	int vsmax; + +	DEFINE_WAIT(wait); + +	set_freezable(); +	set_user_nice(current, -5); + +	for (;;) { +		vsmax = vslow_work_proportion; +		vsmax *= atomic_read(&slow_work_thread_count); +		vsmax /= 100; + +		prepare_to_wait(&slow_work_thread_wq, &wait, +				TASK_INTERRUPTIBLE); +		if (!freezing(current) && +		    !slow_work_threads_should_exit && +		    !slow_work_available(vsmax) && +		    !slow_work_cull) +			schedule(); +		finish_wait(&slow_work_thread_wq, &wait); + +		try_to_freeze(); + +		vsmax = vslow_work_proportion; +		vsmax *= atomic_read(&slow_work_thread_count); +		vsmax /= 100; + +		if (slow_work_available(vsmax) && slow_work_execute()) { +			cond_resched(); +			if (list_empty(&slow_work_queue) && +			    
list_empty(&vslow_work_queue) && +			    atomic_read(&slow_work_thread_count) > +			    slow_work_min_threads) +				mod_timer(&slow_work_cull_timer, +					  jiffies + SLOW_WORK_CULL_TIMEOUT); +			continue; +		} + +		if (slow_work_threads_should_exit) +			break; + +		if (slow_work_cull && slow_work_cull_thread()) +			break; +	} + +	if (atomic_dec_and_test(&slow_work_thread_count)) +		complete_and_exit(&slow_work_last_thread_exited, 0); +	return 0; +} + +/* + * Handle thread cull timer expiration + */ +static void slow_work_cull_timeout(unsigned long data) +{ +	slow_work_cull = true; +	wake_up(&slow_work_thread_wq); +} + +/* + * Get a reference on slow work thread starter + */ +static int slow_work_new_thread_get_ref(struct slow_work *work) +{ +	return 0; +} + +/* + * Drop a reference on slow work thread starter + */ +static void slow_work_new_thread_put_ref(struct slow_work *work) +{ +} + +/* + * Start a new slow work thread + */ +static void slow_work_new_thread_execute(struct slow_work *work) +{ +	struct task_struct *p; + +	if (slow_work_threads_should_exit) +		return; + +	if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) +		return; + +	if (!mutex_trylock(&slow_work_user_lock)) +		return; + +	slow_work_may_not_start_new_thread = true; +	atomic_inc(&slow_work_thread_count); +	p = kthread_run(slow_work_thread, NULL, "kslowd"); +	if (IS_ERR(p)) { +		printk(KERN_DEBUG "Slow work thread pool: OOM\n"); +		if (atomic_dec_and_test(&slow_work_thread_count)) +			BUG(); /* we're running on a slow work thread... */ +		mod_timer(&slow_work_oom_timer, +			  jiffies + SLOW_WORK_OOM_TIMEOUT); +	} else { +		/* ratelimit the starting of new threads */ +		mod_timer(&slow_work_oom_timer, jiffies + 1); +	} + +	mutex_unlock(&slow_work_user_lock); +} + +static const struct slow_work_ops slow_work_new_thread_ops = { +	.get_ref	= slow_work_new_thread_get_ref, +	.put_ref	= slow_work_new_thread_put_ref, +	.execute	= slow_work_new_thread_execute, +}; + +/* + * post-OOM new thread start suppression expiration + */ +static void slow_work_oom_timeout(unsigned long data) +{ +	slow_work_may_not_start_new_thread = false; +} + +#ifdef CONFIG_SYSCTL +/* + * Handle adjustment of the minimum number of threads + */ +static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, +					struct file *filp, void __user *buffer, +					size_t *lenp, loff_t *ppos) +{ +	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); +	int n; + +	if (ret == 0) { +		mutex_lock(&slow_work_user_lock); +		if (slow_work_user_count > 0) { +			/* see if we need to start or stop threads */ +			n = atomic_read(&slow_work_thread_count) - +				slow_work_min_threads; + +			if (n < 0 && !slow_work_may_not_start_new_thread) +				slow_work_enqueue(&slow_work_new_thread); +			else if (n > 0) +				mod_timer(&slow_work_cull_timer, +					  jiffies + SLOW_WORK_CULL_TIMEOUT); +		} +		mutex_unlock(&slow_work_user_lock); +	} + +	return ret; +} + +/* + * Handle adjustment of the maximum number of threads + */ +static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, +					struct file *filp, void __user *buffer, +					size_t *lenp, loff_t *ppos) +{ +	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); +	int n; + +	if (ret == 0) { +		mutex_lock(&slow_work_user_lock); +		if (slow_work_user_count > 0) { +			/* see if we need to stop threads */ +			n = slow_work_max_threads - +				atomic_read(&slow_work_thread_count); + +			if (n < 0) +				mod_timer(&slow_work_cull_timer, +					  jiffies + 
SLOW_WORK_CULL_TIMEOUT); +		} +		mutex_unlock(&slow_work_user_lock); +	} + +	return ret; +} +#endif /* CONFIG_SYSCTL */ + +/** + * slow_work_register_user - Register a user of the facility + * + * Register a user of the facility, starting up the initial threads if there + * aren't any other users at this point.  This will return 0 if successful, or + * an error if not. + */ +int slow_work_register_user(void) +{ +	struct task_struct *p; +	int loop; + +	mutex_lock(&slow_work_user_lock); + +	if (slow_work_user_count == 0) { +		printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); +		init_completion(&slow_work_last_thread_exited); + +		slow_work_threads_should_exit = false; +		slow_work_init(&slow_work_new_thread, +			       &slow_work_new_thread_ops); +		slow_work_may_not_start_new_thread = false; +		slow_work_cull = false; + +		/* start the minimum number of threads */ +		for (loop = 0; loop < slow_work_min_threads; loop++) { +			atomic_inc(&slow_work_thread_count); +			p = kthread_run(slow_work_thread, NULL, "kslowd"); +			if (IS_ERR(p)) +				goto error; +		} +		printk(KERN_NOTICE "Slow work thread pool: Ready\n"); +	} + +	slow_work_user_count++; +	mutex_unlock(&slow_work_user_lock); +	return 0; + +error: +	if (atomic_dec_and_test(&slow_work_thread_count)) +		complete(&slow_work_last_thread_exited); +	if (loop > 0) { +		printk(KERN_ERR "Slow work thread pool:" +		       " Aborting startup on ENOMEM\n"); +		slow_work_threads_should_exit = true; +		wake_up_all(&slow_work_thread_wq); +		wait_for_completion(&slow_work_last_thread_exited); +		printk(KERN_ERR "Slow work thread pool: Aborted\n"); +	} +	mutex_unlock(&slow_work_user_lock); +	return PTR_ERR(p); +} +EXPORT_SYMBOL(slow_work_register_user); + +/** + * slow_work_unregister_user - Unregister a user of the facility + * + * Unregister a user of the facility, killing all the threads if this was the + * last one. 
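slow_work_register_user()/slow_work_unregister_user() above refcount users of the facility: the first registration starts the minimum number of kslowd threads and the last unregistration asks them all to exit and waits for the final one. A trivial model of that lifetime rule, with thread start/stop reduced to a counter:

#include <stdio.h>

#define MIN_THREADS 2

/* refcounted lifetime of the pool: the first user spins up the minimum
 * number of threads, the last one tears everything down again */
static int users;
static int threads;

static void register_user(void)
{
	if (users++ == 0)
		threads = MIN_THREADS;	/* start the initial workers */
}

static void unregister_user(void)
{
	if (--users == 0)
		threads = 0;		/* ask all workers to exit and wait */
}

int main(void)
{
	register_user();
	register_user();
	unregister_user();
	printf("threads=%d\n", threads);	/* 2: one user still active */
	unregister_user();
	printf("threads=%d\n", threads);	/* 0: pool shut down */
	return 0;
}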
+ */ +void slow_work_unregister_user(void) +{ +	mutex_lock(&slow_work_user_lock); + +	BUG_ON(slow_work_user_count <= 0); + +	slow_work_user_count--; +	if (slow_work_user_count == 0) { +		printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); +		slow_work_threads_should_exit = true; +		wake_up_all(&slow_work_thread_wq); +		wait_for_completion(&slow_work_last_thread_exited); +		printk(KERN_NOTICE "Slow work thread pool:" +		       " Shut down complete\n"); +	} + +	del_timer_sync(&slow_work_cull_timer); + +	mutex_unlock(&slow_work_user_lock); +} +EXPORT_SYMBOL(slow_work_unregister_user); + +/* + * Initialise the slow work facility + */ +static int __init init_slow_work(void) +{ +	unsigned nr_cpus = num_possible_cpus(); + +	if (slow_work_max_threads < nr_cpus) +		slow_work_max_threads = nr_cpus; +#ifdef CONFIG_SYSCTL +	if (slow_work_max_max_threads < nr_cpus * 2) +		slow_work_max_max_threads = nr_cpus * 2; +#endif +	return 0; +} + +subsys_initcall(init_slow_work); diff --git a/kernel/smp.c b/kernel/smp.c index bbedbb7efe3..858baac568e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -2,40 +2,82 @@   * Generic helpers for smp ipi calls   *   * (C) Jens Axboe <jens.axboe@oracle.com> 2008 - *   */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/percpu.h>  #include <linux/rcupdate.h>  #include <linux/rculist.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/init.h>  #include <linux/smp.h> +#include <linux/cpu.h>  static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); -static LIST_HEAD(call_function_queue); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); + +static struct { +	struct list_head	queue; +	spinlock_t		lock; +} call_function __cacheline_aligned_in_smp = +	{ +		.queue		= LIST_HEAD_INIT(call_function.queue), +		.lock		= __SPIN_LOCK_UNLOCKED(call_function.lock), +	};  enum { -	CSD_FLAG_WAIT		= 0x01, -	CSD_FLAG_ALLOC		= 0x02, -	CSD_FLAG_LOCK		= 0x04, +	CSD_FLAG_LOCK		= 0x01,  };  struct call_function_data { -	struct call_single_data csd; -	spinlock_t lock; -	unsigned int refs; -	struct rcu_head rcu_head; -	unsigned long cpumask_bits[]; +	struct call_single_data	csd; +	spinlock_t		lock; +	unsigned int		refs; +	cpumask_var_t		cpumask;  };  struct call_single_queue { -	struct list_head list; -	spinlock_t lock; +	struct list_head	list; +	spinlock_t		lock; +}; + +static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { +	.lock			= __SPIN_LOCK_UNLOCKED(cfd_data.lock), +}; + +static int +hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ +	long cpu = (long)hcpu; +	struct call_function_data *cfd = &per_cpu(cfd_data, cpu); + +	switch (action) { +	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN: +		if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, +				cpu_to_node(cpu))) +			return NOTIFY_BAD; +		break; + +#ifdef CONFIG_CPU_HOTPLUG +	case CPU_UP_CANCELED: +	case CPU_UP_CANCELED_FROZEN: + +	case CPU_DEAD: +	case CPU_DEAD_FROZEN: +		free_cpumask_var(cfd->cpumask); +		break; +#endif +	}; + +	return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { +	.notifier_call		= hotplug_cfd,  };  static int __cpuinit init_call_single_data(void)  { +	void *cpu = (void *)(long)smp_processor_id();  	int i;  	for_each_possible_cpu(i) { @@ -44,29 +86,63 @@ static int __cpuinit init_call_single_data(void)  		spin_lock_init(&q->lock);  		INIT_LIST_HEAD(&q->list);  	} + +	hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); +	
register_cpu_notifier(&hotplug_cfd_notifier); +  	return 0;  }  early_initcall(init_call_single_data); -static void csd_flag_wait(struct call_single_data *data) +/* + * csd_lock/csd_unlock used to serialize access to per-cpu csd resources + * + * For non-synchronous ipi calls the csd can still be in use by the + * previous function call. For multi-cpu calls its even more interesting + * as we'll have to ensure no other cpu is observing our csd. + */ +static void csd_lock_wait(struct call_single_data *data)  { -	/* Wait for response */ -	do { -		if (!(data->flags & CSD_FLAG_WAIT)) -			break; +	while (data->flags & CSD_FLAG_LOCK)  		cpu_relax(); -	} while (1); +} + +static void csd_lock(struct call_single_data *data) +{ +	csd_lock_wait(data); +	data->flags = CSD_FLAG_LOCK; + +	/* +	 * prevent CPU from reordering the above assignment +	 * to ->flags with any subsequent assignments to other +	 * fields of the specified call_single_data structure: +	 */ +	smp_mb(); +} + +static void csd_unlock(struct call_single_data *data) +{ +	WARN_ON(!(data->flags & CSD_FLAG_LOCK)); + +	/* +	 * ensure we're all done before releasing data: +	 */ +	smp_mb(); + +	data->flags &= ~CSD_FLAG_LOCK;  }  /* - * Insert a previously allocated call_single_data element for execution - * on the given CPU. data must already have ->func, ->info, and ->flags set. + * Insert a previously allocated call_single_data element + * for execution on the given CPU. data must already have + * ->func, ->info, and ->flags set.   */ -static void generic_exec_single(int cpu, struct call_single_data *data) +static +void generic_exec_single(int cpu, struct call_single_data *data, int wait)  {  	struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); -	int wait = data->flags & CSD_FLAG_WAIT, ipi;  	unsigned long flags; +	int ipi;  	spin_lock_irqsave(&dst->lock, flags);  	ipi = list_empty(&dst->list); @@ -74,24 +150,21 @@ static void generic_exec_single(int cpu, struct call_single_data *data)  	spin_unlock_irqrestore(&dst->lock, flags);  	/* -	 * Make the list addition visible before sending the ipi. +	 * The list addition should be visible before sending the IPI +	 * handler locks the list to pull the entry off it because of +	 * normal cache coherency rules implied by spinlocks. +	 * +	 * If IPIs can go out of order to the cache coherency protocol +	 * in an architecture, sufficient synchronisation should be added +	 * to arch code to make it appear to obey cache coherency WRT +	 * locking and barrier primitives. Generic code isn't really +	 * equipped to do the right thing...  	 */ -	smp_mb(); -  	if (ipi)  		arch_send_call_function_single_ipi(cpu);  	if (wait) -		csd_flag_wait(data); -} - -static void rcu_free_call_data(struct rcu_head *head) -{ -	struct call_function_data *data; - -	data = container_of(head, struct call_function_data, rcu_head); - -	kfree(data); +		csd_lock_wait(data);  }  /* @@ -104,99 +177,83 @@ void generic_smp_call_function_interrupt(void)  	int cpu = get_cpu();  	/* -	 * It's ok to use list_for_each_rcu() here even though we may delete -	 * 'pos', since list_del_rcu() doesn't clear ->next +	 * Ensure entry is visible on call_function_queue after we have +	 * entered the IPI. See comment in smp_call_function_many. +	 * If we don't have this, then we may miss an entry on the list +	 * and never get another IPI to process it.  	 
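The csd_lock()/csd_unlock() pair above replaces the old WAIT/ALLOC/LOCK flags with a single lock bit: a sender must wait for the previous call using the same call_single_data to be finished by the IPI handler before reusing it. A simplified single-threaded model; the real code pairs the bit operations with memory barriers and the spinning happens across CPUs:

#include <stdio.h>
#include <stdbool.h>

/* model of the single CSD_FLAG_LOCK bit: the sender may not reuse a
 * call_single_data slot until the previous IPI handler has unlocked it */
struct csd {
	bool locked;
	void (*func)(void *);
	void *info;
};

static void csd_lock(struct csd *d)
{
	while (d->locked)
		;			/* csd_lock_wait(): spin until the last call completed */
	d->locked = true;		/* (a memory barrier follows in the kernel) */
}

static void ipi_handler(struct csd *d)
{
	d->func(d->info);
	d->locked = false;		/* csd_unlock(): slot may be reused */
}

static void say(void *msg) { printf("%s\n", (const char *)msg); }

int main(void)
{
	struct csd d = { false, NULL, NULL };

	csd_lock(&d);
	d.func = say;
	d.info = "remote call";
	ipi_handler(&d);		/* normally runs on the target cpu */
	return 0;
}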
*/ -	rcu_read_lock(); -	list_for_each_entry_rcu(data, &call_function_queue, csd.list) { +	smp_mb(); + +	/* +	 * It's ok to use list_for_each_rcu() here even though we may +	 * delete 'pos', since list_del_rcu() doesn't clear ->next +	 */ +	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {  		int refs; -		if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) +		spin_lock(&data->lock); +		if (!cpumask_test_cpu(cpu, data->cpumask)) { +			spin_unlock(&data->lock);  			continue; +		} +		cpumask_clear_cpu(cpu, data->cpumask); +		spin_unlock(&data->lock);  		data->csd.func(data->csd.info);  		spin_lock(&data->lock); -		cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));  		WARN_ON(data->refs == 0); -		data->refs--; -		refs = data->refs; +		refs = --data->refs; +		if (!refs) { +			spin_lock(&call_function.lock); +			list_del_rcu(&data->csd.list); +			spin_unlock(&call_function.lock); +		}  		spin_unlock(&data->lock);  		if (refs)  			continue; -		spin_lock(&call_function_lock); -		list_del_rcu(&data->csd.list); -		spin_unlock(&call_function_lock); - -		if (data->csd.flags & CSD_FLAG_WAIT) { -			/* -			 * serialize stores to data with the flag clear -			 * and wakeup -			 */ -			smp_wmb(); -			data->csd.flags &= ~CSD_FLAG_WAIT; -		} -		if (data->csd.flags & CSD_FLAG_ALLOC) -			call_rcu(&data->rcu_head, rcu_free_call_data); +		csd_unlock(&data->csd);  	} -	rcu_read_unlock();  	put_cpu();  }  /* - * Invoked by arch to handle an IPI for call function single. Must be called - * from the arch with interrupts disabled. + * Invoked by arch to handle an IPI for call function single. Must be + * called from the arch with interrupts disabled.   */  void generic_smp_call_function_single_interrupt(void)  {  	struct call_single_queue *q = &__get_cpu_var(call_single_queue); +	unsigned int data_flags;  	LIST_HEAD(list); -	/* -	 * Need to see other stores to list head for checking whether -	 * list is empty without holding q->lock -	 */ -	smp_read_barrier_depends(); -	while (!list_empty(&q->list)) { -		unsigned int data_flags; +	spin_lock(&q->lock); +	list_replace_init(&q->list, &list); +	spin_unlock(&q->lock); -		spin_lock(&q->lock); -		list_replace_init(&q->list, &list); -		spin_unlock(&q->lock); - -		while (!list_empty(&list)) { -			struct call_single_data *data; +	while (!list_empty(&list)) { +		struct call_single_data *data; -			data = list_entry(list.next, struct call_single_data, -						list); -			list_del(&data->list); +		data = list_entry(list.next, struct call_single_data, list); +		list_del(&data->list); -			/* -			 * 'data' can be invalid after this call if -			 * flags == 0 (when called through -			 * generic_exec_single(), so save them away before -			 * making the call. 
-			 */ -			data_flags = data->flags; +		/* +		 * 'data' can be invalid after this call if flags == 0 +		 * (when called through generic_exec_single()), +		 * so save them away before making the call: +		 */ +		data_flags = data->flags; -			data->func(data->info); +		data->func(data->info); -			if (data_flags & CSD_FLAG_WAIT) { -				smp_wmb(); -				data->flags &= ~CSD_FLAG_WAIT; -			} else if (data_flags & CSD_FLAG_LOCK) { -				smp_wmb(); -				data->flags &= ~CSD_FLAG_LOCK; -			} else if (data_flags & CSD_FLAG_ALLOC) -				kfree(data); -		}  		/* -		 * See comment on outer loop +		 * Unlocked CSDs are valid through generic_exec_single():  		 */ -		smp_read_barrier_depends(); +		if (data_flags & CSD_FLAG_LOCK) +			csd_unlock(data);  	}  } @@ -215,65 +272,45 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);  int smp_call_function_single(int cpu, void (*func) (void *info), void *info,  			     int wait)  { -	struct call_single_data d; +	struct call_single_data d = { +		.flags = 0, +	};  	unsigned long flags; -	/* prevent preemption and reschedule on another processor, -	   as well as CPU removal */ -	int me = get_cpu(); +	int this_cpu;  	int err = 0; +	/* +	 * prevent preemption and reschedule on another processor, +	 * as well as CPU removal +	 */ +	this_cpu = get_cpu(); +  	/* Can deadlock when called with interrupts disabled */ -	WARN_ON(irqs_disabled()); +	WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); -	if (cpu == me) { +	if (cpu == this_cpu) {  		local_irq_save(flags);  		func(info);  		local_irq_restore(flags); -	} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { -		struct call_single_data *data; +	} else { +		if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { +			struct call_single_data *data = &d; + +			if (!wait) +				data = &__get_cpu_var(csd_data); -		if (!wait) { -			/* -			 * We are calling a function on a single CPU -			 * and we are not going to wait for it to finish. -			 * We first try to allocate the data, but if we -			 * fail, we fall back to use a per cpu data to pass -			 * the information to that CPU. Since all callers -			 * of this code will use the same data, we must -			 * synchronize the callers to prevent a new caller -			 * from corrupting the data before the callee -			 * can access it. -			 * -			 * The CSD_FLAG_LOCK is used to let us know when -			 * the IPI handler is done with the data. -			 * The first caller will set it, and the callee -			 * will clear it. The next caller must wait for -			 * it to clear before we set it again. This -			 * will make sure the callee is done with the -			 * data before a new caller will use it. -			 */ -			data = kmalloc(sizeof(*data), GFP_ATOMIC); -			if (data) -				data->flags = CSD_FLAG_ALLOC; -			else { -				data = &per_cpu(csd_data, me); -				while (data->flags & CSD_FLAG_LOCK) -					cpu_relax(); -				data->flags = CSD_FLAG_LOCK; -			} +			csd_lock(data); + +			data->func = func; +			data->info = info; +			generic_exec_single(cpu, data, wait);  		} else { -			data = &d; -			data->flags = CSD_FLAG_WAIT; +			err = -ENXIO;	/* CPU not online */  		} - -		data->func = func; -		data->info = info; -		generic_exec_single(cpu, data); -	} else { -		err = -ENXIO;	/* CPU not online */  	}  	put_cpu(); +  	return err;  }  EXPORT_SYMBOL(smp_call_function_single); @@ -283,23 +320,26 @@ EXPORT_SYMBOL(smp_call_function_single);   * @cpu: The CPU to run on.   * @data: Pre-allocated and setup data structure   * - * Like smp_call_function_single(), but allow caller to pass in a pre-allocated - * data structure. 
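With the rewrite above, smp_call_function_single() uses the caller's on-stack csd when waiting and the per-cpu csd_data slot otherwise, and csd_lock() throttles back-to-back callers instead of the old GFP_ATOMIC allocation. A minimal usage sketch; report_cpu() and poke_cpu() are invented for illustration:

#include <linux/smp.h>
#include <linux/kernel.h>

/* Runs in interrupt context on the target CPU: must not sleep. */
static void report_cpu(void *info)
{
	pr_info("running on CPU %d, cookie %p\n", smp_processor_id(), info);
}

static int poke_cpu(int cpu)
{
	/*
	 * wait=1: returns only once report_cpu() has finished remotely;
	 * -ENXIO if the CPU is not online.
	 */
	return smp_call_function_single(cpu, report_cpu, NULL, 1);
}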
Useful for embedding @data inside other structures, for - * instance. - * + * Like smp_call_function_single(), but allow caller to pass in a + * pre-allocated data structure. Useful for embedding @data inside + * other structures, for instance.   */ -void __smp_call_function_single(int cpu, struct call_single_data *data) +void __smp_call_function_single(int cpu, struct call_single_data *data, +				int wait)  { +	csd_lock(data); +  	/* Can deadlock when called with interrupts disabled */ -	WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); +	WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); -	generic_exec_single(cpu, data); +	generic_exec_single(cpu, data, wait);  } -/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ +/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ +  #ifndef arch_send_call_function_ipi_mask -#define arch_send_call_function_ipi_mask(maskp) \ -	arch_send_call_function_ipi(*(maskp)) +# define arch_send_call_function_ipi_mask(maskp) \ +	 arch_send_call_function_ipi(*(maskp))  #endif  /** @@ -307,7 +347,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)   * @mask: The set of cpus to run on (only runs on online subset).   * @func: The function to run. This must be fast and non-blocking.   * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. + * @wait: If true, wait (atomically) until function has completed + *        on other CPUs.   *   * If @wait is true, then returns once @func has returned. Note that @wait   * will be implicitly turned on in case of allocation failures, since @@ -318,27 +359,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)   * must be disabled when calling this function.   */  void smp_call_function_many(const struct cpumask *mask, -			    void (*func)(void *), void *info, -			    bool wait) +			    void (*func)(void *), void *info, bool wait)  {  	struct call_function_data *data;  	unsigned long flags; -	int cpu, next_cpu; +	int cpu, next_cpu, this_cpu = smp_processor_id();  	/* Can deadlock when called with interrupts disabled */ -	WARN_ON(irqs_disabled()); +	WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); -	/* So, what's a CPU they want?  Ignoring this one. */ +	/* So, what's a CPU they want? Ignoring this one. */  	cpu = cpumask_first_and(mask, cpu_online_mask); -	if (cpu == smp_processor_id()) +	if (cpu == this_cpu)  		cpu = cpumask_next_and(cpu, mask, cpu_online_mask); +  	/* No online cpus?  We're done. */  	if (cpu >= nr_cpu_ids)  		return;  	/* Do we have another CPU which isn't us? */  	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); -	if (next_cpu == smp_processor_id()) +	if (next_cpu == this_cpu)  		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);  	/* Fastpath: do that cpu by itself. */ @@ -347,43 +388,40 @@ void smp_call_function_many(const struct cpumask *mask,  		return;  	} -	data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); -	if (unlikely(!data)) { -		/* Slow path. 
*/ -		for_each_online_cpu(cpu) { -			if (cpu == smp_processor_id()) -				continue; -			if (cpumask_test_cpu(cpu, mask)) -				smp_call_function_single(cpu, func, info, wait); -		} -		return; -	} +	data = &__get_cpu_var(cfd_data); +	csd_lock(&data->csd); -	spin_lock_init(&data->lock); -	data->csd.flags = CSD_FLAG_ALLOC; -	if (wait) -		data->csd.flags |= CSD_FLAG_WAIT; +	spin_lock_irqsave(&data->lock, flags);  	data->csd.func = func;  	data->csd.info = info; -	cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); -	cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); -	data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); +	cpumask_and(data->cpumask, mask, cpu_online_mask); +	cpumask_clear_cpu(this_cpu, data->cpumask); +	data->refs = cpumask_weight(data->cpumask); -	spin_lock_irqsave(&call_function_lock, flags); -	list_add_tail_rcu(&data->csd.list, &call_function_queue); -	spin_unlock_irqrestore(&call_function_lock, flags); +	spin_lock(&call_function.lock); +	/* +	 * Place entry at the _HEAD_ of the list, so that any cpu still +	 * observing the entry in generic_smp_call_function_interrupt() +	 * will not miss any other list entries: +	 */ +	list_add_rcu(&data->csd.list, &call_function.queue); +	spin_unlock(&call_function.lock); + +	spin_unlock_irqrestore(&data->lock, flags);  	/*  	 * Make the list addition visible before sending the ipi. +	 * (IPIs must obey or appear to obey normal Linux cache +	 * coherency rules -- see comment in generic_exec_single).  	 */  	smp_mb();  	/* Send a message to all CPUs in the map */ -	arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); +	arch_send_call_function_ipi_mask(data->cpumask); -	/* optionally wait for the CPUs to complete */ +	/* Optionally wait for the CPUs to complete */  	if (wait) -		csd_flag_wait(&data->csd); +		csd_lock_wait(&data->csd);  }  EXPORT_SYMBOL(smp_call_function_many); @@ -391,7 +429,8 @@ EXPORT_SYMBOL(smp_call_function_many);   * smp_call_function(): Run a function on all other CPUs.   * @func: The function to run. This must be fast and non-blocking.   * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. + * @wait: If true, wait (atomically) until function has completed + *        on other CPUs.   *   * Returns 0.   
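smp_call_function_many() now reuses the per-cpu cfd_data entry and adds it at the head of call_function.queue, so a CPU already walking the list cannot miss entries queued behind it. Preemption must be disabled across the call, which the sketch below mirrors; the callback and wrapper are invented:

#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/preempt.h>
#include <linux/kernel.h>

static void drain_local_cache(void *unused)
{
	/* per-CPU work; runs in interrupt context, must not block */
}

static void drain_other_cpus(const struct cpumask *mask)
{
	/* preemption off so "the current CPU" cannot change underneath */
	preempt_disable();
	smp_call_function_many(mask, drain_local_cache, NULL, true);
	preempt_enable();
}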
* @@ -407,26 +446,27 @@ int smp_call_function(void (*func)(void *), void *info, int wait)  	preempt_disable();  	smp_call_function_many(cpu_online_mask, func, info, wait);  	preempt_enable(); +  	return 0;  }  EXPORT_SYMBOL(smp_call_function);  void ipi_call_lock(void)  { -	spin_lock(&call_function_lock); +	spin_lock(&call_function.lock);  }  void ipi_call_unlock(void)  { -	spin_unlock(&call_function_lock); +	spin_unlock(&call_function.lock);  }  void ipi_call_lock_irq(void)  { -	spin_lock_irq(&call_function_lock); +	spin_lock_irq(&call_function.lock);  }  void ipi_call_unlock_irq(void)  { -	spin_unlock_irq(&call_function_lock); +	spin_unlock_irq(&call_function.lock);  } diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8..ea23ec087ee 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -180,7 +180,7 @@ asmlinkage void __do_softirq(void)  	account_system_vtime(current);  	__local_bh_disable((unsigned long)__builtin_return_address(0)); -	trace_softirq_enter(); +	lockdep_softirq_enter();  	cpu = smp_processor_id();  restart: @@ -220,7 +220,7 @@ restart:  	if (pending)  		wakeup_softirqd(); -	trace_softirq_exit(); +	lockdep_softirq_exit();  	account_system_vtime(current);  	_local_bh_enable(); @@ -496,7 +496,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir  		cp->flags = 0;  		cp->priv = softirq; -		__smp_call_function_single(cpu, cp); +		__smp_call_function_single(cpu, cp, 0);  		return 0;  	}  	return 1; @@ -626,6 +626,7 @@ static int ksoftirqd(void * __bind_cpu)  			preempt_enable_no_resched();  			cond_resched();  			preempt_disable(); +			rcu_qsctr_inc((long)__bind_cpu);  		}  		preempt_enable();  		set_current_state(TASK_INTERRUPTIBLE); @@ -795,6 +796,11 @@ int __init __weak early_irq_init(void)  	return 0;  } +int __init __weak arch_probe_nr_irqs(void) +{ +	return 0; +} +  int __init __weak arch_early_irq_init(void)  {  	return 0; diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 29ab20749dd..7932653c4eb 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)  	local_irq_save(flags);  	preempt_disable();  	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); -	LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); +	LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock, +			     _raw_read_lock_flags, &flags);  	return flags;  }  EXPORT_SYMBOL(_read_lock_irqsave); @@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)  	local_irq_save(flags);  	preempt_disable();  	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); -	LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); +	LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock, +			     _raw_write_lock_flags, &flags);  	return flags;  }  EXPORT_SYMBOL(_write_lock_irqsave); @@ -299,16 +301,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas  	local_irq_save(flags);  	preempt_disable();  	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); -	/* -	 * On lockdep we dont want the hand-coded irq-enable of -	 * _raw_spin_lock_flags() code, because lockdep assumes -	 * that interrupts are not re-enabled during lock-acquire: -	 */ -#ifdef CONFIG_LOCKDEP -	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -#else -	_raw_spin_lock_flags(lock, &flags); -#endif +	LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, +				_raw_spin_lock_flags, &flags);  	return flags;  }  EXPORT_SYMBOL(_spin_lock_irqsave_nested); diff 
--git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0cd415ee62a..912823e2a11 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -44,7 +44,7 @@ static DEFINE_MUTEX(setup_lock);  static int refcount;  static struct workqueue_struct *stop_machine_wq;  static struct stop_machine_data active, idle; -static const cpumask_t *active_cpus; +static const struct cpumask *active_cpus;  static void *stop_machine_work;  static void set_state(enum stopmachine_state newstate) @@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)  	 * doesn't hit this CPU until we're ready. */  	get_cpu();  	for_each_online_cpu(i) { -		sm_work = percpu_ptr(stop_machine_work, i); +		sm_work = per_cpu_ptr(stop_machine_work, i);  		INIT_WORK(sm_work, stop_cpu);  		queue_work_on(i, stop_machine_wq, sm_work);  	} diff --git a/kernel/sys.c b/kernel/sys.c index f145c415bc1..51dbb55604e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -34,6 +34,7 @@  #include <linux/seccomp.h>  #include <linux/cpu.h>  #include <linux/ptrace.h> +#include <linux/fs_struct.h>  #include <linux/compat.h>  #include <linux/syscalls.h> @@ -559,7 +560,7 @@ error:  	abort_creds(new);  	return retval;  } -   +  /*   * change the user struct in a credentials set to match the new UID   */ @@ -571,6 +572,11 @@ static int set_user(struct cred *new)  	if (!new_user)  		return -EAGAIN; +	if (!task_can_switch_user(new_user, current)) { +		free_uid(new_user); +		return -EINVAL; +	} +  	if (atomic_read(&new_user->processes) >=  				current->signal->rlim[RLIMIT_NPROC].rlim_cur &&  			new_user != INIT_USER) { @@ -631,10 +637,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)  			goto error;  	} -	retval = -EAGAIN; -	if (new->uid != old->uid && set_user(new) < 0) -		goto error; - +	if (new->uid != old->uid) { +		retval = set_user(new); +		if (retval < 0) +			goto error; +	}  	if (ruid != (uid_t) -1 ||  	    (euid != (uid_t) -1 && euid != old->uid))  		new->suid = new->euid; @@ -680,9 +687,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)  	retval = -EPERM;  	if (capable(CAP_SETUID)) {  		new->suid = new->uid = uid; -		if (uid != old->uid && set_user(new) < 0) { -			retval = -EAGAIN; -			goto error; +		if (uid != old->uid) { +			retval = set_user(new); +			if (retval < 0) +				goto error;  		}  	} else if (uid != old->uid && uid != new->suid) {  		goto error; @@ -734,11 +742,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)  			goto error;  	} -	retval = -EAGAIN;  	if (ruid != (uid_t) -1) {  		new->uid = ruid; -		if (ruid != old->uid && set_user(new) < 0) -			goto error; +		if (ruid != old->uid) { +			retval = set_user(new); +			if (retval < 0) +				goto error; +		}  	}  	if (euid != (uid_t) -1)  		new->euid = euid; @@ -1004,10 +1014,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)  	if (err)  		goto out; -	if (task_pgrp(p) != pgrp) { +	if (task_pgrp(p) != pgrp)  		change_pid(p, PIDTYPE_PGID, pgrp); -		set_task_pgrp(p, pid_nr(pgrp)); -	}  	err = 0;  out: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c5ef44ff850..82350f8f04f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -48,6 +48,7 @@  #include <linux/acpi.h>  #include <linux/reboot.h>  #include <linux/ftrace.h> +#include <linux/slow-work.h>  #include <asm/uaccess.h>  #include <asm/processor.h> @@ -95,12 +96,9 @@ static int sixty = 60;  static int neg_one = -1;  #endif -#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING) -static int two = 2; -#endif -  static int zero;  static int one = 1; +static int two = 2;  
static unsigned long one_ul = 1;  static int one_hundred = 100; @@ -900,6 +898,14 @@ static struct ctl_table kern_table[] = {  		.proc_handler	= &scan_unevictable_handler,  	},  #endif +#ifdef CONFIG_SLOW_WORK +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "slow-work", +		.mode		= 0555, +		.child		= slow_work_sysctls, +	}, +#endif  /*   * NOTE: do not add new entries to this table unless you have read   * Documentation/sysctl/ctl_unnumbered.txt @@ -1010,7 +1016,7 @@ static struct ctl_table vm_table[] = {  		.data		= &dirty_expire_interval,  		.maxlen		= sizeof(dirty_expire_interval),  		.mode		= 0644, -		.proc_handler	= &proc_dointvec_userhz_jiffies, +		.proc_handler	= &proc_dointvec,  	},  	{  		.ctl_name	= VM_NR_PDFLUSH_THREADS, @@ -1373,10 +1379,7 @@ static struct ctl_table fs_table[] = {  		.data		= &lease_break_time,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= &proc_dointvec_minmax, -		.strategy	= &sysctl_intvec, -		.extra1		= &zero, -		.extra2		= &two, +		.proc_handler	= &proc_dointvec,  	},  #endif  #ifdef CONFIG_AIO @@ -1417,7 +1420,10 @@ static struct ctl_table fs_table[] = {  		.data		= &suid_dumpable,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= &proc_dointvec, +		.proc_handler	= &proc_dointvec_minmax, +		.strategy	= &sysctl_intvec, +		.extra1		= &zero, +		.extra2		= &two,  	},  #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)  	{ diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index fafeb48f27c..b38423ca711 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -219,6 +219,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {  	{ NET_IPV4_CONF_ARP_IGNORE,		"arp_ignore" },  	{ NET_IPV4_CONF_PROMOTE_SECONDARIES,	"promote_secondaries" },  	{ NET_IPV4_CONF_ARP_ACCEPT,		"arp_accept" }, +	{ NET_IPV4_CONF_ARP_NOTIFY,		"arp_notify" },  	{}  }; diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 905b0b50792..0b0a6366c9d 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o  obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)		+= clockevents.o  obj-$(CONFIG_GENERIC_CLOCKEVENTS)		+= tick-common.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index ea2f48af83c..d13be216a79 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -68,6 +68,17 @@ void clockevents_set_mode(struct clock_event_device *dev,  	if (dev->mode != mode) {  		dev->set_mode(mode, dev);  		dev->mode = mode; + +		/* +		 * A nsec2cyc multiplicator of 0 is invalid and we'd crash +		 * on it, so fix it up and emit a warning: +		 */ +		if (mode == CLOCK_EVT_MODE_ONESHOT) { +			if (unlikely(!dev->mult)) { +				dev->mult = 1; +				WARN_ON(1); +			} +		}  	}  } @@ -168,15 +179,6 @@ void clockevents_register_device(struct clock_event_device *dev)  	BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);  	BUG_ON(!dev->cpumask); -	/* -	 * A nsec2cyc multiplicator of 0 is invalid and we'd crash -	 * on it, so fix it up and emit a warning: -	 */ -	if (unlikely(!dev->mult)) { -		dev->mult = 1; -		WARN_ON(1); -	} -  	spin_lock(&clockevents_lock);  	list_add(&dev->list, &clockevent_devices); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ca89e1593f0..c46c931a7fe 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,6 +31,82 @@  #include <linux/sched.h> /* for spin_unlock_irq() using 
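The sysctl hunk above switches suid_dumpable to proc_dointvec_minmax so writes are bounded to 0..2 through extra1/extra2, drops the min/max handling from lease-break-time, and adds a slow-work directory to the kernel sysctl table. A sketch of the same bounded-integer pattern for a hypothetical knob (every name below is invented; the table would still need register_sysctl_table() to go live):

#include <linux/sysctl.h>

static int example_level;		/* hypothetical tunable */
static int example_min;			/* 0 */
static int example_max = 2;

static struct ctl_table example_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "example-level",
		.data		= &example_level,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_minmax,
		.strategy	= &sysctl_intvec,
		/* writes outside example_min..example_max are not applied */
		.extra1		= &example_min,
		.extra2		= &example_max,
	},
	{}
};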
preempt_count() m68k */  #include <linux/tick.h> +void timecounter_init(struct timecounter *tc, +		      const struct cyclecounter *cc, +		      u64 start_tstamp) +{ +	tc->cc = cc; +	tc->cycle_last = cc->read(cc); +	tc->nsec = start_tstamp; +} +EXPORT_SYMBOL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc:         Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. + */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ +	cycle_t cycle_now, cycle_delta; +	u64 ns_offset; + +	/* read cycle counter: */ +	cycle_now = tc->cc->read(tc->cc); + +	/* calculate the delta since the last timecounter_read_delta(): */ +	cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + +	/* convert to nanoseconds: */ +	ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); + +	/* update time stamp of timecounter_read_delta() call: */ +	tc->cycle_last = cycle_now; + +	return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ +	u64 nsec; + +	/* increment time by nanoseconds since last call */ +	nsec = timecounter_read_delta(tc); +	nsec += tc->nsec; +	tc->nsec = nsec; + +	return nsec; +} +EXPORT_SYMBOL(timecounter_read); + +u64 timecounter_cyc2time(struct timecounter *tc, +			 cycle_t cycle_tstamp) +{ +	u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; +	u64 nsec; + +	/* +	 * Instead of always treating cycle_tstamp as more recent +	 * than tc->cycle_last, detect when it is too far in the +	 * future and treat it as old time stamp instead. +	 */ +	if (cycle_delta > tc->cc->mask / 2) { +		cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; +		nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); +	} else { +		nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; +	} + +	return nsec; +} +EXPORT_SYMBOL(timecounter_cyc2time); +  /* XXX - Would like a better way for initializing curr_clocksource */  extern struct clocksource clocksource_jiffies; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f5f793d9241..7fc64375ff4 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -1,71 +1,129 @@  /* - * linux/kernel/time/ntp.c - *   * NTP state machine interfaces and logic.   *   * This code was mainly moved from kernel/timer.c and kernel/time.c   * Please see those files for relevant copyright info and historical   * changelogs.   
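The new timecounter layer above turns a free-running, possibly narrow hardware counter into a monotonically growing u64 nanosecond clock, coping with wrap-around as long as timecounter_read() runs at least once per wrap period. A usage sketch for an imaginary 32-bit, 125 MHz counter; the cyclecounter fields (read/mask/mult/shift) are assumed from the clocksource.h side of this series, which is not part of this hunk:

#include <linux/clocksource.h>

/* Stand-in for a device register read: a free-running 32-bit counter. */
static u32 example_hw_counter;

static cycle_t example_read(const struct cyclecounter *cc)
{
	return example_hw_counter;
}

/* mult/shift chosen so ns = (cycles * mult) >> shift = cycles * 8,
 * i.e. an 8 ns period (125 MHz clock). */
static struct cyclecounter example_cc = {
	.read	= example_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.mult	= 8 << 10,
	.shift	= 10,
};

static struct timecounter example_tc;

static void example_start(u64 boot_time_ns)
{
	timecounter_init(&example_tc, &example_cc, boot_time_ns);
}

static u64 example_now_ns(void)
{
	/* must be called at least once per counter wrap (~34 s at 125 MHz) */
	return timecounter_read(&example_tc);
}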
*/ - -#include <linux/mm.h> -#include <linux/time.h> -#include <linux/timex.h> -#include <linux/jiffies.h> -#include <linux/hrtimer.h>  #include <linux/capability.h> -#include <linux/math64.h>  #include <linux/clocksource.h>  #include <linux/workqueue.h> -#include <asm/timex.h> +#include <linux/hrtimer.h> +#include <linux/jiffies.h> +#include <linux/math64.h> +#include <linux/timex.h> +#include <linux/time.h> +#include <linux/mm.h>  /* - * Timekeeping variables + * NTP timekeeping variables:   */ -unsigned long tick_usec = TICK_USEC; 		/* USER_HZ period (usec) */ -unsigned long tick_nsec;			/* ACTHZ period (nsec) */ -u64 tick_length; -static u64 tick_length_base; -static struct hrtimer leap_timer; +/* USER_HZ period (usecs): */ +unsigned long			tick_usec = TICK_USEC; -#define MAX_TICKADJ		500		/* microsecs */ -#define MAX_TICKADJ_SCALED	(((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ -				  NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) +/* ACTHZ period (nsecs): */ +unsigned long			tick_nsec; + +u64				tick_length; +static u64			tick_length_base; + +static struct hrtimer		leap_timer; + +#define MAX_TICKADJ		500LL		/* usecs */ +#define MAX_TICKADJ_SCALED \ +	(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)  /*   * phase-lock loop variables   */ -/* TIME_ERROR prevents overwriting the CMOS clock */ -static int time_state = TIME_OK;	/* clock synchronization status	*/ -int time_status = STA_UNSYNC;		/* clock status bits		*/ -static long time_tai;			/* TAI offset (s)		*/ -static s64 time_offset;			/* time adjustment (ns)		*/ -static long time_constant = 2;		/* pll time constant		*/ -long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/ -long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/ -static s64 time_freq;			/* frequency offset (scaled ns/s)*/ -static long time_reftime;		/* time at last adjustment (s)	*/ -long time_adjust; -static long ntp_tick_adj; +/* + * clock synchronization status + * + * (TIME_ERROR prevents overwriting the CMOS clock) + */ +static int			time_state = TIME_OK; + +/* clock status bits:							*/ +int				time_status = STA_UNSYNC; + +/* TAI offset (secs):							*/ +static long			time_tai; + +/* time adjustment (nsecs):						*/ +static s64			time_offset; + +/* pll time constant:							*/ +static long			time_constant = 2; + +/* maximum error (usecs):						*/ +long				time_maxerror = NTP_PHASE_LIMIT; + +/* estimated error (usecs):						*/ +long				time_esterror = NTP_PHASE_LIMIT; + +/* frequency offset (scaled nsecs/secs):				*/ +static s64			time_freq; + +/* time at last adjustment (secs):					*/ +static long			time_reftime; + +long				time_adjust; + +/* constant (boot-param configurable) NTP tick adjustment (upscaled)	*/ +static s64			ntp_tick_adj; + +/* + * NTP methods: + */ + +/* + * Update (tick_length, tick_length_base, tick_nsec), based + * on (tick_usec, ntp_tick_adj, time_freq): + */  static void ntp_update_frequency(void)  { -	u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) -				<< NTP_SCALE_SHIFT; -	second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; -	second_length += time_freq; +	u64 second_length; +	u64 new_base; + +	second_length		 = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) +						<< NTP_SCALE_SHIFT; + +	second_length		+= ntp_tick_adj; +	second_length		+= time_freq; + +	tick_nsec		 = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; +	new_base		 = div_u64(second_length, NTP_INTERVAL_FREQ); + +	/* +	 * Don't wait for the next second_overflow, apply +	 * the change to the tick length immediately: +	 */ +	
tick_length		+= new_base - tick_length_base; +	tick_length_base	 = new_base; +} + +static inline s64 ntp_update_offset_fll(s64 offset64, long secs) +{ +	time_status &= ~STA_MODE; + +	if (secs < MINSEC) +		return 0; -	tick_length_base = second_length; +	if (!(time_status & STA_FLL) && (secs <= MAXSEC)) +		return 0; -	tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; -	tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); +	time_status |= STA_MODE; + +	return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);  }  static void ntp_update_offset(long offset)  { -	long mtemp;  	s64 freq_adj; +	s64 offset64; +	long secs;  	if (!(time_status & STA_PLL))  		return; @@ -84,24 +142,23 @@ static void ntp_update_offset(long offset)  	 * Select how the frequency is to be controlled  	 * and in which mode (PLL or FLL).  	 */ -	if (time_status & STA_FREQHOLD || time_reftime == 0) -		time_reftime = xtime.tv_sec; -	mtemp = xtime.tv_sec - time_reftime; +	secs = xtime.tv_sec - time_reftime; +	if (unlikely(time_status & STA_FREQHOLD)) +		secs = 0; +  	time_reftime = xtime.tv_sec; -	freq_adj = (s64)offset * mtemp; -	freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant); -	time_status &= ~STA_MODE; -	if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { -		freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL), -				    mtemp); -		time_status |= STA_MODE; -	} -	freq_adj += time_freq; -	freq_adj = min(freq_adj, MAXFREQ_SCALED); -	time_freq = max(freq_adj, -MAXFREQ_SCALED); +	offset64    = offset; +	freq_adj    = (offset64 * secs) << +			(NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + +	freq_adj    += ntp_update_offset_fll(offset64, secs); -	time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); +	freq_adj    = min(freq_adj + time_freq, MAXFREQ_SCALED); + +	time_freq   = max(freq_adj, -MAXFREQ_SCALED); + +	time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);  }  /** @@ -111,15 +168,15 @@ static void ntp_update_offset(long offset)   */  void ntp_clear(void)  { -	time_adjust = 0;		/* stop active adjtime() */ -	time_status |= STA_UNSYNC; -	time_maxerror = NTP_PHASE_LIMIT; -	time_esterror = NTP_PHASE_LIMIT; +	time_adjust	= 0;		/* stop active adjtime() */ +	time_status	|= STA_UNSYNC; +	time_maxerror	= NTP_PHASE_LIMIT; +	time_esterror	= NTP_PHASE_LIMIT;  	ntp_update_frequency(); -	tick_length = tick_length_base; -	time_offset = 0; +	tick_length	= tick_length_base; +	time_offset	= 0;  }  /* @@ -140,8 +197,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)  		xtime.tv_sec--;  		wall_to_monotonic.tv_sec++;  		time_state = TIME_OOP; -		printk(KERN_NOTICE "Clock: " -		       "inserting leap second 23:59:60 UTC\n"); +		printk(KERN_NOTICE +			"Clock: inserting leap second 23:59:60 UTC\n");  		hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);  		res = HRTIMER_RESTART;  		break; @@ -150,8 +207,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)  		time_tai--;  		wall_to_monotonic.tv_sec--;  		time_state = TIME_WAIT; -		printk(KERN_NOTICE "Clock: " -		       "deleting leap second 23:59:59 UTC\n"); +		printk(KERN_NOTICE +			"Clock: deleting leap second 23:59:59 UTC\n");  		break;  	case TIME_OOP:  		time_tai++; @@ -179,7 +236,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)   */  void second_overflow(void)  { -	s64 time_adj; +	s64 delta;  	/* Bump the maxerror field */  	time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -192,24 +249,30 @@ void 
second_overflow(void)  	 * Compute the phase adjustment for the next second. The offset is  	 * reduced by a fixed factor times the time constant.  	 */ -	tick_length = tick_length_base; -	time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); -	time_offset -= time_adj; -	tick_length += time_adj; +	tick_length	 = tick_length_base; -	if (unlikely(time_adjust)) { -		if (time_adjust > MAX_TICKADJ) { -			time_adjust -= MAX_TICKADJ; -			tick_length += MAX_TICKADJ_SCALED; -		} else if (time_adjust < -MAX_TICKADJ) { -			time_adjust += MAX_TICKADJ; -			tick_length -= MAX_TICKADJ_SCALED; -		} else { -			tick_length += (s64)(time_adjust * NSEC_PER_USEC / -					NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; -			time_adjust = 0; -		} +	delta		 = shift_right(time_offset, SHIFT_PLL + time_constant); +	time_offset	-= delta; +	tick_length	+= delta; + +	if (!time_adjust) +		return; + +	if (time_adjust > MAX_TICKADJ) { +		time_adjust -= MAX_TICKADJ; +		tick_length += MAX_TICKADJ_SCALED; +		return;  	} + +	if (time_adjust < -MAX_TICKADJ) { +		time_adjust += MAX_TICKADJ; +		tick_length -= MAX_TICKADJ_SCALED; +		return; +	} + +	tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) +							 << NTP_SCALE_SHIFT; +	time_adjust = 0;  }  #ifdef CONFIG_GENERIC_CMOS_UPDATE @@ -233,12 +296,13 @@ static void sync_cmos_clock(struct work_struct *work)  	 * This code is run on a timer.  If the clock is set, that timer  	 * may not expire at the correct time.  Thus, we adjust...  	 */ -	if (!ntp_synced()) +	if (!ntp_synced()) {  		/*  		 * Not synced, exit, do not restart a timer (if one is  		 * running, let it run out).  		 */  		return; +	}  	getnstimeofday(&now);  	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) @@ -270,7 +334,116 @@ static void notify_cmos_timer(void)  static inline void notify_cmos_timer(void) { }  #endif -/* adjtimex mainly allows reading (and writing, if superuser) of +/* + * Start the leap seconds timer: + */ +static inline void ntp_start_leap_timer(struct timespec *ts) +{ +	long now = ts->tv_sec; + +	if (time_status & STA_INS) { +		time_state = TIME_INS; +		now += 86400 - now % 86400; +		hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); + +		return; +	} + +	if (time_status & STA_DEL) { +		time_state = TIME_DEL; +		now += 86400 - (now + 1) % 86400; +		hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); +	} +} + +/* + * Propagate a new txc->status value into the NTP state: + */ +static inline void process_adj_status(struct timex *txc, struct timespec *ts) +{ +	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { +		time_state = TIME_OK; +		time_status = STA_UNSYNC; +	} + +	/* +	 * If we turn on PLL adjustments then reset the +	 * reference time to current time. 
+	 */ +	if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) +		time_reftime = xtime.tv_sec; + +	/* only set allowed bits */ +	time_status &= STA_RONLY; +	time_status |= txc->status & ~STA_RONLY; + +	switch (time_state) { +	case TIME_OK: +		ntp_start_leap_timer(ts); +		break; +	case TIME_INS: +	case TIME_DEL: +		time_state = TIME_OK; +		ntp_start_leap_timer(ts); +	case TIME_WAIT: +		if (!(time_status & (STA_INS | STA_DEL))) +			time_state = TIME_OK; +		break; +	case TIME_OOP: +		hrtimer_restart(&leap_timer); +		break; +	} +} +/* + * Called with the xtime lock held, so we can access and modify + * all the global NTP state: + */ +static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) +{ +	if (txc->modes & ADJ_STATUS) +		process_adj_status(txc, ts); + +	if (txc->modes & ADJ_NANO) +		time_status |= STA_NANO; + +	if (txc->modes & ADJ_MICRO) +		time_status &= ~STA_NANO; + +	if (txc->modes & ADJ_FREQUENCY) { +		time_freq = txc->freq * PPM_SCALE; +		time_freq = min(time_freq, MAXFREQ_SCALED); +		time_freq = max(time_freq, -MAXFREQ_SCALED); +	} + +	if (txc->modes & ADJ_MAXERROR) +		time_maxerror = txc->maxerror; + +	if (txc->modes & ADJ_ESTERROR) +		time_esterror = txc->esterror; + +	if (txc->modes & ADJ_TIMECONST) { +		time_constant = txc->constant; +		if (!(time_status & STA_NANO)) +			time_constant += 4; +		time_constant = min(time_constant, (long)MAXTC); +		time_constant = max(time_constant, 0l); +	} + +	if (txc->modes & ADJ_TAI && txc->constant > 0) +		time_tai = txc->constant; + +	if (txc->modes & ADJ_OFFSET) +		ntp_update_offset(txc->offset); + +	if (txc->modes & ADJ_TICK) +		tick_usec = txc->tick; + +	if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) +		ntp_update_frequency(); +} + +/* + * adjtimex mainly allows reading (and writing, if superuser) of   * kernel time-keeping variables. used by xntpd.   */  int do_adjtimex(struct timex *txc) @@ -291,11 +464,14 @@ int do_adjtimex(struct timex *txc)  		 if (txc->modes && !capable(CAP_SYS_TIME))  			return -EPERM; -		/* if the quartz is off by more than 10% something is VERY wrong! */ +		/* +		 * if the quartz is off by more than 10% then +		 * something is VERY wrong! 
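process_adjtimex_modes() above is the kernel-side consumer of the struct timex mode bits; from userspace the same state is reached through adjtimex(2). A small read-only example (modes = 0) that prints the values the code above maintains:

#include <sys/timex.h>
#include <stdio.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* read-only query */
	int state = adjtimex(&tx);

	if (state == -1) {
		perror("adjtimex");
		return 1;
	}

	/* freq is scaled ppm (2^-16 ppm); offset is us, or ns with STA_NANO */
	printf("state=%d status=0x%x offset=%ld freq=%ld maxerror=%ld\n",
	       state, (unsigned)tx.status, tx.offset, tx.freq, tx.maxerror);
	return 0;
}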
+		 */  		if (txc->modes & ADJ_TICK &&  		    (txc->tick <  900000/USER_HZ ||  		     txc->tick > 1100000/USER_HZ)) -				return -EINVAL; +			return -EINVAL;  		if (txc->modes & ADJ_STATUS && time_state != TIME_OK)  			hrtimer_cancel(&leap_timer); @@ -305,7 +481,6 @@ int do_adjtimex(struct timex *txc)  	write_seqlock_irq(&xtime_lock); -	/* If there are input parameters, then process them */  	if (txc->modes & ADJ_ADJTIME) {  		long save_adjust = time_adjust; @@ -315,98 +490,24 @@ int do_adjtimex(struct timex *txc)  			ntp_update_frequency();  		}  		txc->offset = save_adjust; -		goto adj_done; -	} -	if (txc->modes) { -		long sec; - -		if (txc->modes & ADJ_STATUS) { -			if ((time_status & STA_PLL) && -			    !(txc->status & STA_PLL)) { -				time_state = TIME_OK; -				time_status = STA_UNSYNC; -			} -			/* only set allowed bits */ -			time_status &= STA_RONLY; -			time_status |= txc->status & ~STA_RONLY; - -			switch (time_state) { -			case TIME_OK: -			start_timer: -				sec = ts.tv_sec; -				if (time_status & STA_INS) { -					time_state = TIME_INS; -					sec += 86400 - sec % 86400; -					hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); -				} else if (time_status & STA_DEL) { -					time_state = TIME_DEL; -					sec += 86400 - (sec + 1) % 86400; -					hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); -				} -				break; -			case TIME_INS: -			case TIME_DEL: -				time_state = TIME_OK; -				goto start_timer; -				break; -			case TIME_WAIT: -				if (!(time_status & (STA_INS | STA_DEL))) -					time_state = TIME_OK; -				break; -			case TIME_OOP: -				hrtimer_restart(&leap_timer); -				break; -			} -		} - -		if (txc->modes & ADJ_NANO) -			time_status |= STA_NANO; -		if (txc->modes & ADJ_MICRO) -			time_status &= ~STA_NANO; - -		if (txc->modes & ADJ_FREQUENCY) { -			time_freq = (s64)txc->freq * PPM_SCALE; -			time_freq = min(time_freq, MAXFREQ_SCALED); -			time_freq = max(time_freq, -MAXFREQ_SCALED); -		} - -		if (txc->modes & ADJ_MAXERROR) -			time_maxerror = txc->maxerror; -		if (txc->modes & ADJ_ESTERROR) -			time_esterror = txc->esterror; - -		if (txc->modes & ADJ_TIMECONST) { -			time_constant = txc->constant; -			if (!(time_status & STA_NANO)) -				time_constant += 4; -			time_constant = min(time_constant, (long)MAXTC); -			time_constant = max(time_constant, 0l); -		} - -		if (txc->modes & ADJ_TAI && txc->constant > 0) -			time_tai = txc->constant; - -		if (txc->modes & ADJ_OFFSET) -			ntp_update_offset(txc->offset); -		if (txc->modes & ADJ_TICK) -			tick_usec = txc->tick; +	} else { -		if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) -			ntp_update_frequency(); -	} +		/* If there are input parameters, then process them: */ +		if (txc->modes) +			process_adjtimex_modes(txc, &ts); -	txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, +		txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,  				  NTP_SCALE_SHIFT); -	if (!(time_status & STA_NANO)) -		txc->offset /= NSEC_PER_USEC; +		if (!(time_status & STA_NANO)) +			txc->offset /= NSEC_PER_USEC; +	} -adj_done:  	result = time_state;	/* mostly `TIME_OK' */  	if (time_status & (STA_UNSYNC|STA_CLOCKERR))  		result = TIME_ERROR;  	txc->freq	   = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * -					 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); +					 PPM_SCALE_INV, NTP_SCALE_SHIFT);  	txc->maxerror	   = time_maxerror;  	txc->esterror	   = time_esterror;  	txc->status	   = time_status; @@ -425,6 +526,7 @@ adj_done:  	txc->calcnt	   = 0;  	txc->errcnt	   = 0;  	txc->stbcnt	   = 0; +  	
write_sequnlock_irq(&xtime_lock);  	txc->time.tv_sec = ts.tv_sec; @@ -440,6 +542,8 @@ adj_done:  static int __init ntp_tick_adj_setup(char *str)  {  	ntp_tick_adj = simple_strtol(str, NULL, 0); +	ntp_tick_adj <<= NTP_SCALE_SHIFT; +  	return 1;  } diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c new file mode 100644 index 00000000000..71e7f1a1915 --- /dev/null +++ b/kernel/time/timecompare.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2009 Intel Corporation. + * Author: Patrick Ohly <patrick.ohly@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/timecompare.h> +#include <linux/module.h> +#include <linux/math64.h> + +/* + * fixed point arithmetic scale factor for skew + * + * Usually one would measure skew in ppb (parts per billion, 1e9), but + * using a factor of 2 simplifies the math. + */ +#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) + +ktime_t timecompare_transform(struct timecompare *sync, +			      u64 source_tstamp) +{ +	u64 nsec; + +	nsec = source_tstamp + sync->offset; +	nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / +		TIMECOMPARE_SKEW_RESOLUTION; + +	return ns_to_ktime(nsec); +} +EXPORT_SYMBOL(timecompare_transform); + +int timecompare_offset(struct timecompare *sync, +		       s64 *offset, +		       u64 *source_tstamp) +{ +	u64 start_source = 0, end_source = 0; +	struct { +		s64 offset; +		s64 duration_target; +	} buffer[10], sample, *samples; +	int counter = 0, i; +	int used; +	int index; +	int num_samples = sync->num_samples; + +	if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { +		samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); +		if (!samples) { +			samples = buffer; +			num_samples = sizeof(buffer)/sizeof(buffer[0]); +		} +	} else { +		samples = buffer; +	} + +	/* run until we have enough valid samples, but do not try forever */ +	i = 0; +	counter = 0; +	while (1) { +		u64 ts; +		ktime_t start, end; + +		start = sync->target(); +		ts = timecounter_read(sync->source); +		end = sync->target(); + +		if (!i) +			start_source = ts; + +		/* ignore negative durations */ +		sample.duration_target = ktime_to_ns(ktime_sub(end, start)); +		if (sample.duration_target >= 0) { +			/* +			 * assume symetric delay to and from source: +			 * average target time corresponds to measured +			 * source time +			 */ +			sample.offset = +				ktime_to_ns(ktime_add(end, start)) / 2 - +				ts; + +			/* simple insertion sort based on duration */ +			index = counter - 1; +			while (index >= 0) { +				if (samples[index].duration_target < +				    sample.duration_target) +					break; +				samples[index + 1] = samples[index]; +				index--; +			} +			samples[index + 1] = sample; +			counter++; +		} + +		i++; +		if (counter >= num_samples || i >= 100000) { +			end_source = ts; +			break; +		} +	} + +	*source_tstamp = (end_source + start_source) / 2; + +	/* remove 
outliers by only using 75% of the samples */ +	used = counter * 3 / 4; +	if (!used) +		used = counter; +	if (used) { +		/* calculate average */ +		s64 off = 0; +		for (index = 0; index < used; index++) +			off += samples[index].offset; +		*offset = div_s64(off, used); +	} + +	if (samples && samples != buffer) +		kfree(samples); + +	return used; +} +EXPORT_SYMBOL(timecompare_offset); + +void __timecompare_update(struct timecompare *sync, +			  u64 source_tstamp) +{ +	s64 offset; +	u64 average_time; + +	if (!timecompare_offset(sync, &offset, &average_time)) +		return; + +	if (!sync->last_update) { +		sync->last_update = average_time; +		sync->offset = offset; +		sync->skew = 0; +	} else { +		s64 delta_nsec = average_time - sync->last_update; + +		/* avoid division by negative or small deltas */ +		if (delta_nsec >= 10000) { +			s64 delta_offset_nsec = offset - sync->offset; +			s64 skew; /* delta_offset_nsec * +				     TIMECOMPARE_SKEW_RESOLUTION / +				     delta_nsec */ +			u64 divisor; + +			/* div_s64() is limited to 32 bit divisor */ +			skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; +			divisor = delta_nsec; +			while (unlikely(divisor >= ((s64)1) << 32)) { +				/* divide both by 2; beware, right shift +				   of negative value has undefined +				   behavior and can only be used for +				   the positive divisor */ +				skew = div_s64(skew, 2); +				divisor >>= 1; +			} +			skew = div_s64(skew, divisor); + +			/* +			 * Calculate new overall skew as 4/16 the +			 * old value and 12/16 the new one. This is +			 * a rather arbitrary tradeoff between +			 * only using the latest measurement (0/16 and +			 * 16/16) and even more weight on past measurements. +			 */ +#define TIMECOMPARE_NEW_SKEW_PER_16 12 +			sync->skew = +				div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * +					sync->skew + +					TIMECOMPARE_NEW_SKEW_PER_16 * skew, +					16); +			sync->last_update = average_time; +			sync->offset = offset; +		} +	} +} +EXPORT_SYMBOL(__timecompare_update); diff --git a/kernel/timer.c b/kernel/timer.c index 13dd64fe143..b4555568b4e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -491,14 +491,18 @@ static inline void debug_timer_free(struct timer_list *timer)  	debug_object_free(timer, &timer_debug_descr);  } -static void __init_timer(struct timer_list *timer); +static void __init_timer(struct timer_list *timer, +			 const char *name, +			 struct lock_class_key *key); -void init_timer_on_stack(struct timer_list *timer) +void init_timer_on_stack_key(struct timer_list *timer, +			     const char *name, +			     struct lock_class_key *key)  {  	debug_object_init_on_stack(timer, &timer_debug_descr); -	__init_timer(timer); +	__init_timer(timer, name, key);  } -EXPORT_SYMBOL_GPL(init_timer_on_stack); +EXPORT_SYMBOL_GPL(init_timer_on_stack_key);  void destroy_timer_on_stack(struct timer_list *timer)  { @@ -512,7 +516,9 @@ static inline void debug_timer_activate(struct timer_list *timer) { }  static inline void debug_timer_deactivate(struct timer_list *timer) { }  #endif -static void __init_timer(struct timer_list *timer) +static void __init_timer(struct timer_list *timer, +			 const char *name, +			 struct lock_class_key *key)  {  	timer->entry.next = NULL;  	timer->base = __raw_get_cpu_var(tvec_bases); @@ -521,6 +527,7 @@ static void __init_timer(struct timer_list *timer)  	timer->start_pid = -1;  	memset(timer->start_comm, 0, TASK_COMM_LEN);  #endif +	lockdep_init_map(&timer->lockdep_map, name, key, 0);  }  /** @@ -530,19 +537,23 @@ static void __init_timer(struct timer_list *timer)   * 
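__timecompare_update() above folds each new skew estimate into the old one with a fixed 12/16 weight, a small fixed-point low-pass filter. A standalone model of just that blending step (plain C; the kernel additionally scales by TIMECOMPARE_SKEW_RESOLUTION and skips deltas under 10 us):

#include <stdint.h>
#include <stdio.h>

#define NEW_SKEW_PER_16	12		/* weight of the latest sample */

/* skew kept in the same fixed-point form as the kernel: offset delta per
 * 2^30 ns of elapsed source time */
static int64_t blend_skew(int64_t old_skew, int64_t new_skew)
{
	return ((16 - NEW_SKEW_PER_16) * old_skew +
		NEW_SKEW_PER_16 * new_skew) / 16;
}

int main(void)
{
	int64_t skew = 0;

	/* a steady 100-unit estimate converges geometrically */
	for (int i = 0; i < 6; i++) {
		skew = blend_skew(skew, 100);
		printf("step %d: skew=%lld\n", i, (long long)skew);
	}
	return 0;	/* prints 75, 93, 98, 99, 99, 99 */
}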
init_timer() must be done to a timer prior calling *any* of the   * other timer functions.   */ -void init_timer(struct timer_list *timer) +void init_timer_key(struct timer_list *timer, +		    const char *name, +		    struct lock_class_key *key)  {  	debug_timer_init(timer); -	__init_timer(timer); +	__init_timer(timer, name, key);  } -EXPORT_SYMBOL(init_timer); +EXPORT_SYMBOL(init_timer_key); -void init_timer_deferrable(struct timer_list *timer) +void init_timer_deferrable_key(struct timer_list *timer, +			       const char *name, +			       struct lock_class_key *key)  { -	init_timer(timer); +	init_timer_key(timer, name, key);  	timer_set_deferrable(timer);  } -EXPORT_SYMBOL(init_timer_deferrable); +EXPORT_SYMBOL(init_timer_deferrable_key);  static inline void detach_timer(struct timer_list *timer,  				int clear_pending) @@ -589,11 +600,14 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,  	}  } -int __mod_timer(struct timer_list *timer, unsigned long expires) +static inline int +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)  {  	struct tvec_base *base, *new_base;  	unsigned long flags; -	int ret = 0; +	int ret; + +	ret = 0;  	timer_stats_timer_set_start_info(timer);  	BUG_ON(!timer->function); @@ -603,6 +617,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)  	if (timer_pending(timer)) {  		detach_timer(timer, 0);  		ret = 1; +	} else { +		if (pending_only) +			goto out_unlock;  	}  	debug_timer_activate(timer); @@ -629,42 +646,28 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)  	timer->expires = expires;  	internal_add_timer(base, timer); + +out_unlock:  	spin_unlock_irqrestore(&base->lock, flags);  	return ret;  } -EXPORT_SYMBOL(__mod_timer); -  /** - * add_timer_on - start a timer on a particular CPU - * @timer: the timer to be added - * @cpu: the CPU to start it on + * mod_timer_pending - modify a pending timer's timeout + * @timer: the pending timer to be modified + * @expires: new timeout in jiffies   * - * This is not very scalable on SMP. Double adds are not possible. + * mod_timer_pending() is the same for pending timers as mod_timer(), + * but will not re-activate and modify already deleted timers. + * + * It is useful for unserialized use of timers.   */ -void add_timer_on(struct timer_list *timer, int cpu) +int mod_timer_pending(struct timer_list *timer, unsigned long expires)  { -	struct tvec_base *base = per_cpu(tvec_bases, cpu); -	unsigned long flags; - -	timer_stats_timer_set_start_info(timer); -	BUG_ON(timer_pending(timer) || !timer->function); -	spin_lock_irqsave(&base->lock, flags); -	timer_set_base(timer, base); -	debug_timer_activate(timer); -	internal_add_timer(base, timer); -	/* -	 * Check whether the other CPU is idle and needs to be -	 * triggered to reevaluate the timer wheel when nohz is -	 * active. We are protected against the other CPU fiddling -	 * with the timer by holding the timer base lock. This also -	 * makes sure that a CPU on the way to idle can not evaluate -	 * the timer wheel. 
-	 */ -	wake_up_idle_cpu(cpu); -	spin_unlock_irqrestore(&base->lock, flags); +	return __mod_timer(timer, expires, true);  } +EXPORT_SYMBOL(mod_timer_pending);  /**   * mod_timer - modify a timer's timeout @@ -688,9 +691,6 @@ void add_timer_on(struct timer_list *timer, int cpu)   */  int mod_timer(struct timer_list *timer, unsigned long expires)  { -	BUG_ON(!timer->function); - -	timer_stats_timer_set_start_info(timer);  	/*  	 * This is a common optimization triggered by the  	 * networking code - if the timer is re-modified @@ -699,12 +699,62 @@ int mod_timer(struct timer_list *timer, unsigned long expires)  	if (timer->expires == expires && timer_pending(timer))  		return 1; -	return __mod_timer(timer, expires); +	return __mod_timer(timer, expires, false);  } -  EXPORT_SYMBOL(mod_timer);  /** + * add_timer - start a timer + * @timer: the timer to be added + * + * The kernel will do a ->function(->data) callback from the + * timer interrupt at the ->expires point in the future. The + * current time is 'jiffies'. + * + * The timer's ->expires, ->function (and if the handler uses it, ->data) + * fields must be set prior calling this function. + * + * Timers with an ->expires field in the past will be executed in the next + * timer tick. + */ +void add_timer(struct timer_list *timer) +{ +	BUG_ON(timer_pending(timer)); +	mod_timer(timer, timer->expires); +} +EXPORT_SYMBOL(add_timer); + +/** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ +	struct tvec_base *base = per_cpu(tvec_bases, cpu); +	unsigned long flags; + +	timer_stats_timer_set_start_info(timer); +	BUG_ON(timer_pending(timer) || !timer->function); +	spin_lock_irqsave(&base->lock, flags); +	timer_set_base(timer, base); +	debug_timer_activate(timer); +	internal_add_timer(base, timer); +	/* +	 * Check whether the other CPU is idle and needs to be +	 * triggered to reevaluate the timer wheel when nohz is +	 * active. We are protected against the other CPU fiddling +	 * with the timer by holding the timer base lock. This also +	 * makes sure that a CPU on the way to idle can not evaluate +	 * the timer wheel. +	 */ +	wake_up_idle_cpu(cpu); +	spin_unlock_irqrestore(&base->lock, flags); +} + +/**   * del_timer - deactive a timer.   
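mod_timer_pending() above lets a fast path push a timeout further out only while the timer is still queued, so a timer already deleted by the teardown side cannot be resurrected by the refresh. A usage sketch with an invented keepalive timer, using the era's setup_timer()/unsigned long callback convention:

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list keepalive_timer;

static void keepalive_expired(unsigned long data)
{
	/* peer was idle too long: tear the connection down; not re-armed */
}

static void keepalive_start(void)
{
	setup_timer(&keepalive_timer, keepalive_expired, 0);
	keepalive_timer.expires = jiffies + 5 * HZ;
	add_timer(&keepalive_timer);
}

static void keepalive_saw_traffic(void)
{
	/*
	 * Refresh the deadline, but only while the timer is still pending:
	 * if the shutdown path already ran del_timer(), this stays a no-op
	 * instead of re-activating a dead timer.
	 */
	mod_timer_pending(&keepalive_timer, jiffies + 5 * HZ);
}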
* @timer: the timer to be deactivated   * @@ -733,7 +783,6 @@ int del_timer(struct timer_list *timer)  	return ret;  } -  EXPORT_SYMBOL(del_timer);  #ifdef CONFIG_SMP @@ -767,7 +816,6 @@ out:  	return ret;  } -  EXPORT_SYMBOL(try_to_del_timer_sync);  /** @@ -789,6 +837,15 @@ EXPORT_SYMBOL(try_to_del_timer_sync);   */  int del_timer_sync(struct timer_list *timer)  { +#ifdef CONFIG_LOCKDEP +	unsigned long flags; + +	local_irq_save(flags); +	lock_map_acquire(&timer->lockdep_map); +	lock_map_release(&timer->lockdep_map); +	local_irq_restore(flags); +#endif +  	for (;;) {  		int ret = try_to_del_timer_sync(timer);  		if (ret >= 0) @@ -796,7 +853,6 @@ int del_timer_sync(struct timer_list *timer)  		cpu_relax();  	}  } -  EXPORT_SYMBOL(del_timer_sync);  #endif @@ -861,10 +917,36 @@ static inline void __run_timers(struct tvec_base *base)  			set_running_timer(base, timer);  			detach_timer(timer, 1); +  			spin_unlock_irq(&base->lock);  			{  				int preempt_count = preempt_count(); + +#ifdef CONFIG_LOCKDEP +				/* +				 * It is permissible to free the timer from +				 * inside the function that is called from +				 * it, this we need to take into account for +				 * lockdep too. To avoid bogus "held lock +				 * freed" warnings as well as problems when +				 * looking into timer->lockdep_map, make a +				 * copy and use that here. +				 */ +				struct lockdep_map lockdep_map = +					timer->lockdep_map; +#endif +				/* +				 * Couple the lock chain with the lock chain at +				 * del_timer_sync() by acquiring the lock_map +				 * around the fn() call here and in +				 * del_timer_sync(). +				 */ +				lock_map_acquire(&lockdep_map); +  				fn(data); + +				lock_map_release(&lockdep_map); +  				if (preempt_count != preempt_count()) {  					printk(KERN_ERR "huh, entered %p "  					       "with preempt_count %08x, exited" @@ -1268,7 +1350,7 @@ signed long __sched schedule_timeout(signed long timeout)  	expire = timeout + jiffies;  	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); -	__mod_timer(&timer, expire); +	__mod_timer(&timer, expire, false);  	schedule();  	del_singleshot_timer_sync(&timer); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 34e707e5ab8..504086ab444 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -72,11 +72,10 @@ config FUNCTION_GRAPH_TRACER  	help  	  Enable the kernel to trace a function at both its return  	  and its entry. -	  It's first purpose is to trace the duration of functions and -	  draw a call graph for each thread with some informations like -	  the return value. -	  This is done by setting the current return address on the current -	  task structure into a stack of calls. +	  Its first purpose is to trace the duration of functions and +	  draw a call graph for each thread with some information like +	  the return value. This is done by setting the current return  +	  address on the current task structure into a stack of calls.  config IRQSOFF_TRACER  	bool "Interrupts-off Latency Tracer" diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fdf913dfc7e..53e8c8bc0c9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1908,7 +1908,7 @@ int register_ftrace_function(struct ftrace_ops *ops)  }  /** - * unregister_ftrace_function - unresgister a function for profiling. + * unregister_ftrace_function - unregister a function for profiling.   * @ops - ops structure that holds the function to unregister   *   * Unregister a function that was added to be called by ftrace profiling. 
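The per-timer lockdep_map added above couples del_timer_sync() with the running callback: the handler executes inside lock_map_acquire(&timer->lockdep_map) and del_timer_sync() takes the same map first, so waiting for a handler while holding a lock that the handler itself needs is now reported by lockdep instead of deadlocking silently. A sketch of the safe ordering (names invented):

#include <linux/timer.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>

static DEFINE_SPINLOCK(state_lock);
static struct timer_list poll_timer;
static int state_dead;

static void poll_handler(unsigned long data)
{
	spin_lock(&state_lock);
	if (!state_dead)
		mod_timer(&poll_timer, jiffies + HZ);	/* re-arm */
	spin_unlock(&state_lock);
}

static void poll_start(void)
{
	setup_timer(&poll_timer, poll_handler, 0);
	mod_timer(&poll_timer, jiffies + HZ);
}

static void poll_stop(void)
{
	spin_lock(&state_lock);
	state_dead = 1;			/* stop the handler re-arming */
	spin_unlock(&state_lock);

	/*
	 * Wait for a running handler only *after* dropping state_lock:
	 * calling del_timer_sync() with state_lock held would deadlock
	 * against poll_handler(), and the new lockdep annotation reports
	 * exactly that ordering.
	 */
	del_timer_sync(&poll_timer);
}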
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 930c08e5b38..dce71a5b51b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -42,6 +42,81 @@ static struct tracer_flags tracer_flags = {  /* pid on the last trace processed */  static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; +/* Add a function return address to the trace stack on thread info.*/ +int +ftrace_push_return_trace(unsigned long ret, unsigned long long time, +			 unsigned long func, int *depth) +{ +	int index; + +	if (!current->ret_stack) +		return -EBUSY; + +	/* The return trace stack is full */ +	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { +		atomic_inc(¤t->trace_overrun); +		return -EBUSY; +	} + +	index = ++current->curr_ret_stack; +	barrier(); +	current->ret_stack[index].ret = ret; +	current->ret_stack[index].func = func; +	current->ret_stack[index].calltime = time; +	*depth = index; + +	return 0; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +void +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) +{ +	int index; + +	index = current->curr_ret_stack; + +	if (unlikely(index < 0)) { +		ftrace_graph_stop(); +		WARN_ON(1); +		/* Might as well panic, otherwise we have no where to go */ +		*ret = (unsigned long)panic; +		return; +	} + +	*ret = current->ret_stack[index].ret; +	trace->func = current->ret_stack[index].func; +	trace->calltime = current->ret_stack[index].calltime; +	trace->overrun = atomic_read(¤t->trace_overrun); +	trace->depth = index; +	barrier(); +	current->curr_ret_stack--; + +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ +	struct ftrace_graph_ret trace; +	unsigned long ret; + +	ftrace_pop_return_trace(&trace, &ret); +	trace.rettime = cpu_clock(raw_smp_processor_id()); +	ftrace_graph_return(&trace); + +	if (unlikely(!ret)) { +		ftrace_graph_stop(); +		WARN_ON(1); +		/* Might as well panic. What else to do? */ +		ret = (unsigned long)panic; +	} + +	return ret; +} +  static int graph_trace_init(struct trace_array *tr)  {  	int cpu, ret; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 43f891b05a4..00d59d048ed 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -122,8 +122,10 @@ void acct_update_integrals(struct task_struct *tsk)  	if (likely(tsk->mm)) {  		cputime_t time, dtime;  		struct timeval value; +		unsigned long flags;  		u64 delta; +		local_irq_save(flags);  		time = tsk->stime + tsk->utime;  		dtime = cputime_sub(time, tsk->acct_timexpd);  		jiffies_to_timeval(cputime_to_jiffies(dtime), &value); @@ -131,10 +133,12 @@ void acct_update_integrals(struct task_struct *tsk)  		delta = delta * USEC_PER_SEC + value.tv_usec;  		if (delta == 0) -			return; +			goto out;  		tsk->acct_timexpd = time;  		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);  		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; +	out: +		local_irq_restore(flags);  	}  } diff --git a/kernel/user.c b/kernel/user.c index 3551ac74239..850e0ba41c1 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -20,7 +20,7 @@  struct user_namespace init_user_ns = {  	.kref = { -		.refcount	= ATOMIC_INIT(1), +		.refcount	= ATOMIC_INIT(2),  	},  	.creator = &root_user,  }; @@ -286,14 +286,12 @@ int __init uids_sysfs_init(void)  /* work function to remove sysfs directory for a user and free up   * corresponding structures.   
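ftrace_push_return_trace()/ftrace_pop_return_trace() above maintain a per-task shadow stack of hijacked return addresses so the graph tracer can time each function on the way back out. A userspace model of that bookkeeping, fixed depth and -EBUSY when full, mirroring the kernel code (which additionally counts overruns):

#include <errno.h>
#include <stdint.h>

#define RET_DEPTH 50			/* FTRACE_RETFUNC_DEPTH in the kernel */

struct ret_entry {
	uintptr_t	ret;		/* original return address */
	uintptr_t	func;		/* traced function */
	uint64_t	calltime;	/* timestamp taken on entry */
};

struct ret_stack {
	int		curr;		/* newest entry, -1 when empty */
	struct ret_entry e[RET_DEPTH];
};

static int push_return(struct ret_stack *s, uintptr_t ret, uintptr_t func,
		       uint64_t now)
{
	if (s->curr == RET_DEPTH - 1)
		return -EBUSY;		/* full: this frame is not traced */

	s->curr++;
	s->e[s->curr].ret = ret;
	s->e[s->curr].func = func;
	s->e[s->curr].calltime = now;
	return 0;
}

static uintptr_t pop_return(struct ret_stack *s, uint64_t now, uint64_t *spent)
{
	struct ret_entry *e = &s->e[s->curr--];

	*spent = now - e->calltime;	/* time spent inside e->func */
	return e->ret;			/* where the function really returns */
}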
*/ -static void remove_user_sysfs_dir(struct work_struct *w) +static void cleanup_user_struct(struct work_struct *w)  {  	struct user_struct *up = container_of(w, struct user_struct, work);  	unsigned long flags;  	int remove_user = 0; -	if (up->user_ns != &init_user_ns) -		return;  	/* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()  	 * atomic.  	 */ @@ -312,9 +310,11 @@ static void remove_user_sysfs_dir(struct work_struct *w)  	if (!remove_user)  		goto done; -	kobject_uevent(&up->kobj, KOBJ_REMOVE); -	kobject_del(&up->kobj); -	kobject_put(&up->kobj); +	if (up->user_ns == &init_user_ns) { +		kobject_uevent(&up->kobj, KOBJ_REMOVE); +		kobject_del(&up->kobj); +		kobject_put(&up->kobj); +	}  	sched_destroy_user(up);  	key_put(up->uid_keyring); @@ -335,7 +335,7 @@ static void free_user(struct user_struct *up, unsigned long flags)  	atomic_inc(&up->__count);  	spin_unlock_irqrestore(&uidhash_lock, flags); -	INIT_WORK(&up->work, remove_user_sysfs_dir); +	INIT_WORK(&up->work, cleanup_user_struct);  	schedule_work(&up->work);  } @@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags)  #endif +#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) +/* + * We need to check if a setuid can take place. This function should be called + * before successfully completing the setuid. + */ +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + +	return sched_rt_can_attach(up->tg, tsk); + +} +#else +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ +	return 1; +} +#endif +  /*   * Locate the user_struct for the passed UID.  If found, take a ref on it.  The   * caller must undo that ref with free_uid(). diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3b34b354593..92359cc747a 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -37,7 +37,7 @@ static void put_uts(ctl_table *table, int write, void *which)  		up_write(&uts_sem);  } -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL  /*   *	Special case of dostring for the UTS structure. This has locks   *	to observe. Should this be in kernel/sys.c ???? diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1f0c509b40d..32f8e0d2bf5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,8 +48,6 @@ struct cpu_workqueue_struct {  	struct workqueue_struct *wq;  	struct task_struct *thread; - -	int run_depth;		/* Detect run_workqueue() recursion depth */  } ____cacheline_aligned;  /* @@ -262,13 +260,6 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);  static void run_workqueue(struct cpu_workqueue_struct *cwq)  {  	spin_lock_irq(&cwq->lock); -	cwq->run_depth++; -	if (cwq->run_depth > 3) { -		/* morton gets to eat his hat */ -		printk("%s: recursion depth exceeded: %d\n", -			__func__, cwq->run_depth); -		dump_stack(); -	}  	while (!list_empty(&cwq->worklist)) {  		struct work_struct *work = list_entry(cwq->worklist.next,  						struct work_struct, entry); @@ -311,7 +302,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)  		spin_lock_irq(&cwq->lock);  		cwq->current_work = NULL;  	} -	cwq->run_depth--;  	spin_unlock_irq(&cwq->lock);  } @@ -368,29 +358,20 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,  static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)  { -	int active; +	int active = 0; +	struct wq_barrier barr; -	if (cwq->thread == current) { -		/* -		 * Probably keventd trying to flush its own queue. So simply run -		 * it by hand rather than deadlocking. 
-		 */ -		run_workqueue(cwq); -		active = 1; -	} else { -		struct wq_barrier barr; +	WARN_ON(cwq->thread == current); -		active = 0; -		spin_lock_irq(&cwq->lock); -		if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { -			insert_wq_barrier(cwq, &barr, &cwq->worklist); -			active = 1; -		} -		spin_unlock_irq(&cwq->lock); - -		if (active) -			wait_for_completion(&barr.done); +	spin_lock_irq(&cwq->lock); +	if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { +		insert_wq_barrier(cwq, &barr, &cwq->worklist); +		active = 1;  	} +	spin_unlock_irq(&cwq->lock); + +	if (active) +		wait_for_completion(&barr.done);  	return active;  } @@ -416,7 +397,7 @@ void flush_workqueue(struct workqueue_struct *wq)  	might_sleep();  	lock_map_acquire(&wq->lockdep_map);  	lock_map_release(&wq->lockdep_map); -	for_each_cpu_mask_nr(cpu, *cpu_map) +	for_each_cpu(cpu, cpu_map)  		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));  }  EXPORT_SYMBOL_GPL(flush_workqueue); @@ -547,7 +528,7 @@ static void wait_on_work(struct work_struct *work)  	wq = cwq->wq;  	cpu_map = wq_cpu_map(wq); -	for_each_cpu_mask_nr(cpu, *cpu_map) +	for_each_cpu(cpu, cpu_map)  		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);  } @@ -911,7 +892,7 @@ void destroy_workqueue(struct workqueue_struct *wq)  	list_del(&wq->list);  	spin_unlock(&workqueue_lock); -	for_each_cpu_mask_nr(cpu, *cpu_map) +	for_each_cpu(cpu, cpu_map)  		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));   	cpu_maps_update_done();
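The flush_cpu_workqueue() change just above drops the old run-it-by-hand special case and always goes through insert_wq_barrier(): a marker work item goes on the queue and the flusher sleeps on a completion until that marker has run, which implies everything queued before it on that per-CPU worker thread has run too. Flushing from inside the same workqueue now only triggers the WARN_ON (and would deadlock), so callers must not do that. A rough, hypothetical sketch of the barrier idea using only public APIs; demo_flush() is illustrative, real code should keep calling flush_workqueue():

/*
 * Sketch of the completion-based barrier that flush_cpu_workqueue() relies
 * on. demo_* names are illustrative, not kernel interfaces.
 */
#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/kernel.h>

struct demo_barrier {
	struct work_struct	work;
	struct completion	done;
};

static void demo_barrier_fn(struct work_struct *work)
{
	struct demo_barrier *barr = container_of(work, struct demo_barrier, work);

	/* The per-CPU worker runs items in queue order, so everything queued
	 * before this barrier has already run by the time we get here. */
	complete(&barr->done);
}

/* Must not be called from a work item running on @wq: self-deadlock. */
static void demo_flush(struct workqueue_struct *wq)
{
	struct demo_barrier barr;

	INIT_WORK(&barr.work, demo_barrier_fn);
	init_completion(&barr.done);

	queue_work(wq, &barr.work);
	wait_for_completion(&barr.done);
	/* barr lives on the stack, so only return after it has completed. */
}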
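Going back to the trace_functions_graph.c hunk earlier: ftrace_push_return_trace() and ftrace_pop_return_trace() maintain a small, fixed-depth per-task stack of saved return addresses (plus the traced function and its call time), bumping trace_overrun and refusing the push when the stack is full, and stopping the tracer on underflow. A stripped-down userspace sketch of just that bookkeeping, with hypothetical names and sizes and assuming a single thread:

/*
 * Userspace sketch of the bounded return-trace stack; illustrative only.
 */
#include <stdio.h>
#include <errno.h>

#define RET_STACK_DEPTH 50			/* stand-in for FTRACE_RETFUNC_DEPTH */

struct ret_entry {
	unsigned long		ret;		/* saved return address */
	unsigned long		func;		/* traced function */
	unsigned long long	calltime;	/* timestamp at entry */
};

static struct ret_entry ret_stack[RET_STACK_DEPTH];
static int curr_ret_stack = -1;			/* -1 == empty */
static unsigned long trace_overrun;

static int push_return_trace(unsigned long ret, unsigned long long time,
			     unsigned long func, int *depth)
{
	if (curr_ret_stack == RET_STACK_DEPTH - 1) {
		trace_overrun++;		/* full: count it and refuse */
		return -EBUSY;
	}
	curr_ret_stack++;
	ret_stack[curr_ret_stack].ret = ret;
	ret_stack[curr_ret_stack].func = func;
	ret_stack[curr_ret_stack].calltime = time;
	*depth = curr_ret_stack;
	return 0;
}

static int pop_return_trace(unsigned long *ret, unsigned long *func)
{
	if (curr_ret_stack < 0)			/* underflow: caller must bail out */
		return -EINVAL;
	*ret = ret_stack[curr_ret_stack].ret;
	*func = ret_stack[curr_ret_stack].func;
	curr_ret_stack--;
	return 0;
}

int main(void)
{
	int depth = 0;
	unsigned long ret = 0, func = 0;

	push_return_trace(0x1000, 42, 0x2000, &depth);
	pop_return_trace(&ret, &func);
	printf("ret=%#lx func=%#lx depth=%d\n", ret, func, depth);
	return 0;
}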
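And for the kernel/user.c hunks: remove_user_sysfs_dir() becomes cleanup_user_struct() and now also handles users outside init_user_ns, but the lifecycle shape is unchanged. The final put can happen in atomic context, so free_user() takes an extra reference and defers the real teardown to a work item, which re-checks the refcount under the hash lock before freeing. A condensed, hypothetical sketch of that shape (demo_* names are made up; the real code additionally removes the uid hash entry, sysfs objects and keys):

/*
 * Sketch of "defer teardown to a work item" as used by free_user() /
 * cleanup_user_struct(). Illustrative only.
 */
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <asm/atomic.h>

struct demo_obj {
	atomic_t		count;
	struct work_struct	work;
};

static DEFINE_SPINLOCK(demo_hash_lock);

static void demo_cleanup(struct work_struct *w)
{
	struct demo_obj *obj = container_of(w, struct demo_obj, work);
	unsigned long flags;

	/* Drop the reference taken in demo_free() and re-check under the
	 * lock: someone may have looked the object up and re-referenced it
	 * in the meantime. */
	spin_lock_irqsave(&demo_hash_lock, flags);
	if (!atomic_dec_and_test(&obj->count)) {
		spin_unlock_irqrestore(&demo_hash_lock, flags);
		return;
	}
	/* ... would unhash the object here ... */
	spin_unlock_irqrestore(&demo_hash_lock, flags);

	kfree(obj);	/* sleeping teardown (sysfs, keys, ...) is safe here */
}

/* Called with demo_hash_lock held, after the count dropped to zero. */
static void demo_free(struct demo_obj *obj, unsigned long flags)
{
	atomic_inc(&obj->count);	/* keep it alive for the work item */
	spin_unlock_irqrestore(&demo_hash_lock, flags);

	INIT_WORK(&obj->work, demo_cleanup);
	schedule_work(&obj->work);
}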