Diffstat (limited to 'kernel')
50 files changed, 4465 insertions, 2452 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index cb41b9547c9..6c07f30fa9b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o  obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o  obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o  obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smpboot.o  ifneq ($(CONFIG_SMP),y)  obj-y += up.o  endif diff --git a/kernel/auditsc.c b/kernel/auditsc.c index af1de0f34ea..4b96415527b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -67,6 +67,7 @@  #include <linux/syscalls.h>  #include <linux/capability.h>  #include <linux/fs_struct.h> +#include <linux/compat.h>  #include "audit.h" @@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr)  	audit_log_end(ab);  } -void __audit_seccomp(unsigned long syscall) +void __audit_seccomp(unsigned long syscall, long signr, int code)  {  	struct audit_buffer *ab;  	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); -	audit_log_abend(ab, "seccomp", SIGKILL); +	audit_log_abend(ab, "seccomp", signr);  	audit_log_format(ab, " syscall=%ld", syscall); +	audit_log_format(ab, " compat=%d", is_compat_task()); +	audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); +	audit_log_format(ab, " code=0x%x", code);  	audit_log_end(ab);  } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ed64ccac67c..ad8eae5bb80 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -60,9 +60,13 @@  #include <linux/eventfd.h>  #include <linux/poll.h>  #include <linux/flex_array.h> /* used in cgroup_attach_proc */ +#include <linux/kthread.h>  #include <linux/atomic.h> +/* css deactivation bias, makes css->refcnt negative to deny new trygets */ +#define CSS_DEACT_BIAS		INT_MIN +  /*   * cgroup_mutex is the master lock.  Any modification to cgroup or its   * hierarchy must be performed while holding it. @@ -127,6 +131,9 @@ struct cgroupfs_root {  	/* A list running through the active hierarchies */  	struct list_head root_list; +	/* All cgroups on this root, cgroup_mutex protected */ +	struct list_head allcg_list; +  	/* Hierarchy-specific flags */  	unsigned long flags; @@ -145,6 +152,15 @@ struct cgroupfs_root {  static struct cgroupfs_root rootnode;  /* + * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. + */ +struct cfent { +	struct list_head		node; +	struct dentry			*dentry; +	struct cftype			*type; +}; + +/*   * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when   * cgroup_subsys->use_id != 0.   */ @@ -239,6 +255,14 @@ int cgroup_lock_is_held(void)  EXPORT_SYMBOL_GPL(cgroup_lock_is_held); +/* the current nr of refs, always >= 0 whether @css is deactivated or not */ +static int css_refcnt(struct cgroup_subsys_state *css) +{ +	int v = atomic_read(&css->refcnt); + +	return v >= 0 ? v : v - CSS_DEACT_BIAS; +} +  /* convenient tests for these bits */  inline int cgroup_is_removed(const struct cgroup *cgrp)  { @@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)  #define for_each_active_root(_root) \  list_for_each_entry(_root, &roots, root_list) +static inline struct cgroup *__d_cgrp(struct dentry *dentry) +{ +	return dentry->d_fsdata; +} + +static inline struct cfent *__d_cfe(struct dentry *dentry) +{ +	return dentry->d_fsdata; +} + +static inline struct cftype *__d_cft(struct dentry *dentry) +{ +	return __d_cfe(dentry)->type; +} +  /* the list of cgroups eligible for automatic release. 
Protected by   * release_list_lock */  static LIST_HEAD(release_list); @@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)  	struct cgroup_subsys *ss;  	int ret = 0; -	for_each_subsys(cgrp->root, ss) -		if (ss->pre_destroy) { -			ret = ss->pre_destroy(cgrp); -			if (ret) -				break; +	for_each_subsys(cgrp->root, ss) { +		if (!ss->pre_destroy) +			continue; + +		ret = ss->pre_destroy(cgrp); +		if (ret) { +			/* ->pre_destroy() failure is being deprecated */ +			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); +			break;  		} +	}  	return ret;  } @@ -864,6 +908,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)  		BUG_ON(!list_empty(&cgrp->pidlists));  		kfree_rcu(cgrp, rcu_head); +	} else { +		struct cfent *cfe = __d_cfe(dentry); +		struct cgroup *cgrp = dentry->d_parent->d_fsdata; + +		WARN_ONCE(!list_empty(&cfe->node) && +			  cgrp != &cgrp->root->top_cgroup, +			  "cfe still linked for %s\n", cfe->type->name); +		kfree(cfe);  	}  	iput(inode);  } @@ -882,34 +934,36 @@ static void remove_dir(struct dentry *d)  	dput(parent);  } -static void cgroup_clear_directory(struct dentry *dentry) +static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)  { -	struct list_head *node; +	struct cfent *cfe; + +	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); +	lockdep_assert_held(&cgroup_mutex); -	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); -	spin_lock(&dentry->d_lock); -	node = dentry->d_subdirs.next; -	while (node != &dentry->d_subdirs) { -		struct dentry *d = list_entry(node, struct dentry, d_u.d_child); +	list_for_each_entry(cfe, &cgrp->files, node) { +		struct dentry *d = cfe->dentry; -		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); -		list_del_init(node); -		if (d->d_inode) { -			/* This should never be called on a cgroup -			 * directory with child cgroups */ -			BUG_ON(d->d_inode->i_mode & S_IFDIR); -			dget_dlock(d); -			spin_unlock(&d->d_lock); -			spin_unlock(&dentry->d_lock); -			d_delete(d); -			simple_unlink(dentry->d_inode, d); -			dput(d); -			spin_lock(&dentry->d_lock); -		} else -			spin_unlock(&d->d_lock); -		node = dentry->d_subdirs.next; +		if (cft && cfe->type != cft) +			continue; + +		dget(d); +		d_delete(d); +		simple_unlink(d->d_inode, d); +		list_del_init(&cfe->node); +		dput(d); + +		return 0;  	} -	spin_unlock(&dentry->d_lock); +	return -ENOENT; +} + +static void cgroup_clear_directory(struct dentry *dir) +{ +	struct cgroup *cgrp = __d_cgrp(dir); + +	while (!list_empty(&cgrp->files)) +		cgroup_rm_file(cgrp, NULL);  }  /* @@ -1294,6 +1348,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  	if (ret)  		goto out_unlock; +	/* See feature-removal-schedule.txt */ +	if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) +		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", +			   task_tgid_nr(current), current->comm); +  	/* Don't allow flags or name to change at remount */  	if (opts.flags != root->flags ||  	    (opts.name && strcmp(opts.name, root->name))) { @@ -1308,7 +1367,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  		goto out_unlock;  	} -	/* (re)populate subsystem files */ +	/* clear out any existing files and repopulate subsystem files */ +	cgroup_clear_directory(cgrp->dentry);  	cgroup_populate_dir(cgrp);  	if (opts.release_agent) @@ -1333,6 +1393,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)  {  	INIT_LIST_HEAD(&cgrp->sibling);  	
INIT_LIST_HEAD(&cgrp->children); +	INIT_LIST_HEAD(&cgrp->files);  	INIT_LIST_HEAD(&cgrp->css_sets);  	INIT_LIST_HEAD(&cgrp->release_list);  	INIT_LIST_HEAD(&cgrp->pidlists); @@ -1344,11 +1405,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)  static void init_cgroup_root(struct cgroupfs_root *root)  {  	struct cgroup *cgrp = &root->top_cgroup; +  	INIT_LIST_HEAD(&root->subsys_list);  	INIT_LIST_HEAD(&root->root_list); +	INIT_LIST_HEAD(&root->allcg_list);  	root->number_of_cgroups = 1;  	cgrp->root = root;  	cgrp->top_cgroup = cgrp; +	list_add_tail(&cgrp->allcg_node, &root->allcg_list);  	init_cgroup_housekeeping(cgrp);  } @@ -1692,16 +1756,6 @@ static struct file_system_type cgroup_fs_type = {  static struct kobject *cgroup_kobj; -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ -	return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ -	return dentry->d_fsdata; -} -  /**   * cgroup_path - generate the path of a cgroup   * @cgrp: the cgroup in question @@ -2172,6 +2226,18 @@ retry_find_task:  	if (threadgroup)  		tsk = tsk->group_leader; + +	/* +	 * Workqueue threads may acquire PF_THREAD_BOUND and become +	 * trapped in a cpuset, or RT worker may be born in a cgroup +	 * with no rt_runtime allocated.  Just say no. +	 */ +	if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { +		ret = -EINVAL; +		rcu_read_unlock(); +		goto out_unlock_cgroup; +	} +  	get_task_struct(tsk);  	rcu_read_unlock(); @@ -2603,50 +2669,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)  	return mode;  } -int cgroup_add_file(struct cgroup *cgrp, -		       struct cgroup_subsys *subsys, -		       const struct cftype *cft) +static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, +			   const struct cftype *cft)  {  	struct dentry *dir = cgrp->dentry; +	struct cgroup *parent = __d_cgrp(dir);  	struct dentry *dentry; +	struct cfent *cfe;  	int error;  	umode_t mode; -  	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; + +	/* does @cft->flags tell us to skip creation on @cgrp? 
*/ +	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) +		return 0; +	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) +		return 0; +  	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {  		strcpy(name, subsys->name);  		strcat(name, ".");  	}  	strcat(name, cft->name); +  	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); + +	cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); +	if (!cfe) +		return -ENOMEM; +  	dentry = lookup_one_len(name, dir, strlen(name)); -	if (!IS_ERR(dentry)) { -		mode = cgroup_file_mode(cft); -		error = cgroup_create_file(dentry, mode | S_IFREG, -						cgrp->root->sb); -		if (!error) -			dentry->d_fsdata = (void *)cft; -		dput(dentry); -	} else +	if (IS_ERR(dentry)) {  		error = PTR_ERR(dentry); +		goto out; +	} + +	mode = cgroup_file_mode(cft); +	error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); +	if (!error) { +		cfe->type = (void *)cft; +		cfe->dentry = dentry; +		dentry->d_fsdata = cfe; +		list_add_tail(&cfe->node, &parent->files); +		cfe = NULL; +	} +	dput(dentry); +out: +	kfree(cfe);  	return error;  } -EXPORT_SYMBOL_GPL(cgroup_add_file); -int cgroup_add_files(struct cgroup *cgrp, -			struct cgroup_subsys *subsys, -			const struct cftype cft[], -			int count) +static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, +			      const struct cftype cfts[], bool is_add)  { -	int i, err; -	for (i = 0; i < count; i++) { -		err = cgroup_add_file(cgrp, subsys, &cft[i]); -		if (err) -			return err; +	const struct cftype *cft; +	int err, ret = 0; + +	for (cft = cfts; cft->name[0] != '\0'; cft++) { +		if (is_add) +			err = cgroup_add_file(cgrp, subsys, cft); +		else +			err = cgroup_rm_file(cgrp, cft); +		if (err) { +			pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", +				   is_add ? "add" : "remove", cft->name, err); +			ret = err; +		}  	} +	return ret; +} + +static DEFINE_MUTEX(cgroup_cft_mutex); + +static void cgroup_cfts_prepare(void) +	__acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) +{ +	/* +	 * Thanks to the entanglement with vfs inode locking, we can't walk +	 * the existing cgroups under cgroup_mutex and create files. +	 * Instead, we increment reference on all cgroups and build list of +	 * them using @cgrp->cft_q_node.  Grab cgroup_cft_mutex to ensure +	 * exclusive access to the field. +	 */ +	mutex_lock(&cgroup_cft_mutex); +	mutex_lock(&cgroup_mutex); +} + +static void cgroup_cfts_commit(struct cgroup_subsys *ss, +			       const struct cftype *cfts, bool is_add) +	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) +{ +	LIST_HEAD(pending); +	struct cgroup *cgrp, *n; + +	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ +	if (cfts && ss->root != &rootnode) { +		list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { +			dget(cgrp->dentry); +			list_add_tail(&cgrp->cft_q_node, &pending); +		} +	} + +	mutex_unlock(&cgroup_mutex); + +	/* +	 * All new cgroups will see @cfts update on @ss->cftsets.  Add/rm +	 * files for all cgroups which were created before. 
+	 */ +	list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { +		struct inode *inode = cgrp->dentry->d_inode; + +		mutex_lock(&inode->i_mutex); +		mutex_lock(&cgroup_mutex); +		if (!cgroup_is_removed(cgrp)) +			cgroup_addrm_files(cgrp, ss, cfts, is_add); +		mutex_unlock(&cgroup_mutex); +		mutex_unlock(&inode->i_mutex); + +		list_del_init(&cgrp->cft_q_node); +		dput(cgrp->dentry); +	} + +	mutex_unlock(&cgroup_cft_mutex); +} + +/** + * cgroup_add_cftypes - add an array of cftypes to a subsystem + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Register @cfts to @ss.  Files described by @cfts are created for all + * existing cgroups to which @ss is attached and all future cgroups will + * have them too.  This function can be called anytime whether @ss is + * attached or not. + * + * Returns 0 on successful registration, -errno on failure.  Note that this + * function currently returns 0 as long as @cfts registration is successful + * even if some file creation attempts on existing cgroups fail. + */ +int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +{ +	struct cftype_set *set; + +	set = kzalloc(sizeof(*set), GFP_KERNEL); +	if (!set) +		return -ENOMEM; + +	cgroup_cfts_prepare(); +	set->cfts = cfts; +	list_add_tail(&set->node, &ss->cftsets); +	cgroup_cfts_commit(ss, cfts, true); +  	return 0;  } -EXPORT_SYMBOL_GPL(cgroup_add_files); +EXPORT_SYMBOL_GPL(cgroup_add_cftypes); + +/** + * cgroup_rm_cftypes - remove an array of cftypes from a subsystem + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Unregister @cfts from @ss.  Files described by @cfts are removed from + * all existing cgroups to which @ss is attached and all future cgroups + * won't have them either.  This function can be called anytime whether @ss + * is attached or not. + * + * Returns 0 on successful unregistration, -ENOENT if @cfts is not + * registered with @ss. + */ +int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +{ +	struct cftype_set *set; + +	cgroup_cfts_prepare(); + +	list_for_each_entry(set, &ss->cftsets, node) { +		if (set->cfts == cfts) { +			list_del_init(&set->node); +			cgroup_cfts_commit(ss, cfts, false); +			return 0; +		} +	} + +	cgroup_cfts_commit(ss, NULL, false); +	return -ENOENT; +}  /**   * cgroup_task_count - count the number of tasks in a cgroup. 
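[Editor's note, not part of the patch] The hunk above introduces cgroup_add_cftypes()/cgroup_rm_cftypes(), which take a cftype array terminated by an entry with an empty name. A rough sketch of how a hypothetical subsystem could register such an array at runtime; my_subsys, my_files and my_value_read are made-up names used only for illustration:

/* Illustrative sketch: registering control files with the new API.
 * Built-in controllers in this series instead point ->base_cftypes at
 * such an array and let cgroup_init_cftsets() pick it up. */
#include <linux/cgroup.h>

/* hypothetical subsystem defined elsewhere */
extern struct cgroup_subsys my_subsys;

static u64 my_value_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 42;			/* placeholder value */
}

static struct cftype my_files[] = {
	{
		.name = "my.value",
		.flags = CFTYPE_NOT_ON_ROOT,	/* not created in the root cgroup */
		.read_u64 = my_value_read,
	},
	{ }	/* empty name terminates the array */
};

static int my_files_init(void)
{
	/* files appear in all existing and future cgroups of the
	 * hierarchy my_subsys is attached to */
	return cgroup_add_cftypes(&my_subsys, my_files);
}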
@@ -3625,13 +3832,14 @@ static struct cftype files[] = {  		.read_u64 = cgroup_clone_children_read,  		.write_u64 = cgroup_clone_children_write,  	}, -}; - -static struct cftype cft_release_agent = { -	.name = "release_agent", -	.read_seq_string = cgroup_release_agent_show, -	.write_string = cgroup_release_agent_write, -	.max_write_len = PATH_MAX, +	{ +		.name = "release_agent", +		.flags = CFTYPE_ONLY_ON_ROOT, +		.read_seq_string = cgroup_release_agent_show, +		.write_string = cgroup_release_agent_write, +		.max_write_len = PATH_MAX, +	}, +	{ }	/* terminate */  };  static int cgroup_populate_dir(struct cgroup *cgrp) @@ -3639,22 +3847,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp)  	int err;  	struct cgroup_subsys *ss; -	/* First clear out any existing files */ -	cgroup_clear_directory(cgrp->dentry); - -	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); +	err = cgroup_addrm_files(cgrp, NULL, files, true);  	if (err < 0)  		return err; -	if (cgrp == cgrp->top_cgroup) { -		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) -			return err; -	} - +	/* process cftsets of each subsystem */  	for_each_subsys(cgrp->root, ss) { -		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) -			return err; +		struct cftype_set *set; + +		list_for_each_entry(set, &ss->cftsets, node) +			cgroup_addrm_files(cgrp, ss, set->cfts, true);  	} +  	/* This cgroup is ready now */  	for_each_subsys(cgrp->root, ss) {  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; @@ -3670,6 +3874,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)  	return 0;  } +static void css_dput_fn(struct work_struct *work) +{ +	struct cgroup_subsys_state *css = +		container_of(work, struct cgroup_subsys_state, dput_work); + +	dput(css->cgroup->dentry); +} +  static void init_cgroup_css(struct cgroup_subsys_state *css,  			       struct cgroup_subsys *ss,  			       struct cgroup *cgrp) @@ -3682,6 +3894,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,  		set_bit(CSS_ROOT, &css->flags);  	BUG_ON(cgrp->subsys[ss->subsys_id]);  	cgrp->subsys[ss->subsys_id] = css; + +	/* +	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry +	 * which is put on the last css_put().  dput() requires process +	 * context, which css_put() may be called without.  @css->dput_work +	 * will be used to invoke dput() asynchronously from css_put(). +	 */ +	INIT_WORK(&css->dput_work, css_dput_fn); +	if (ss->__DEPRECATED_clear_css_refs) +		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);  }  static void cgroup_lock_hierarchy(struct cgroupfs_root *root) @@ -3784,9 +4006,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	if (err < 0)  		goto err_remove; +	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ +	for_each_subsys(root, ss) +		if (!ss->__DEPRECATED_clear_css_refs) +			dget(dentry); +  	/* The cgroup directory was pre-locked for us */  	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); +	list_add_tail(&cgrp->allcg_node, &root->allcg_list); +  	err = cgroup_populate_dir(cgrp);  	/* If err < 0, we have a half-filled directory - oh well ;) */ @@ -3826,18 +4055,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  	return cgroup_create(c_parent, dentry, mode | S_IFDIR);  } +/* + * Check the reference count on each subsystem. 
Since we already + * established that there are no tasks in the cgroup, if the css refcount + * is also 1, then there should be no outstanding references, so the + * subsystem is safe to destroy. We scan across all subsystems rather than + * using the per-hierarchy linked list of mounted subsystems since we can + * be called via check_for_release() with no synchronization other than + * RCU, and the subsystem linked list isn't RCU-safe. + */  static int cgroup_has_css_refs(struct cgroup *cgrp)  { -	/* Check the reference count on each subsystem. Since we -	 * already established that there are no tasks in the -	 * cgroup, if the css refcount is also 1, then there should -	 * be no outstanding references, so the subsystem is safe to -	 * destroy. We scan across all subsystems rather than using -	 * the per-hierarchy linked list of mounted subsystems since -	 * we can be called via check_for_release() with no -	 * synchronization other than RCU, and the subsystem linked -	 * list isn't RCU-safe */  	int i; +  	/*  	 * We won't need to lock the subsys array, because the subsystems  	 * we're concerned about aren't going anywhere since our cgroup root @@ -3846,17 +4076,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  		struct cgroup_subsys *ss = subsys[i];  		struct cgroup_subsys_state *css; +  		/* Skip subsystems not present or not in this hierarchy */  		if (ss == NULL || ss->root != cgrp->root)  			continue; +  		css = cgrp->subsys[ss->subsys_id]; -		/* When called from check_for_release() it's possible +		/* +		 * When called from check_for_release() it's possible  		 * that by this point the cgroup has been removed  		 * and the css deleted. But a false-positive doesn't  		 * matter, since it can only happen if the cgroup  		 * has been deleted and hence no longer needs the -		 * release agent to be called anyway. */ -		if (css && (atomic_read(&css->refcnt) > 1)) +		 * release agent to be called anyway. +		 */ +		if (css && css_refcnt(css) > 1)  			return 1;  	}  	return 0; @@ -3866,51 +4100,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)   * Atomically mark all (or else none) of the cgroup's CSS objects as   * CSS_REMOVED. Return true on success, or false if the cgroup has   * busy subsystems. Call with cgroup_mutex held + * + * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or + * not, cgroup removal behaves differently. + * + * If clear is set, css refcnt for the subsystem should be zero before + * cgroup removal can be committed.  This is implemented by + * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be + * called multiple times until all css refcnts reach zero and is allowed to + * veto removal on any invocation.  This behavior is deprecated and will be + * removed as soon as the existing user (memcg) is updated. + * + * If clear is not set, each css holds an extra reference to the cgroup's + * dentry and cgroup removal proceeds regardless of css refs. + * ->pre_destroy() will be called at least once and is not allowed to fail. + * On the last put of each css, whenever that may be, the extra dentry ref + * is put so that dentry destruction happens only after all css's are + * released.   */ -  static int cgroup_clear_css_refs(struct cgroup *cgrp)  {  	struct cgroup_subsys *ss;  	unsigned long flags;  	bool failed = false; +  	local_irq_save(flags); + +	/* +	 * Block new css_tryget() by deactivating refcnt.  
If all refcnts +	 * for subsystems w/ clear_css_refs set were 1 at the moment of +	 * deactivation, we succeeded. +	 */  	for_each_subsys(cgrp->root, ss) {  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; -		int refcnt; -		while (1) { -			/* We can only remove a CSS with a refcnt==1 */ -			refcnt = atomic_read(&css->refcnt); -			if (refcnt > 1) { -				failed = true; -				goto done; -			} -			BUG_ON(!refcnt); -			/* -			 * Drop the refcnt to 0 while we check other -			 * subsystems. This will cause any racing -			 * css_tryget() to spin until we set the -			 * CSS_REMOVED bits or abort -			 */ -			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) -				break; -			cpu_relax(); -		} + +		WARN_ON(atomic_read(&css->refcnt) < 0); +		atomic_add(CSS_DEACT_BIAS, &css->refcnt); + +		if (ss->__DEPRECATED_clear_css_refs) +			failed |= css_refcnt(css) != 1;  	} - done: + +	/* +	 * If succeeded, set REMOVED and put all the base refs; otherwise, +	 * restore refcnts to positive values.  Either way, all in-progress +	 * css_tryget() will be released. +	 */  	for_each_subsys(cgrp->root, ss) {  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; -		if (failed) { -			/* -			 * Restore old refcnt if we previously managed -			 * to clear it from 1 to 0 -			 */ -			if (!atomic_read(&css->refcnt)) -				atomic_set(&css->refcnt, 1); -		} else { -			/* Commit the fact that the CSS is removed */ + +		if (!failed) {  			set_bit(CSS_REMOVED, &css->flags); +			css_put(css); +		} else { +			atomic_sub(CSS_DEACT_BIAS, &css->refcnt);  		}  	} +  	local_irq_restore(flags);  	return !failed;  } @@ -3995,6 +4241,8 @@ again:  	list_del_init(&cgrp->sibling);  	cgroup_unlock_hierarchy(cgrp->root); +	list_del_init(&cgrp->allcg_node); +  	d = dget(cgrp->dentry);  	cgroup_d_remove_dir(d); @@ -4021,12 +4269,29 @@ again:  	return 0;  } +static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) +{ +	INIT_LIST_HEAD(&ss->cftsets); + +	/* +	 * base_cftset is embedded in subsys itself, no need to worry about +	 * deregistration. +	 */ +	if (ss->base_cftypes) { +		ss->base_cftset.cfts = ss->base_cftypes; +		list_add_tail(&ss->base_cftset.node, &ss->cftsets); +	} +} +  static void __init cgroup_init_subsys(struct cgroup_subsys *ss)  {  	struct cgroup_subsys_state *css;  	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); +	/* init base cftset */ +	cgroup_init_cftsets(ss); +  	/* Create the top cgroup state for this subsystem */  	list_add(&ss->sibling, &rootnode.subsys_list);  	ss->root = &rootnode; @@ -4096,6 +4361,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  		return 0;  	} +	/* init base cftset */ +	cgroup_init_cftsets(ss); +  	/*  	 * need to register a subsys id before anything else - for example,  	 * init_cgroup_css needs it. 
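[Editor's note, not part of the patch] The refcount handling above relies on the CSS_DEACT_BIAS (INT_MIN) bias defined near the top of the file: once the bias is applied the raw counter goes negative, so new css_tryget() attempts can be refused, while css_refcnt() still recovers the true count by subtracting the bias again. A standalone userspace sketch of that arithmetic (illustrative only, not kernel code):

/* Demonstrates the biased-refcount arithmetic used by css_refcnt():
 * the raw counter is negative while deactivated, but the real count
 * is still recoverable by subtracting the bias. */
#include <limits.h>
#include <stdio.h>

#define CSS_DEACT_BIAS	INT_MIN

static int demo_refcnt(int raw)
{
	return raw >= 0 ? raw : raw - CSS_DEACT_BIAS;	/* mirrors css_refcnt() */
}

int main(void)
{
	int raw = 3;				/* active css holding three refs */

	printf("active:      raw=%d refcnt=%d\n", raw, demo_refcnt(raw));
	raw += CSS_DEACT_BIAS;			/* deactivate: raw goes negative */
	printf("deactivated: raw=%d refcnt=%d\n", raw, demo_refcnt(raw));
	return 0;
}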
@@ -4685,21 +4953,41 @@ static void check_for_release(struct cgroup *cgrp)  }  /* Caller must verify that the css is not for root cgroup */ -void __css_put(struct cgroup_subsys_state *css, int count) +bool __css_tryget(struct cgroup_subsys_state *css) +{ +	do { +		int v = css_refcnt(css); + +		if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) +			return true; +		cpu_relax(); +	} while (!test_bit(CSS_REMOVED, &css->flags)); + +	return false; +} +EXPORT_SYMBOL_GPL(__css_tryget); + +/* Caller must verify that the css is not for root cgroup */ +void __css_put(struct cgroup_subsys_state *css)  {  	struct cgroup *cgrp = css->cgroup; -	int val; +  	rcu_read_lock(); -	val = atomic_sub_return(count, &css->refcnt); -	if (val == 1) { +	atomic_dec(&css->refcnt); +	switch (css_refcnt(css)) { +	case 1:  		if (notify_on_release(cgrp)) {  			set_bit(CGRP_RELEASABLE, &cgrp->flags);  			check_for_release(cgrp);  		}  		cgroup_wakeup_rmdir_waiter(cgrp); +		break; +	case 0: +		if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) +			schedule_work(&css->dput_work); +		break;  	}  	rcu_read_unlock(); -	WARN_ON_ONCE(val < 1);  }  EXPORT_SYMBOL_GPL(__css_put); @@ -4818,7 +5106,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)  	 * on this or this is under rcu_read_lock(). Once css->id is allocated,  	 * it's unchanged until freed.  	 */ -	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); +	cssid = rcu_dereference_check(css->id, css_refcnt(css));  	if (cssid)  		return cssid->id; @@ -4830,7 +5118,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)  {  	struct css_id *cssid; -	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); +	cssid = rcu_dereference_check(css->id, css_refcnt(css));  	if (cssid)  		return cssid->depth; @@ -5211,19 +5499,15 @@ static struct cftype debug_files[] =  {  		.name = "releasable",  		.read_u64 = releasable_read,  	}, -}; -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ -	return cgroup_add_files(cont, ss, debug_files, -				ARRAY_SIZE(debug_files)); -} +	{ }	/* terminate */ +};  struct cgroup_subsys debug_subsys = {  	.name = "debug",  	.create = debug_create,  	.destroy = debug_destroy, -	.populate = debug_populate,  	.subsys_id = debug_subsys_id, +	.base_cftypes = debug_files,  };  #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f86e93920b6..3649fc6b3ea 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,  static struct cftype files[] = {  	{  		.name = "state", +		.flags = CFTYPE_NOT_ON_ROOT,  		.read_seq_string = freezer_read,  		.write_string = freezer_write,  	}, +	{ }	/* terminate */  }; -static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) -{ -	if (!cgroup->parent) -		return 0; -	return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); -} -  struct cgroup_subsys freezer_subsys = {  	.name		= "freezer",  	.create		= freezer_create,  	.destroy	= freezer_destroy, -	.populate	= freezer_populate,  	.subsys_id	= freezer_subsys_id,  	.can_attach	= freezer_can_attach,  	.fork		= freezer_fork, +	.base_cftypes	= files,  }; diff --git a/kernel/cpu.c b/kernel/cpu.c index 2060c6e5702..0e6353cf147 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -17,6 +17,8 @@  #include <linux/gfp.h>  #include <linux/suspend.h> +#include "smpboot.h" +  #ifdef CONFIG_SMP  /* Serializes the updates to cpu_online_mask, cpu_present_mask */  static 
DEFINE_MUTEX(cpu_add_remove_lock); @@ -295,11 +297,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)  	int ret, nr_calls = 0;  	void *hcpu = (void *)(long)cpu;  	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; +	struct task_struct *idle;  	if (cpu_online(cpu) || !cpu_present(cpu))  		return -EINVAL;  	cpu_hotplug_begin(); + +	idle = idle_thread_get(cpu); +	if (IS_ERR(idle)) { +		ret = PTR_ERR(idle); +		goto out; +	} +  	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);  	if (ret) {  		nr_calls--; @@ -309,7 +319,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)  	}  	/* Arch-specific enabling code. */ -	ret = __cpu_up(cpu); +	ret = __cpu_up(cpu, idle);  	if (ret != 0)  		goto out_notify;  	BUG_ON(!cpu_online(cpu)); @@ -320,6 +330,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)  out_notify:  	if (ret != 0)  		__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); +out:  	cpu_hotplug_done();  	return ret; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 14f7070b4ba..8c8bd652dd1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1765,28 +1765,17 @@ static struct cftype files[] = {  		.write_u64 = cpuset_write_u64,  		.private = FILE_SPREAD_SLAB,  	}, -}; - -static struct cftype cft_memory_pressure_enabled = { -	.name = "memory_pressure_enabled", -	.read_u64 = cpuset_read_u64, -	.write_u64 = cpuset_write_u64, -	.private = FILE_MEMORY_PRESSURE_ENABLED, -}; -static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ -	int err; +	{ +		.name = "memory_pressure_enabled", +		.flags = CFTYPE_ONLY_ON_ROOT, +		.read_u64 = cpuset_read_u64, +		.write_u64 = cpuset_write_u64, +		.private = FILE_MEMORY_PRESSURE_ENABLED, +	}, -	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); -	if (err) -		return err; -	/* memory_pressure_enabled is in root cpuset only */ -	if (!cont->parent) -		err = cgroup_add_file(cont, ss, -				      &cft_memory_pressure_enabled); -	return err; -} +	{ }	/* terminate */ +};  /*   * post_clone() is called during cgroup_create() when the @@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = {  	.destroy = cpuset_destroy,  	.can_attach = cpuset_can_attach,  	.attach = cpuset_attach, -	.populate = cpuset_populate,  	.post_clone = cpuset_post_clone,  	.subsys_id = cpuset_subsys_id, +	.base_cftypes = files,  	.early_init = 1,  }; diff --git a/kernel/events/core.c b/kernel/events/core.c index fd126f82b57..91a44592585 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2039,8 +2039,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,   * accessing the event control register. If a NMI hits, then it will   * not restart the event.   */ -void __perf_event_task_sched_out(struct task_struct *task, -				 struct task_struct *next) +static void __perf_event_task_sched_out(struct task_struct *task, +					struct task_struct *next)  {  	int ctxn; @@ -2279,8 +2279,8 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,   * accessing the event control register. If a NMI hits, then it will   * keep the event running.   
*/ -void __perf_event_task_sched_in(struct task_struct *prev, -				struct task_struct *task) +static void __perf_event_task_sched_in(struct task_struct *prev, +				       struct task_struct *task)  {  	struct perf_event_context *ctx;  	int ctxn; @@ -2305,6 +2305,12 @@ void __perf_event_task_sched_in(struct task_struct *prev,  		perf_branch_stack_sched_in(prev, task);  } +void __perf_event_task_sched(struct task_struct *prev, struct task_struct *next) +{ +	__perf_event_task_sched_out(prev, next); +	__perf_event_task_sched_in(prev, next); +} +  static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)  {  	u64 frequency = event->attr.sample_freq; @@ -4957,7 +4963,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)  	if (rctx < 0)  		return; -	perf_sample_data_init(&data, addr); +	perf_sample_data_init(&data, addr, 0);  	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); @@ -5215,7 +5221,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,  		.data = record,  	}; -	perf_sample_data_init(&data, addr); +	perf_sample_data_init(&data, addr, 0);  	data.raw = &raw;  	hlist_for_each_entry_rcu(event, node, head, hlist_entry) { @@ -5318,7 +5324,7 @@ void perf_bp_event(struct perf_event *bp, void *data)  	struct perf_sample_data sample;  	struct pt_regs *regs = data; -	perf_sample_data_init(&sample, bp->attr.bp_addr); +	perf_sample_data_init(&sample, bp->attr.bp_addr, 0);  	if (!bp->hw.state && !perf_exclude_event(bp, regs))  		perf_swevent_event(bp, 1, &sample, regs); @@ -5344,13 +5350,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)  	event->pmu->read(event); -	perf_sample_data_init(&data, 0); -	data.period = event->hw.last_period; +	perf_sample_data_init(&data, 0, event->hw.last_period);  	regs = get_irq_regs();  	if (regs && !perf_exclude_event(event, regs)) {  		if (!(event->attr.exclude_idle && is_idle_task(current))) -			if (perf_event_overflow(event, &data, regs)) +			if (__perf_event_overflow(event, 1, &data, regs))  				ret = HRTIMER_NORESTART;  	} diff --git a/kernel/fork.c b/kernel/fork.c index 687a15d5624..ad54c833116 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -34,6 +34,7 @@  #include <linux/cgroup.h>  #include <linux/security.h>  #include <linux/hugetlb.h> +#include <linux/seccomp.h>  #include <linux/swap.h>  #include <linux/syscalls.h>  #include <linux/jiffies.h> @@ -112,32 +113,67 @@ int nr_processes(void)  	return total;  } -#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct_node(node)		\ -		kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) -# define free_task_struct(tsk)			\ -		kmem_cache_free(task_struct_cachep, (tsk)) +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR  static struct kmem_cache *task_struct_cachep; + +static inline struct task_struct *alloc_task_struct_node(int node) +{ +	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); +} + +void __weak arch_release_task_struct(struct task_struct *tsk) { } + +static inline void free_task_struct(struct task_struct *tsk) +{ +	arch_release_task_struct(tsk); +	kmem_cache_free(task_struct_cachep, tsk); +}  #endif -#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +void __weak arch_release_thread_info(struct thread_info *ti) { } + +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. 
+ */ +# if THREAD_SIZE >= PAGE_SIZE  static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,  						  int node)  { -#ifdef CONFIG_DEBUG_STACK_USAGE -	gfp_t mask = GFP_KERNEL | __GFP_ZERO; -#else -	gfp_t mask = GFP_KERNEL; -#endif -	struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); +	struct page *page = alloc_pages_node(node, THREADINFO_GFP, +					     THREAD_SIZE_ORDER);  	return page ? page_address(page) : NULL;  }  static inline void free_thread_info(struct thread_info *ti)  { +	arch_release_thread_info(ti);  	free_pages((unsigned long)ti, THREAD_SIZE_ORDER);  } +# else +static struct kmem_cache *thread_info_cache; + +static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +						  int node) +{ +	return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); +} + +static void free_thread_info(struct thread_info *ti) +{ +	arch_release_thread_info(ti); +	kmem_cache_free(thread_info_cache, ti); +} + +void thread_info_cache_init(void) +{ +	thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, +					      THREAD_SIZE, 0, NULL); +	BUG_ON(thread_info_cache == NULL); +} +# endif  #endif  /* SLAB cache for signal_struct structures (tsk->signal) */ @@ -171,6 +207,7 @@ void free_task(struct task_struct *tsk)  	free_thread_info(tsk->stack);  	rt_mutex_debug_task_free(tsk);  	ftrace_graph_exit_task(tsk); +	put_seccomp_filter(tsk);  	free_task_struct(tsk);  }  EXPORT_SYMBOL(free_task); @@ -204,17 +241,11 @@ void __put_task_struct(struct task_struct *tsk)  }  EXPORT_SYMBOL_GPL(__put_task_struct); -/* - * macro override instead of weak attribute alias, to workaround - * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. - */ -#ifndef arch_task_cache_init -#define arch_task_cache_init() -#endif +void __init __weak arch_task_cache_init(void) { }  void __init fork_init(unsigned long mempages)  { -#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR  #ifndef ARCH_MIN_TASKALIGN  #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES  #endif @@ -1163,6 +1194,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  		goto fork_out;  	ftrace_graph_init_task(p); +	get_seccomp_filter(p);  	rt_mutex_init_task(p); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c21449f85a2..6df614912b9 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)  	touch_nmi_watchdog(); -	if (sysctl_hung_task_panic) +	if (sysctl_hung_task_panic) { +		trigger_all_cpu_backtrace();  		panic("hung_task: blocked tasks"); +	}  }  /* diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3914c1e03cf..fc275e4f629 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -379,8 +379,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)  	 * If its disabled or no action available  	 * keep it masked and get out of here  	 */ -	if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) +	if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { +		desc->istate |= IRQS_PENDING;  		goto out_unlock; +	}  	handle_irq_event(desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9b7f68a00e5..bb32326afe8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -565,8 +565,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,  		 * IRQF_TRIGGER_* but the PIC does not support multiple  		 * flow-types?  		 
*/ -		pr_debug("No set_type function for IRQ %d (%s)\n", irq, -				chip ? (chip->name ? : "unknown") : "unknown"); +		pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq, +			 chip ? (chip->name ? : "unknown") : "unknown");  		return 0;  	} @@ -600,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,  		ret = 0;  		break;  	default: -		pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", +		pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n",  		       flags, irq, chip->irq_set_type);  	}  	if (unmask) @@ -837,8 +837,7 @@ void exit_irq_thread(void)  	action = kthread_data(tsk); -	printk(KERN_ERR -	       "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", +	pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",  	       tsk->comm ? tsk->comm : "", tsk->pid, action->irq);  	desc = irq_to_desc(action->irq); @@ -878,7 +877,6 @@ static int  __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  {  	struct irqaction *old, **old_ptr; -	const char *old_name = NULL;  	unsigned long flags, thread_mask = 0;  	int ret, nested, shared = 0;  	cpumask_var_t mask; @@ -972,10 +970,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  		 */  		if (!((old->flags & new->flags) & IRQF_SHARED) ||  		    ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || -		    ((old->flags ^ new->flags) & IRQF_ONESHOT)) { -			old_name = old->name; +		    ((old->flags ^ new->flags) & IRQF_ONESHOT))  			goto mismatch; -		}  		/* All handlers must agree on per-cpuness */  		if ((old->flags & IRQF_PERCPU) != @@ -1031,6 +1027,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  		 * all existing action->thread_mask bits.  		 */  		new->thread_mask = 1 << ffz(thread_mask); + +	} else if (new->handler == irq_default_primary_handler) { +		/* +		 * The interrupt was requested with handler = NULL, so +		 * we use the default primary handler for it. But it +		 * does not have the oneshot flag set. In combination +		 * with level interrupts this is deadly, because the +		 * default primary handler just wakes the thread, then +		 * the irq lines is reenabled, but the device still +		 * has the level irq asserted. Rinse and repeat.... +		 * +		 * While this works for edge type interrupts, we play +		 * it safe and reject unconditionally because we can't +		 * say for sure which type this interrupt really +		 * has. The type flags are unreliable as the +		 * underlying chip implementation can override them. +		 */ +		pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", +		       irq); +		ret = -EINVAL; +		goto out_mask;  	}  	if (!shared) { @@ -1078,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  		if (nmsk != omsk)  			/* hope the handler works with current  trigger mode */ -			pr_warning("IRQ %d uses trigger mode %u; requested %u\n", +			pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n",  				   irq, nmsk, omsk);  	} @@ -1115,14 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  	return 0;  mismatch: -#ifdef CONFIG_DEBUG_SHIRQ  	if (!(new->flags & IRQF_PROBE_SHARED)) { -		printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); -		if (old_name) -			printk(KERN_ERR "current handler: %s\n", old_name); +		pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. 
%08x (%s)\n", +		       irq, new->flags, new->name, old->flags, old->name); +#ifdef CONFIG_DEBUG_SHIRQ  		dump_stack(); -	}  #endif +	}  	ret = -EBUSY;  out_mask: diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 15e53b1766a..cb228bf2176 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -103,8 +103,13 @@ int check_wakeup_irqs(void)  	int irq;  	for_each_irq_desc(irq, desc) { +		/* +		 * Only interrupts which are marked as wakeup source +		 * and have not been disabled before the suspend check +		 * can abort suspend. +		 */  		if (irqd_is_wakeup_set(&desc->irq_data)) { -			if (desc->istate & IRQS_PENDING) +			if (desc->depth == 1 && desc->istate & IRQS_PENDING)  				return -EBUSY;  			continue;  		} diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 14dd5761e8c..6454db7b6a4 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)  	/*  	 * We do not resend level type interrupts. Level type  	 * interrupts are resent by hardware when they are still -	 * active. +	 * active. Clear the pending bit so suspend/resume does not +	 * get confused.  	 */ -	if (irq_settings_is_level(desc)) +	if (irq_settings_is_level(desc)) { +		desc->istate &= ~IRQS_PENDING;  		return; +	}  	if (desc->istate & IRQS_REPLAY)  		return;  	if (desc->istate & IRQS_PENDING) { diff --git a/kernel/module.c b/kernel/module.c index 78ac6ec1e42..a4e60973ca7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2953,7 +2953,7 @@ static struct module *load_module(void __user *umod,  	/* Module is ready to execute: parsing args may do that. */  	err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -			 -32768, 32767, NULL); +			 -32768, 32767, &ddebug_dyndbg_module_param_cb);  	if (err < 0)  		goto unlink; diff --git a/kernel/params.c b/kernel/params.c index f37d8263134..ed35345be53 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b)  static int parse_one(char *param,  		     char *val, +		     const char *doing,  		     const struct kernel_param *params,  		     unsigned num_params,  		     s16 min_level,  		     s16 max_level, -		     int (*handle_unknown)(char *param, char *val)) +		     int (*handle_unknown)(char *param, char *val, +				     const char *doing))  {  	unsigned int i;  	int err; @@ -104,8 +106,8 @@ static int parse_one(char *param,  			if (!val && params[i].ops->set != param_set_bool  			    && params[i].ops->set != param_set_bint)  				return -EINVAL; -			pr_debug("They are equal!  Calling %p\n", -			       params[i].ops->set); +			pr_debug("handling %s with %p\n", param, +				params[i].ops->set);  			mutex_lock(¶m_lock);  			err = params[i].ops->set(val, ¶ms[i]);  			mutex_unlock(¶m_lock); @@ -114,11 +116,11 @@ static int parse_one(char *param,  	}  	if (handle_unknown) { -		pr_debug("Unknown argument: calling %p\n", handle_unknown); -		return handle_unknown(param, val); +		pr_debug("doing %s: %s='%s'\n", doing, param, val); +		return handle_unknown(param, val, doing);  	} -	pr_debug("Unknown argument `%s'\n", param); +	pr_debug("Unknown argument '%s'\n", param);  	return -ENOENT;  } @@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val)  }  /* Args looks like "foo=bar,bar2 baz=fuz wiz". 
*/ -int parse_args(const char *name, +int parse_args(const char *doing,  	       char *args,  	       const struct kernel_param *params,  	       unsigned num,  	       s16 min_level,  	       s16 max_level, -	       int (*unknown)(char *param, char *val)) +	       int (*unknown)(char *param, char *val, const char *doing))  {  	char *param, *val; -	pr_debug("Parsing ARGS: %s\n", args); -  	/* Chew leading spaces */  	args = skip_spaces(args); +	if (*args) +		pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); +  	while (*args) {  		int ret;  		int irq_was_disabled;  		args = next_arg(args, ¶m, &val);  		irq_was_disabled = irqs_disabled(); -		ret = parse_one(param, val, params, num, +		ret = parse_one(param, val, doing, params, num,  				min_level, max_level, unknown); -		if (irq_was_disabled && !irqs_disabled()) { -			printk(KERN_WARNING "parse_args(): option '%s' enabled " -					"irq's!\n", param); -		} +		if (irq_was_disabled && !irqs_disabled()) +			pr_warn("%s: option '%s' enabled irq's!\n", +				doing, param); +  		switch (ret) {  		case -ENOENT: -			printk(KERN_ERR "%s: Unknown parameter `%s'\n", -			       name, param); +			pr_err("%s: Unknown parameter `%s'\n", doing, param);  			return ret;  		case -ENOSPC: -			printk(KERN_ERR -			       "%s: `%s' too large for parameter `%s'\n", -			       name, val ?: "", param); +			pr_err("%s: `%s' too large for parameter `%s'\n", +			       doing, val ?: "", param);  			return ret;  		case 0:  			break;  		default: -			printk(KERN_ERR -			       "%s: `%s' invalid for parameter `%s'\n", -			       name, val ?: "", param); +			pr_err("%s: `%s' invalid for parameter `%s'\n", +			       doing, val ?: "", param);  			return ret;  		}  	} @@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);  int param_set_charp(const char *val, const struct kernel_param *kp)  {  	if (strlen(val) > 1024) { -		printk(KERN_ERR "%s: string parameter too long\n", -		       kp->name); +		pr_err("%s: string parameter too long\n", kp->name);  		return -ENOSPC;  	} @@ -400,8 +399,7 @@ static int param_array(const char *name,  		int len;  		if (*num == max) { -			printk(KERN_ERR "%s: can only take %i arguments\n", -			       name, max); +			pr_err("%s: can only take %i arguments\n", name, max);  			return -EINVAL;  		}  		len = strcspn(val, ","); @@ -420,8 +418,7 @@ static int param_array(const char *name,  	} while (save == ',');  	if (*num < min) { -		printk(KERN_ERR "%s: needs at least %i arguments\n", -		       name, min); +		pr_err("%s: needs at least %i arguments\n", name, min);  		return -EINVAL;  	}  	return 0; @@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)  	const struct kparam_string *kps = kp->str;  	if (strlen(val)+1 > kps->maxlen) { -		printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", +		pr_err("%s: string doesn't fit in %u chars.\n",  		       kp->name, kps->maxlen-1);  		return -ENOSPC;  	} @@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name)  #endif  		if (err) {  			kobject_put(&mk->kobj); -			printk(KERN_ERR -				"Module '%s' failed add to sysfs, error number %d\n", +			pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",  				name, err); -			printk(KERN_ERR -				"The system will be unstable now.\n");  			return NULL;  		} diff --git a/kernel/printk.c b/kernel/printk.c index b663c2c95d3..32462d2b364 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -41,6 +41,7 @@  #include 
<linux/cpu.h>  #include <linux/notifier.h>  #include <linux/rculist.h> +#include <linux/poll.h>  #include <asm/uaccess.h> @@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)  {  } -#define __LOG_BUF_LEN	(1 << CONFIG_LOG_BUF_SHIFT) -  /* printk's without a loglevel use this.. */  #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers);  static int console_locked, console_suspended;  /* - * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars - * It is also used in interesting ways to provide interlocking in - * console_unlock();. - */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); - -#define LOG_BUF_MASK (log_buf_len-1) -#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) - -/* - * The indices into log_buf are not constrained to log_buf_len - they - * must be masked before subscripting - */ -static unsigned log_start;	/* Index into log_buf: next char to be read by syslog() */ -static unsigned con_start;	/* Index into log_buf: next char to be sent to consoles */ -static unsigned log_end;	/* Index into log_buf: most-recently-written-char + 1 */ - -/*   * If exclusive_console is non-NULL then only this console is to be printed to.   */  static struct console *exclusive_console; @@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline);  /* Flag: console code may call schedule() */  static int console_may_schedule; +/* + * The printk log buffer consists of a chain of concatenated variable + * length records. Every record starts with a record header, containing + * the overall length of the record. + * + * The heads to the first and last entry in the buffer, as well as the + * sequence numbers of these both entries are maintained when messages + * are stored.. + * + * If the heads indicate available messages, the length in the header + * tells the start next message. A length == 0 for the next message + * indicates a wrap-around to the beginning of the buffer. + * + * Every record carries the monotonic timestamp in microseconds, as well as + * the standard userspace syslog level and syslog facility. The usual + * kernel messages use LOG_KERN; userspace-injected messages always carry + * a matching syslog facility, by default LOG_USER. The origin of every + * message can be reliably determined that way. + * + * The human readable log message directly follows the message header. The + * length of the message text is stored in the header, the stored message + * is not terminated. + * + * Optionally, a message can carry a dictionary of properties (key/value pairs), + * to provide userspace with a machine-readable message context. + * + * Examples for well-defined, commonly used property names are: + *   DEVICE=b12:8               device identifier + *                                b12:8         block dev_t + *                                c127:3        char dev_t + *                                n8            netdev ifindex + *                                +sound:card0  subsystem:devname + *   SUBSYSTEM=pci              driver-core subsystem name + * + * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value + * follows directly after a '=' character. Every property is terminated by + * a '\0' character. The last property is not terminated. 
+ * + * Example of a message structure: + *   0000  ff 8f 00 00 00 00 00 00      monotonic time in nsec + *   0008  34 00                        record is 52 bytes long + *   000a        0b 00                  text is 11 bytes long + *   000c              1f 00            dictionary is 23 bytes long + *   000e                    03 00      LOG_KERN (facility) LOG_ERR (level) + *   0010  69 74 27 73 20 61 20 6c      "it's a l" + *         69 6e 65                     "ine" + *   001b           44 45 56 49 43      "DEVIC" + *         45 3d 62 38 3a 32 00 44      "E=b8:2\0D" + *         52 49 56 45 52 3d 62 75      "RIVER=bu" + *         67                           "g" + *   0032     00 00 00                  padding to next message header + * + * The 'struct log' buffer header must never be directly exported to + * userspace, it is a kernel-private implementation detail that might + * need to be changed in the future, when the requirements change. + * + * /dev/kmsg exports the structured data in the following line format: + *   "level,sequnum,timestamp;<message text>\n" + * + * The optional key/value pairs are attached as continuation lines starting + * with a space character and terminated by a newline. All possible + * non-prinatable characters are escaped in the "\xff" notation. + * + * Users of the export format should ignore possible additional values + * separated by ',', and find the message after the ';' character. + */ + +struct log { +	u64 ts_nsec;		/* timestamp in nanoseconds */ +	u16 len;		/* length of entire record */ +	u16 text_len;		/* length of text buffer */ +	u16 dict_len;		/* length of dictionary buffer */ +	u16 level;		/* syslog level + facility */ +}; + +/* + * The logbuf_lock protects kmsg buffer, indices, counters. It is also + * used in interesting ways to provide interlocking in console_unlock(); + */ +static DEFINE_RAW_SPINLOCK(logbuf_lock); + +/* the next printk record to read by syslog(READ) or /proc/kmsg */ +static u64 syslog_seq; +static u32 syslog_idx; + +/* index and sequence number of the first record stored in the buffer */ +static u64 log_first_seq; +static u32 log_first_idx; + +/* index and sequence number of the next record to store in the buffer */ +static u64 log_next_seq;  #ifdef CONFIG_PRINTK +static u32 log_next_idx; + +/* the next printk record to read after the last 'clear' command */ +static u64 clear_seq; +static u32 clear_idx; + +#define LOG_LINE_MAX 1024 -static char __log_buf[__LOG_BUF_LEN]; +/* record buffer */ +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#define LOG_ALIGN 4 +#else +#define LOG_ALIGN 8 +#endif +#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);  static char *log_buf = __log_buf; -static int log_buf_len = __LOG_BUF_LEN; -static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ -static int saved_console_loglevel = -1; +static u32 log_buf_len = __LOG_BUF_LEN; + +/* cpu currently holding logbuf_lock */ +static volatile unsigned int logbuf_cpu = UINT_MAX; + +/* human readable text of the record */ +static char *log_text(const struct log *msg) +{ +	return (char *)msg + sizeof(struct log); +} + +/* optional key/value pair dictionary attached to the record */ +static char *log_dict(const struct log *msg) +{ +	return (char *)msg + sizeof(struct log) + msg->text_len; +} + +/* get record by index; idx must point to valid msg */ +static struct log *log_from_idx(u32 idx) +{ +	struct log *msg = (struct log 
*)(log_buf + idx); + +	/* +	 * A length == 0 record is the end of buffer marker. Wrap around and +	 * read the message at the start of the buffer. +	 */ +	if (!msg->len) +		return (struct log *)log_buf; +	return msg; +} + +/* get next record; idx must point to valid msg */ +static u32 log_next(u32 idx) +{ +	struct log *msg = (struct log *)(log_buf + idx); + +	/* length == 0 indicates the end of the buffer; wrap */ +	/* +	 * A length == 0 record is the end of buffer marker. Wrap around and +	 * read the message at the start of the buffer as *this* one, and +	 * return the one after that. +	 */ +	if (!msg->len) { +		msg = (struct log *)log_buf; +		return msg->len; +	} +	return idx + msg->len; +} + +/* insert record into the buffer, discard old ones, update heads */ +static void log_store(int facility, int level, +		      const char *dict, u16 dict_len, +		      const char *text, u16 text_len) +{ +	struct log *msg; +	u32 size, pad_len; + +	/* number of '\0' padding bytes to next message */ +	size = sizeof(struct log) + text_len + dict_len; +	pad_len = (-size) & (LOG_ALIGN - 1); +	size += pad_len; + +	while (log_first_seq < log_next_seq) { +		u32 free; + +		if (log_next_idx > log_first_idx) +			free = max(log_buf_len - log_next_idx, log_first_idx); +		else +			free = log_first_idx - log_next_idx; + +		if (free > size + sizeof(struct log)) +			break; + +		/* drop old messages until we have enough contiuous space */ +		log_first_idx = log_next(log_first_idx); +		log_first_seq++; +	} + +	if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { +		/* +		 * This message + an additional empty header does not fit +		 * at the end of the buffer. Add an empty header with len == 0 +		 * to signify a wrap around. +		 */ +		memset(log_buf + log_next_idx, 0, sizeof(struct log)); +		log_next_idx = 0; +	} + +	/* fill message */ +	msg = (struct log *)(log_buf + log_next_idx); +	memcpy(log_text(msg), text, text_len); +	msg->text_len = text_len; +	memcpy(log_dict(msg), dict, dict_len); +	msg->dict_len = dict_len; +	msg->level = (facility << 3) | (level & 7); +	msg->ts_nsec = local_clock(); +	memset(log_dict(msg) + dict_len, 0, pad_len); +	msg->len = sizeof(struct log) + text_len + dict_len + pad_len; + +	/* insert message */ +	log_next_idx += msg->len; +	log_next_seq++; +} + +/* /dev/kmsg - userspace message inject/listen interface */ +struct devkmsg_user { +	u64 seq; +	u32 idx; +	struct mutex lock; +	char buf[8192]; +}; + +static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, +			      unsigned long count, loff_t pos) +{ +	char *buf, *line; +	int i; +	int level = default_message_loglevel; +	int facility = 1;	/* LOG_USER */ +	size_t len = iov_length(iv, count); +	ssize_t ret = len; + +	if (len > LOG_LINE_MAX) +		return -EINVAL; +	buf = kmalloc(len+1, GFP_KERNEL); +	if (buf == NULL) +		return -ENOMEM; + +	line = buf; +	for (i = 0; i < count; i++) { +		if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) +			goto out; +		line += iv[i].iov_len; +	} + +	/* +	 * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace +	 * the decimal value represents 32bit, the lower 3 bit are the log +	 * level, the rest are the log facility. +	 * +	 * If no prefix or no userspace facility is specified, we +	 * enforce LOG_USER, to be able to reliably distinguish +	 * kernel-generated messages from userspace-injected ones. 
+	 */ +	line = buf; +	if (line[0] == '<') { +		char *endp = NULL; + +		i = simple_strtoul(line+1, &endp, 10); +		if (endp && endp[0] == '>') { +			level = i & 7; +			if (i >> 3) +				facility = i >> 3; +			endp++; +			len -= endp - line; +			line = endp; +		} +	} +	line[len] = '\0'; + +	printk_emit(facility, level, NULL, 0, "%s", line); +out: +	kfree(buf); +	return ret; +} + +static ssize_t devkmsg_read(struct file *file, char __user *buf, +			    size_t count, loff_t *ppos) +{ +	struct devkmsg_user *user = file->private_data; +	struct log *msg; +	u64 ts_usec; +	size_t i; +	size_t len; +	ssize_t ret; + +	if (!user) +		return -EBADF; + +	mutex_lock(&user->lock); +	raw_spin_lock(&logbuf_lock); +	while (user->seq == log_next_seq) { +		if (file->f_flags & O_NONBLOCK) { +			ret = -EAGAIN; +			raw_spin_unlock(&logbuf_lock); +			goto out; +		} + +		raw_spin_unlock(&logbuf_lock); +		ret = wait_event_interruptible(log_wait, +					       user->seq != log_next_seq); +		if (ret) +			goto out; +		raw_spin_lock(&logbuf_lock); +	} + +	if (user->seq < log_first_seq) { +		/* our last seen message is gone, return error and reset */ +		user->idx = log_first_idx; +		user->seq = log_first_seq; +		ret = -EPIPE; +		raw_spin_unlock(&logbuf_lock); +		goto out; +	} + +	msg = log_from_idx(user->idx); +	ts_usec = msg->ts_nsec; +	do_div(ts_usec, 1000); +	len = sprintf(user->buf, "%u,%llu,%llu;", +		      msg->level, user->seq, ts_usec); + +	/* escape non-printable characters */ +	for (i = 0; i < msg->text_len; i++) { +		unsigned char c = log_text(msg)[i]; + +		if (c < ' ' || c >= 128) +			len += sprintf(user->buf + len, "\\x%02x", c); +		else +			user->buf[len++] = c; +	} +	user->buf[len++] = '\n'; + +	if (msg->dict_len) { +		bool line = true; + +		for (i = 0; i < msg->dict_len; i++) { +			unsigned char c = log_dict(msg)[i]; + +			if (line) { +				user->buf[len++] = ' '; +				line = false; +			} + +			if (c == '\0') { +				user->buf[len++] = '\n'; +				line = true; +				continue; +			} + +			if (c < ' ' || c >= 128) { +				len += sprintf(user->buf + len, "\\x%02x", c); +				continue; +			} + +			user->buf[len++] = c; +		} +		user->buf[len++] = '\n'; +	} + +	user->idx = log_next(user->idx); +	user->seq++; +	raw_spin_unlock(&logbuf_lock); + +	if (len > count) { +		ret = -EINVAL; +		goto out; +	} + +	if (copy_to_user(buf, user->buf, len)) { +		ret = -EFAULT; +		goto out; +	} +	ret = len; +out: +	mutex_unlock(&user->lock); +	return ret; +} + +static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) +{ +	struct devkmsg_user *user = file->private_data; +	loff_t ret = 0; + +	if (!user) +		return -EBADF; +	if (offset) +		return -ESPIPE; + +	raw_spin_lock(&logbuf_lock); +	switch (whence) { +	case SEEK_SET: +		/* the first record */ +		user->idx = log_first_idx; +		user->seq = log_first_seq; +		break; +	case SEEK_DATA: +		/* +		 * The first record after the last SYSLOG_ACTION_CLEAR, +		 * like issued by 'dmesg -c'. Reading /dev/kmsg itself +		 * changes no global state, and does not clear anything. 
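On the consumer side, the format produced by devkmsg_read() above asks only two things of a parser: ignore any additional ','-separated values before the ';', and treat lines starting with a space as dictionary continuations. A rough user-space reader, not part of the patch (the 8192-byte record buffer mirrors the kernel-side one above but is an assumption, not a documented limit):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char rec[8192];
	int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);

	if (fd < 0)
		return 1;
	for (;;) {
		unsigned int prio;
		unsigned long long seq, ts_usec;
		char *text;
		/* each read() returns exactly one record in the format above */
		ssize_t n = read(fd, rec, sizeof(rec) - 1);

		if (n <= 0)
			break;	/* EAGAIN once we have caught up */
		rec[n] = '\0';
		text = strchr(rec, ';');
		if (!text || sscanf(rec, "%u,%llu,%llu", &prio, &seq, &ts_usec) != 3)
			continue;
		printf("seq=%llu level=%u ts=%llu.%06llus %s",
		       seq, prio & 7, ts_usec / 1000000, ts_usec % 1000000,
		       text + 1);
	}
	close(fd);
	return 0;
}

A long-running follower would drop O_NONBLOCK (or poll() the descriptor) and could first seek to SEEK_DATA, as implemented just below, to start after the last SYSLOG_ACTION_CLEAR.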
+		 */ +		user->idx = clear_idx; +		user->seq = clear_seq; +		break; +	case SEEK_END: +		/* after the last record */ +		user->idx = log_next_idx; +		user->seq = log_next_seq; +		break; +	default: +		ret = -EINVAL; +	} +	raw_spin_unlock(&logbuf_lock); +	return ret; +} + +static unsigned int devkmsg_poll(struct file *file, poll_table *wait) +{ +	struct devkmsg_user *user = file->private_data; +	int ret = 0; + +	if (!user) +		return POLLERR|POLLNVAL; + +	poll_wait(file, &log_wait, wait); + +	raw_spin_lock(&logbuf_lock); +	if (user->seq < log_next_seq) { +		/* return error when data has vanished underneath us */ +		if (user->seq < log_first_seq) +			ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; +		ret = POLLIN|POLLRDNORM; +	} +	raw_spin_unlock(&logbuf_lock); + +	return ret; +} + +static int devkmsg_open(struct inode *inode, struct file *file) +{ +	struct devkmsg_user *user; +	int err; + +	/* write-only does not need any file context */ +	if ((file->f_flags & O_ACCMODE) == O_WRONLY) +		return 0; + +	err = security_syslog(SYSLOG_ACTION_READ_ALL); +	if (err) +		return err; + +	user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); +	if (!user) +		return -ENOMEM; + +	mutex_init(&user->lock); + +	raw_spin_lock(&logbuf_lock); +	user->idx = log_first_idx; +	user->seq = log_first_seq; +	raw_spin_unlock(&logbuf_lock); + +	file->private_data = user; +	return 0; +} + +static int devkmsg_release(struct inode *inode, struct file *file) +{ +	struct devkmsg_user *user = file->private_data; + +	if (!user) +		return 0; + +	mutex_destroy(&user->lock); +	kfree(user); +	return 0; +} + +const struct file_operations kmsg_fops = { +	.open = devkmsg_open, +	.read = devkmsg_read, +	.aio_write = devkmsg_writev, +	.llseek = devkmsg_llseek, +	.poll = devkmsg_poll, +	.release = devkmsg_release, +};  #ifdef CONFIG_KEXEC  /* @@ -165,9 +624,9 @@ static int saved_console_loglevel = -1;  void log_buf_kexec_setup(void)  {  	VMCOREINFO_SYMBOL(log_buf); -	VMCOREINFO_SYMBOL(log_end);  	VMCOREINFO_SYMBOL(log_buf_len); -	VMCOREINFO_SYMBOL(logged_chars); +	VMCOREINFO_SYMBOL(log_first_idx); +	VMCOREINFO_SYMBOL(log_next_idx);  }  #endif @@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup);  void __init setup_log_buf(int early)  {  	unsigned long flags; -	unsigned start, dest_idx, offset;  	char *new_log_buf;  	int free; @@ -219,20 +677,8 @@ void __init setup_log_buf(int early)  	log_buf_len = new_log_buf_len;  	log_buf = new_log_buf;  	new_log_buf_len = 0; -	free = __LOG_BUF_LEN - log_end; - -	offset = start = min(con_start, log_start); -	dest_idx = 0; -	while (start != log_end) { -		unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); - -		log_buf[dest_idx] = __log_buf[log_idx_mask]; -		start++; -		dest_idx++; -	} -	log_start -= offset; -	con_start -= offset; -	log_end -= offset; +	free = __LOG_BUF_LEN - log_next_idx; +	memcpy(log_buf, __log_buf, __LOG_BUF_LEN);  	raw_spin_unlock_irqrestore(&logbuf_lock, flags);  	pr_info("log_buf_len: %d\n", log_buf_len); @@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file)  	return 0;  } +#if defined(CONFIG_PRINTK_TIME) +static bool printk_time = 1; +#else +static bool printk_time; +#endif +module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); + +static size_t print_prefix(const struct log *msg, bool syslog, char *buf) +{ +	size_t len = 0; + +	if (syslog) { +		if (buf) { +			len += sprintf(buf, "<%u>", msg->level); +		} else { +			len += 3; +			if (msg->level > 9) +				len++; +			if (msg->level > 99) +				len++; +		} +	} + +	if (printk_time) { +		
if (buf) { +			unsigned long long ts = msg->ts_nsec; +			unsigned long rem_nsec = do_div(ts, 1000000000); + +			len += sprintf(buf + len, "[%5lu.%06lu] ", +					 (unsigned long) ts, rem_nsec / 1000); +		} else { +			len += 15; +		} +	} + +	return len; +} + +static size_t msg_print_text(const struct log *msg, bool syslog, +			     char *buf, size_t size) +{ +	const char *text = log_text(msg); +	size_t text_size = msg->text_len; +	size_t len = 0; + +	do { +		const char *next = memchr(text, '\n', text_size); +		size_t text_len; + +		if (next) { +			text_len = next - text; +			next++; +			text_size -= next - text; +		} else { +			text_len = text_size; +		} + +		if (buf) { +			if (print_prefix(msg, syslog, NULL) + +			    text_len + 1>= size - len) +				break; + +			len += print_prefix(msg, syslog, buf + len); +			memcpy(buf + len, text, text_len); +			len += text_len; +			buf[len++] = '\n'; +		} else { +			/* SYSLOG_ACTION_* buffer size only calculation */ +			len += print_prefix(msg, syslog, NULL); +			len += text_len + 1; +		} + +		text = next; +	} while (text); + +	return len; +} + +static int syslog_print(char __user *buf, int size) +{ +	char *text; +	struct log *msg; +	int len; + +	text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); +	if (!text) +		return -ENOMEM; + +	raw_spin_lock_irq(&logbuf_lock); +	if (syslog_seq < log_first_seq) { +		/* messages are gone, move to first one */ +		syslog_seq = log_first_seq; +		syslog_idx = log_first_idx; +	} +	msg = log_from_idx(syslog_idx); +	len = msg_print_text(msg, true, text, LOG_LINE_MAX); +	syslog_idx = log_next(syslog_idx); +	syslog_seq++; +	raw_spin_unlock_irq(&logbuf_lock); + +	if (len > 0 && copy_to_user(buf, text, len)) +		len = -EFAULT; + +	kfree(text); +	return len; +} + +static int syslog_print_all(char __user *buf, int size, bool clear) +{ +	char *text; +	int len = 0; + +	text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); +	if (!text) +		return -ENOMEM; + +	raw_spin_lock_irq(&logbuf_lock); +	if (buf) { +		u64 next_seq; +		u64 seq; +		u32 idx; + +		if (clear_seq < log_first_seq) { +			/* messages are gone, move to first available one */ +			clear_seq = log_first_seq; +			clear_idx = log_first_idx; +		} + +		/* +		 * Find first record that fits, including all following records, +		 * into the user-provided buffer for this dump. 
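The loops that follow call msg_print_text() with a NULL buffer purely to measure how much output each record would produce. The same measure-then-fill shape in plain user-space C, using snprintf() (illustrative only; format_line() is an invented helper):

#include <stdio.h>
#include <stdlib.h>

static char *format_line(unsigned long sec, unsigned long usec, const char *text)
{
	/* first pass only measures, second pass fills, as with buf == NULL above */
	int need = snprintf(NULL, 0, "[%5lu.%06lu] %s\n", sec, usec, text);
	char *out = malloc(need + 1);

	if (out)
		snprintf(out, need + 1, "[%5lu.%06lu] %s\n", sec, usec, text);
	return out;
}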
+		*/ +		seq = clear_seq; +		idx = clear_idx; +		while (seq < log_next_seq) { +			struct log *msg = log_from_idx(idx); + +			len += msg_print_text(msg, true, NULL, 0); +			idx = log_next(idx); +			seq++; +		} +		seq = clear_seq; +		idx = clear_idx; +		while (len > size && seq < log_next_seq) { +			struct log *msg = log_from_idx(idx); + +			len -= msg_print_text(msg, true, NULL, 0); +			idx = log_next(idx); +			seq++; +		} + +		/* last message in this dump */ +		next_seq = log_next_seq; + +		len = 0; +		while (len >= 0 && seq < next_seq) { +			struct log *msg = log_from_idx(idx); +			int textlen; + +			textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); +			if (textlen < 0) { +				len = textlen; +				break; +			} +			idx = log_next(idx); +			seq++; + +			raw_spin_unlock_irq(&logbuf_lock); +			if (copy_to_user(buf + len, text, textlen)) +				len = -EFAULT; +			else +				len += textlen; +			raw_spin_lock_irq(&logbuf_lock); + +			if (seq < log_first_seq) { +				/* messages are gone, move to next one */ +				seq = log_first_seq; +				idx = log_first_idx; +			} +		} +	} + +	if (clear) { +		clear_seq = log_next_seq; +		clear_idx = log_next_idx; +	} +	raw_spin_unlock_irq(&logbuf_lock); + +	kfree(text); +	return len; +} +  int do_syslog(int type, char __user *buf, int len, bool from_file)  { -	unsigned i, j, limit, count; -	int do_clear = 0; -	char c; +	bool clear = false; +	static int saved_console_loglevel = -1;  	int error;  	error = check_syslog_permissions(type, from_file); @@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)  			goto out;  		}  		error = wait_event_interruptible(log_wait, -							(log_start - log_end)); +						 syslog_seq != log_next_seq);  		if (error)  			goto out; -		i = 0; -		raw_spin_lock_irq(&logbuf_lock); -		while (!error && (log_start != log_end) && i < len) { -			c = LOG_BUF(log_start); -			log_start++; -			raw_spin_unlock_irq(&logbuf_lock); -			error = __put_user(c,buf); -			buf++; -			i++; -			cond_resched(); -			raw_spin_lock_irq(&logbuf_lock); -		} -		raw_spin_unlock_irq(&logbuf_lock); -		if (!error) -			error = i; +		error = syslog_print(buf, len);  		break;  	/* Read/clear last kernel messages */  	case SYSLOG_ACTION_READ_CLEAR: -		do_clear = 1; +		clear = true;  		/* FALL THRU */  	/* Read last kernel messages */  	case SYSLOG_ACTION_READ_ALL: @@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)  			error = -EFAULT;  			goto out;  		} -		count = len; -		if (count > log_buf_len) -			count = log_buf_len; -		raw_spin_lock_irq(&logbuf_lock); -		if (count > logged_chars) -			count = logged_chars; -		if (do_clear) -			logged_chars = 0; -		limit = log_end; -		/* -		 * __put_user() could sleep, and while we sleep -		 * printk() could overwrite the messages -		 * we try to copy to user space. Therefore -		 * the messages are copied in reverse. <manfreds> -		 */ -		for (i = 0; i < count && !error; i++) { -			j = limit-1-i; -			if (j + log_buf_len < log_end) -				break; -			c = LOG_BUF(j); -			raw_spin_unlock_irq(&logbuf_lock); -			error = __put_user(c,&buf[count-1-i]); -			cond_resched(); -			raw_spin_lock_irq(&logbuf_lock); -		} -		raw_spin_unlock_irq(&logbuf_lock); -		if (error) -			break; -		error = i; -		if (i != count) { -			int offset = count-error; -			/* buffer overflow during copy, correct user buffer. 
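Stripped of the locking and the seq/idx bookkeeping, the sizing passes in syslog_print_all() above reduce to: total up every record since the last clear, then drop records from the front until the remainder fits the user buffer. A standalone sketch of just that calculation (the names here are invented):

#include <stddef.h>

/* returns how many bytes the dump will produce; *first is the index of the
 * oldest record that still fits, mirroring the two while loops above */
static size_t fit_from_front(const size_t *rec_len, size_t nrecs,
			     size_t bufsz, size_t *first)
{
	size_t total = 0;
	size_t drop = 0;
	size_t i;

	for (i = 0; i < nrecs; i++)
		total += rec_len[i];
	while (total > bufsz && drop < nrecs)
		total -= rec_len[drop++];
	*first = drop;
	return total;
}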
*/ -			for (i = 0; i < error; i++) { -				if (__get_user(c,&buf[i+offset]) || -				    __put_user(c,&buf[i])) { -					error = -EFAULT; -					break; -				} -				cond_resched(); -			} -		} +		error = syslog_print_all(buf, len, clear);  		break;  	/* Clear ring buffer */  	case SYSLOG_ACTION_CLEAR: -		logged_chars = 0; -		break; +		syslog_print_all(NULL, 0, true);  	/* Disable logging to console */  	case SYSLOG_ACTION_CONSOLE_OFF:  		if (saved_console_loglevel == -1) @@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)  		break;  	/* Number of chars in the log buffer */  	case SYSLOG_ACTION_SIZE_UNREAD: -		error = log_end - log_start; +		raw_spin_lock_irq(&logbuf_lock); +		if (syslog_seq < log_first_seq) { +			/* messages are gone, move to first one */ +			syslog_seq = log_first_seq; +			syslog_idx = log_first_idx; +		} +		if (from_file) { +			/* +			 * Short-cut for poll(/"proc/kmsg") which simply checks +			 * for pending data, not the size; return the count of +			 * records, not the length. +			 */ +			error = log_next_idx - syslog_idx; +		} else { +			u64 seq; +			u32 idx; + +			error = 0; +			seq = syslog_seq; +			idx = syslog_idx; +			while (seq < log_next_seq) { +				struct log *msg = log_from_idx(idx); + +				error += msg_print_text(msg, true, NULL, 0); +				idx = log_next(idx); +				seq++; +			} +		} +		raw_spin_unlock_irq(&logbuf_lock);  		break;  	/* Size of the log buffer */  	case SYSLOG_ACTION_SIZE_BUFFER: @@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4])  {  	syslog_data[0] = log_buf;  	syslog_data[1] = log_buf + log_buf_len; -	syslog_data[2] = log_buf + log_end - -		(logged_chars < log_buf_len ? logged_chars : log_buf_len); -	syslog_data[3] = log_buf + log_end; +	syslog_data[2] = log_buf + log_first_idx; +	syslog_data[3] = log_buf + log_next_idx;  }  #endif	/* CONFIG_KGDB_KDB */ -/* - * Call the console drivers on a range of log_buf - */ -static void __call_console_drivers(unsigned start, unsigned end) -{ -	struct console *con; - -	for_each_console(con) { -		if (exclusive_console && con != exclusive_console) -			continue; -		if ((con->flags & CON_ENABLED) && con->write && -				(cpu_online(smp_processor_id()) || -				(con->flags & CON_ANYTIME))) -			con->write(con, &LOG_BUF(start), end - start); -	} -} -  static bool __read_mostly ignore_loglevel;  static int __init ignore_loglevel_setup(char *str) @@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"  	"print all kernel messages to the console.");  /* - * Write out chars from start to end - 1 inclusive - */ -static void _call_console_drivers(unsigned start, -				unsigned end, int msg_log_level) -{ -	trace_console(&LOG_BUF(0), start, end, log_buf_len); - -	if ((msg_log_level < console_loglevel || ignore_loglevel) && -			console_drivers && start != end) { -		if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { -			/* wrapped write */ -			__call_console_drivers(start & LOG_BUF_MASK, -						log_buf_len); -			__call_console_drivers(0, end & LOG_BUF_MASK); -		} else { -			__call_console_drivers(start, end); -		} -	} -} - -/* - * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the - * lower 3 bit are the log level, the rest are the log facility. In case - * userspace passes usual userspace syslog messages to /dev/kmsg or - * /dev/ttyprintk, the log prefix might contain the facility. Printk needs - * to extract the correct log level for in-kernel processing, and not mangle - * the original value. 
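Most of do_syslog() above is reached from user space through the syslog(2) system call (glibc's wrapper is klogctl()) or through /proc/kmsg. A small consumer of two of the actions handled above; the numeric action values are the long-standing SYSLOG_ACTION_* ones, nothing introduced by this patch:

#include <stdio.h>
#include <stdlib.h>
#include <sys/klog.h>

int main(void)
{
	int size = klogctl(10, NULL, 0);	/* SYSLOG_ACTION_SIZE_BUFFER */
	char *buf;
	int n;

	if (size <= 0)
		return 1;
	buf = malloc(size);
	if (!buf)
		return 1;
	n = klogctl(3, buf, size);		/* SYSLOG_ACTION_READ_ALL */
	if (n > 0)
		fwrite(buf, 1, n, stdout);
	free(buf);
	return 0;
}

With the record-based code, the text returned this way carries the "<N>" prefix and the timestamp generated by msg_print_text(), rather than raw bytes copied out of the old character ring.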
- * - * If a prefix is found, the length of the prefix is returned. If 'level' is - * passed, it will be filled in with the log level without a possible facility - * value. If 'special' is passed, the special printk prefix chars are accepted - * and returned. If no valid header is found, 0 is returned and the passed - * variables are not touched. - */ -static size_t log_prefix(const char *p, unsigned int *level, char *special) -{ -	unsigned int lev = 0; -	char sp = '\0'; -	size_t len; - -	if (p[0] != '<' || !p[1]) -		return 0; -	if (p[2] == '>') { -		/* usual single digit level number or special char */ -		switch (p[1]) { -		case '0' ... '7': -			lev = p[1] - '0'; -			break; -		case 'c': /* KERN_CONT */ -		case 'd': /* KERN_DEFAULT */ -			sp = p[1]; -			break; -		default: -			return 0; -		} -		len = 3; -	} else { -		/* multi digit including the level and facility number */ -		char *endp = NULL; - -		lev = (simple_strtoul(&p[1], &endp, 10) & 7); -		if (endp == NULL || endp[0] != '>') -			return 0; -		len = (endp + 1) - p; -	} - -	/* do not accept special char if not asked for */ -	if (sp && !special) -		return 0; - -	if (special) { -		*special = sp; -		/* return special char, do not touch level */ -		if (sp) -			return len; -	} - -	if (level) -		*level = lev; -	return len; -} - -/*   * Call the console drivers, asking them to write out   * log_buf[start] to log_buf[end - 1].   * The console_lock must be held.   */ -static void call_console_drivers(unsigned start, unsigned end) +static void call_console_drivers(int level, const char *text, size_t len)  { -	unsigned cur_index, start_print; -	static int msg_level = -1; +	struct console *con; -	BUG_ON(((int)(start - end)) > 0); +	trace_console(text, 0, len, len); -	cur_index = start; -	start_print = start; -	while (cur_index != end) { -		if (msg_level < 0 && ((end - cur_index) > 2)) { -			/* strip log prefix */ -			cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); -			start_print = cur_index; -		} -		while (cur_index != end) { -			char c = LOG_BUF(cur_index); +	if (level >= console_loglevel && !ignore_loglevel) +		return; +	if (!console_drivers) +		return; -			cur_index++; -			if (c == '\n') { -				if (msg_level < 0) { -					/* -					 * printk() has already given us loglevel tags in -					 * the buffer.  
This code is here in case the -					 * log buffer has wrapped right round and scribbled -					 * on those tags -					 */ -					msg_level = default_message_loglevel; -				} -				_call_console_drivers(start_print, cur_index, msg_level); -				msg_level = -1; -				start_print = cur_index; -				break; -			} -		} +	for_each_console(con) { +		if (exclusive_console && con != exclusive_console) +			continue; +		if (!(con->flags & CON_ENABLED)) +			continue; +		if (!con->write) +			continue; +		if (!cpu_online(smp_processor_id()) && +		    !(con->flags & CON_ANYTIME)) +			continue; +		con->write(con, text, len);  	} -	_call_console_drivers(start_print, end, msg_level); -} - -static void emit_log_char(char c) -{ -	LOG_BUF(log_end) = c; -	log_end++; -	if (log_end - log_start > log_buf_len) -		log_start = log_end - log_buf_len; -	if (log_end - con_start > log_buf_len) -		con_start = log_end - log_buf_len; -	if (logged_chars < log_buf_len) -		logged_chars++;  }  /* @@ -700,16 +1183,6 @@ static void zap_locks(void)  	sema_init(&console_sem, 1);  } -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time = 0; -#endif -module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); - -static bool always_kmsg_dump; -module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); -  /* Check if we have any console registered that can be called early in boot. */  static int have_callable_console(void)  { @@ -722,51 +1195,6 @@ static int have_callable_console(void)  	return 0;  } -/** - * printk - print a kernel message - * @fmt: format string - * - * This is printk().  It can be called from any context.  We want it to work. - * - * We try to grab the console_lock.  If we succeed, it's easy - we log the output and - * call the console drivers.  If we fail to get the semaphore we place the output - * into the log buffer and return.  The current holder of the console_sem will - * notice the new output in console_unlock(); and will send it to the - * consoles before releasing the lock. - * - * One effect of this deferred printing is that code which calls printk() and - * then changes console_loglevel may break. This is because console_loglevel - * is inspected when the actual printing occurs. - * - * See also: - * printf(3) - * - * See the vsnprintf() documentation for format string extensions over C99. - */ - -asmlinkage int printk(const char *fmt, ...) -{ -	va_list args; -	int r; - -#ifdef CONFIG_KGDB_KDB -	if (unlikely(kdb_trap_printk)) { -		va_start(args, fmt); -		r = vkdb_printf(fmt, args); -		va_end(args); -		return r; -	} -#endif -	va_start(args, fmt); -	r = vprintk(fmt, args); -	va_end(args); - -	return r; -} - -/* cpu currently holding logbuf_lock */ -static volatile unsigned int printk_cpu = UINT_MAX; -  /*   * Can we actually use the console at this time on this cpu?   
* @@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu)  			retval = 0;  		}  	} -	printk_cpu = UINT_MAX; +	logbuf_cpu = UINT_MAX;  	if (wake)  		up(&console_sem);  	raw_spin_unlock(&logbuf_lock);  	return retval;  } -static const char recursion_bug_msg [] = -		KERN_CRIT "BUG: recent printk recursion!\n"; -static int recursion_bug; -static int new_text_line = 1; -static char printk_buf[1024];  int printk_delay_msec __read_mostly; @@ -836,15 +1259,23 @@ static inline void printk_delay(void)  	}  } -asmlinkage int vprintk(const char *fmt, va_list args) +asmlinkage int vprintk_emit(int facility, int level, +			    const char *dict, size_t dictlen, +			    const char *fmt, va_list args)  { -	int printed_len = 0; -	int current_log_level = default_message_loglevel; +	static int recursion_bug; +	static char cont_buf[LOG_LINE_MAX]; +	static size_t cont_len; +	static int cont_level; +	static struct task_struct *cont_task; +	static char textbuf[LOG_LINE_MAX]; +	char *text = textbuf; +	size_t text_len;  	unsigned long flags;  	int this_cpu; -	char *p; -	size_t plen; -	char special; +	bool newline = false; +	bool prefix = false; +	int printed_len = 0;  	boot_delay_msec();  	printk_delay(); @@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)  	/*  	 * Ouch, printk recursed into itself!  	 */ -	if (unlikely(printk_cpu == this_cpu)) { +	if (unlikely(logbuf_cpu == this_cpu)) {  		/*  		 * If a crash is occurring during printk() on this CPU,  		 * then try to get the crash message out but make sure @@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args)  	lockdep_off();  	raw_spin_lock(&logbuf_lock); -	printk_cpu = this_cpu; +	logbuf_cpu = this_cpu;  	if (recursion_bug) { +		static const char recursion_msg[] = +			"BUG: recent printk recursion!"; +  		recursion_bug = 0; -		strcpy(printk_buf, recursion_bug_msg); -		printed_len = strlen(recursion_bug_msg); +		printed_len += strlen(recursion_msg); +		/* emit KERN_CRIT message */ +		log_store(0, 2, NULL, 0, recursion_msg, printed_len);  	} -	/* Emit the output into the temporary buffer */ -	printed_len += vscnprintf(printk_buf + printed_len, -				  sizeof(printk_buf) - printed_len, fmt, args); -	p = printk_buf; +	/* +	 * The printf needs to come first; we need the syslog +	 * prefix which might be passed-in as a parameter. +	 */ +	text_len = vscnprintf(text, sizeof(textbuf), fmt, args); -	/* Read log level and handle special printk prefix */ -	plen = log_prefix(p, ¤t_log_level, &special); -	if (plen) { -		p += plen; +	/* mark and strip a trailing newline */ +	if (text_len && text[text_len-1] == '\n') { +		text_len--; +		newline = true; +	} -		switch (special) { -		case 'c': /* Strip <c> KERN_CONT, continue line */ -			plen = 0; -			break; -		case 'd': /* Strip <d> KERN_DEFAULT, start new line */ -			plen = 0; -		default: -			if (!new_text_line) { -				emit_log_char('\n'); -				new_text_line = 1; -			} +	/* strip syslog prefix and extract log level or control flags */ +	if (text[0] == '<' && text[1] && text[2] == '>') { +		switch (text[1]) { +		case '0' ... '7': +			if (level == -1) +				level = text[1] - '0'; +		case 'd':	/* KERN_DEFAULT */ +			prefix = true; +		case 'c':	/* KERN_CONT */ +			text += 3; +			text_len -= 3;  		}  	} -	/* -	 * Copy the output into log_buf. 
If the caller didn't provide -	 * the appropriate log prefix, we insert them here -	 */ -	for (; *p; p++) { -		if (new_text_line) { -			new_text_line = 0; +	if (level == -1) +		level = default_message_loglevel; -			if (plen) { -				/* Copy original log prefix */ -				int i; - -				for (i = 0; i < plen; i++) -					emit_log_char(printk_buf[i]); -				printed_len += plen; -			} else { -				/* Add log prefix */ -				emit_log_char('<'); -				emit_log_char(current_log_level + '0'); -				emit_log_char('>'); -				printed_len += 3; -			} +	if (dict) { +		prefix = true; +		newline = true; +	} -			if (printk_time) { -				/* Add the current time stamp */ -				char tbuf[50], *tp; -				unsigned tlen; -				unsigned long long t; -				unsigned long nanosec_rem; +	if (!newline) { +		if (cont_len && (prefix || cont_task != current)) { +			/* +			 * Flush earlier buffer, which is either from a +			 * different thread, or when we got a new prefix. +			 */ +			log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); +			cont_len = 0; +		} -				t = cpu_clock(printk_cpu); -				nanosec_rem = do_div(t, 1000000000); -				tlen = sprintf(tbuf, "[%5lu.%06lu] ", -						(unsigned long) t, -						nanosec_rem / 1000); +		if (!cont_len) { +			cont_level = level; +			cont_task = current; +		} -				for (tp = tbuf; tp < tbuf + tlen; tp++) -					emit_log_char(*tp); -				printed_len += tlen; +		/* buffer or append to earlier buffer from the same thread */ +		if (cont_len + text_len > sizeof(cont_buf)) +			text_len = sizeof(cont_buf) - cont_len; +		memcpy(cont_buf + cont_len, text, text_len); +		cont_len += text_len; +	} else { +		if (cont_len && cont_task == current) { +			if (prefix) { +				/* +				 * New prefix from the same thread; flush. We +				 * either got no earlier newline, or we race +				 * with an interrupt. +				 */ +				log_store(facility, cont_level, +					  NULL, 0, cont_buf, cont_len); +				cont_len = 0;  			} -			if (!*p) -				break; +			/* append to the earlier buffer and flush */ +			if (cont_len + text_len > sizeof(cont_buf)) +				text_len = sizeof(cont_buf) - cont_len; +			memcpy(cont_buf + cont_len, text, text_len); +			cont_len += text_len; +			log_store(facility, cont_level, +				  NULL, 0, cont_buf, cont_len); +			cont_len = 0; +			cont_task = NULL; +			printed_len = cont_len; +		} else { +			/* ordinary single and terminated line */ +			log_store(facility, level, +				  dict, dictlen, text, text_len); +			printed_len = text_len;  		} - -		emit_log_char(*p); -		if (*p == '\n') -			new_text_line = 1;  	}  	/* -	 * Try to acquire and then immediately release the -	 * console semaphore. The release will do all the -	 * actual magic (print out buffers, wake up klogd, -	 * etc).  +	 * Try to acquire and then immediately release the console semaphore. +	 * The release will print out buffers and wake up /dev/kmsg and syslog() +	 * users.  	 * -	 * The console_trylock_for_printk() function -	 * will release 'logbuf_lock' regardless of whether it -	 * actually gets the semaphore or not. +	 * The console_trylock_for_printk() function will release 'logbuf_lock' +	 * regardless of whether it actually gets the console semaphore or not.  	 
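printk_emit(), defined a little further down, is the entry point that lets a caller pass an explicit facility, an explicit level and the '\0'-separated dictionary that ends up in log_dict(); plain printk() reaches vprintk_emit() with facility 0, level -1 and no dictionary. A hypothetical in-kernel caller, shown only for the argument convention (the key names are taken from the example layout at the top of this file, and report_bad_device() is an invented name):

static void report_bad_device(void)
{
	/* two keys separated by '\0'; sizeof - 1 keeps the embedded '\0'
	 * but drops the terminating one */
	static const char dict[] = "SUBSYSTEM=block\0DEVICE=b8:2";

	printk_emit(0, 3, dict, sizeof(dict) - 1, "it's a line\n");
}

Records stored with a dictionary show up in /dev/kmsg with the key/value pairs as the space-prefixed continuation lines described earlier.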
*/  	if (console_trylock_for_printk(this_cpu))  		console_unlock(); @@ -974,16 +1418,81 @@ out_restore_irqs:  	return printed_len;  } -EXPORT_SYMBOL(printk); -EXPORT_SYMBOL(vprintk); +EXPORT_SYMBOL(vprintk_emit); -#else +asmlinkage int vprintk(const char *fmt, va_list args) +{ +	return vprintk_emit(0, -1, NULL, 0, fmt, args); +} +EXPORT_SYMBOL(vprintk); -static void call_console_drivers(unsigned start, unsigned end) +asmlinkage int printk_emit(int facility, int level, +			   const char *dict, size_t dictlen, +			   const char *fmt, ...)  { +	va_list args; +	int r; + +	va_start(args, fmt); +	r = vprintk_emit(facility, level, dict, dictlen, fmt, args); +	va_end(args); + +	return r;  } +EXPORT_SYMBOL(printk_emit); + +/** + * printk - print a kernel message + * @fmt: format string + * + * This is printk(). It can be called from any context. We want it to work. + * + * We try to grab the console_lock. If we succeed, it's easy - we log the + * output and call the console drivers.  If we fail to get the semaphore, we + * place the output into the log buffer and return. The current holder of + * the console_sem will notice the new output in console_unlock(); and will + * send it to the consoles before releasing the lock. + * + * One effect of this deferred printing is that code which calls printk() and + * then changes console_loglevel may break. This is because console_loglevel + * is inspected when the actual printing occurs. + * + * See also: + * printf(3) + * + * See the vsnprintf() documentation for format string extensions over C99. + */ +asmlinkage int printk(const char *fmt, ...) +{ +	va_list args; +	int r; +#ifdef CONFIG_KGDB_KDB +	if (unlikely(kdb_trap_printk)) { +		va_start(args, fmt); +		r = vkdb_printf(fmt, args); +		va_end(args); +		return r; +	}  #endif +	va_start(args, fmt); +	r = vprintk_emit(0, -1, NULL, 0, fmt, args); +	va_end(args); + +	return r; +} +EXPORT_SYMBOL(printk); + +#else + +#define LOG_LINE_MAX 0 +static struct log *log_from_idx(u32 idx) { return NULL; } +static u32 log_next(u32 idx) { return 0; } +static void call_console_drivers(int level, const char *text, size_t len) {} +static size_t msg_print_text(const struct log *msg, bool syslog, +			     char *buf, size_t size) { return 0; } + +#endif /* CONFIG_PRINTK */  static int __add_preferred_console(char *name, int idx, char *options,  				   char *brl_options) @@ -1217,7 +1726,7 @@ int is_console_locked(void)  }  /* - * Delayed printk facility, for scheduler-internal messages: + * Delayed printk version, for scheduler-internal messages:   */  #define PRINTK_BUF_SIZE		512 @@ -1253,6 +1762,10 @@ void wake_up_klogd(void)  		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);  } +/* the next printk record to write to the console */ +static u64 console_seq; +static u32 console_idx; +  /**   * console_unlock - unlock the console system   * @@ -1263,15 +1776,16 @@ void wake_up_klogd(void)   * by printk().  If this is the case, console_unlock(); emits   * the output prior to releasing the lock.   * - * If there is output waiting for klogd, we wake it up. + * If there is output waiting, we wake /dev/kmsg and syslog() users.   *   * console_unlock(); may be called from any context.   
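The body of console_unlock(), which follows, drains the buffer with a familiar pattern: hold logbuf_lock just long enough to format one record into a static text[] buffer, then drop the lock while the potentially slow console ->write() runs. The same shape in a generic user-space form (every name below is invented; this is only the locking pattern, not the kernel code):

#include <pthread.h>
#include <stdio.h>

#define RING_SIZE 64

static pthread_mutex_t ring_lock = PTHREAD_MUTEX_INITIALIZER;
static const char *ring[RING_SIZE];
static unsigned int ring_head, ring_tail;	/* free-running indices */

static void drain(void)
{
	for (;;) {
		const char *msg;

		pthread_mutex_lock(&ring_lock);
		if (ring_tail == ring_head) {
			pthread_mutex_unlock(&ring_lock);
			break;
		}
		msg = ring[ring_tail++ % RING_SIZE];
		pthread_mutex_unlock(&ring_lock);

		/* the slow part runs without the lock, like con->write() */
		fputs(msg, stderr);
	}
}

The final 'retry' check in console_unlock() covers the window between the last empty check and the release of console_sem, where a concurrent printk() only takes logbuf_lock and relies on the current lock holder to flush.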
*/  void console_unlock(void)  { +	static u64 seen_seq;  	unsigned long flags; -	unsigned _con_start, _log_end; -	unsigned wake_klogd = 0, retry = 0; +	bool wake_klogd = false; +	bool retry;  	if (console_suspended) {  		up(&console_sem); @@ -1281,17 +1795,38 @@ void console_unlock(void)  	console_may_schedule = 0;  again: -	for ( ; ; ) { +	for (;;) { +		struct log *msg; +		static char text[LOG_LINE_MAX]; +		size_t len; +		int level; +  		raw_spin_lock_irqsave(&logbuf_lock, flags); -		wake_klogd |= log_start - log_end; -		if (con_start == log_end) -			break;			/* Nothing to print */ -		_con_start = con_start; -		_log_end = log_end; -		con_start = log_end;		/* Flush */ +		if (seen_seq != log_next_seq) { +			wake_klogd = true; +			seen_seq = log_next_seq; +		} + +		if (console_seq < log_first_seq) { +			/* messages are gone, move to first one */ +			console_seq = log_first_seq; +			console_idx = log_first_idx; +		} + +		if (console_seq == log_next_seq) +			break; + +		msg = log_from_idx(console_idx); +		level = msg->level & 7; + +		len = msg_print_text(msg, false, text, sizeof(text)); + +		console_idx = log_next(console_idx); +		console_seq++;  		raw_spin_unlock(&logbuf_lock); +  		stop_critical_timings();	/* don't trace print latency */ -		call_console_drivers(_con_start, _log_end); +		call_console_drivers(level, text, len);  		start_critical_timings();  		local_irq_restore(flags);  	} @@ -1312,8 +1847,7 @@ again:  	 * flush, no worries.  	 */  	raw_spin_lock(&logbuf_lock); -	if (con_start != log_end) -		retry = 1; +	retry = console_seq != log_next_seq;  	raw_spin_unlock_irqrestore(&logbuf_lock, flags);  	if (retry && console_trylock()) @@ -1549,7 +2083,8 @@ void register_console(struct console *newcon)  		 * for us.  		 */  		raw_spin_lock_irqsave(&logbuf_lock, flags); -		con_start = log_start; +		console_seq = syslog_seq; +		console_idx = syslog_idx;  		raw_spin_unlock_irqrestore(&logbuf_lock, flags);  		/*  		 * We're about to replay the log buffer.  Only do this to the @@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)  }  EXPORT_SYMBOL_GPL(kmsg_dump_unregister); +static bool always_kmsg_dump; +module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); +  /**   * kmsg_dump - dump kernel log to kernel message dumpers.   * @reason: the reason (oops, panic etc) for dumping @@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);   */  void kmsg_dump(enum kmsg_dump_reason reason)  { -	unsigned long end; -	unsigned chars; +	u64 idx;  	struct kmsg_dumper *dumper;  	const char *s1, *s2;  	unsigned long l1, l2; @@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason)  	/* Theoretically, the log could move on after we do this, but  	   there's not a lot we can do about that. The new messages  	   will overwrite the start of what we dump. 
*/ +  	raw_spin_lock_irqsave(&logbuf_lock, flags); -	end = log_end & LOG_BUF_MASK; -	chars = logged_chars; -	raw_spin_unlock_irqrestore(&logbuf_lock, flags); +	if (syslog_seq < log_first_seq) +		idx = syslog_idx; +	else +		idx = log_first_idx; -	if (chars > end) { -		s1 = log_buf + log_buf_len - chars + end; -		l1 = chars - end; +	if (idx > log_next_idx) { +		s1 = log_buf; +		l1 = log_next_idx; -		s2 = log_buf; -		l2 = end; +		s2 = log_buf + idx; +		l2 = log_buf_len - idx;  	} else {  		s1 = "";  		l1 = 0; -		s2 = log_buf + end - chars; -		l2 = chars; +		s2 = log_buf + idx; +		l2 = log_next_idx - idx;  	} +	raw_spin_unlock_irqrestore(&logbuf_lock, flags);  	rcu_read_lock();  	list_for_each_entry_rcu(dumper, &dump_list, list) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a86f1741cc2..95cba41ce1e 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -51,6 +51,34 @@  #include "rcu.h" +#ifdef CONFIG_PREEMPT_RCU + +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so.  No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ +	struct task_struct *t = current; + +	if (likely(list_empty(¤t->rcu_node_entry))) +		return; +	t->rcu_read_lock_nesting = 1; +	barrier(); +	t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; +	__rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +void exit_rcu(void) +{ +} + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +  #ifdef CONFIG_DEBUG_LOCK_ALLOC  static struct lock_class_key rcu_lock_key;  struct lockdep_map rcu_lock_map = diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 22ecea0dfb6..fc31a2d6510 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void)  	return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;  } -/* - * Check for a task exiting while in a preemptible -RCU read-side - * critical section, clean up if so.  No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ -	struct task_struct *t = current; - -	if (t->rcu_read_lock_nesting == 0) -		return; -	t->rcu_read_lock_nesting = 1; -	__rcu_read_unlock(); -} -  #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */  #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a89b381a8c6..e66b34ab755 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -64,6 +64,7 @@ static int irqreader = 1;	/* RCU readers from irq (timers). */  static int fqs_duration;	/* Duration of bursts (us), 0 to disable. */  static int fqs_holdoff;		/* Hold time within burst (us). */  static int fqs_stutter = 3;	/* Wait time between bursts (s). */ +static int n_barrier_cbs;	/* Number of callbacks to test RCU barriers. */  static int onoff_interval;	/* Wait time between CPU hotplugs, 0=disable. */  static int onoff_holdoff;	/* Seconds after boot before CPU hotplugs. */  static int shutdown_secs;	/* Shutdown time (s).  <=0 for no shutdown. 
*/ @@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444);  MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");  module_param(fqs_stutter, int, 0444);  MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(n_barrier_cbs, int, 0444); +MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");  module_param(onoff_interval, int, 0444);  MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");  module_param(onoff_holdoff, int, 0444); @@ -139,6 +142,8 @@ static struct task_struct *shutdown_task;  static struct task_struct *onoff_task;  #endif /* #ifdef CONFIG_HOTPLUG_CPU */  static struct task_struct *stall_task; +static struct task_struct **barrier_cbs_tasks; +static struct task_struct *barrier_task;  #define RCU_TORTURE_PIPE_LEN 10 @@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail;  static atomic_t n_rcu_torture_free;  static atomic_t n_rcu_torture_mberror;  static atomic_t n_rcu_torture_error; +static long n_rcu_torture_barrier_error;  static long n_rcu_torture_boost_ktrerror;  static long n_rcu_torture_boost_rterror;  static long n_rcu_torture_boost_failure; @@ -173,6 +179,8 @@ static long n_offline_attempts;  static long n_offline_successes;  static long n_online_attempts;  static long n_online_successes; +static long n_barrier_attempts; +static long n_barrier_successes;  static struct list_head rcu_torture_removed;  static cpumask_var_t shuffle_tmp_mask; @@ -197,6 +205,10 @@ static unsigned long shutdown_time;	/* jiffies to system shutdown. */  static unsigned long boost_starttime;	/* jiffies of next boost test start. */  DEFINE_MUTEX(boost_mutex);		/* protect setting boost_starttime */  					/*  and boost task create/destroy. */ +static atomic_t barrier_cbs_count;	/* Barrier callbacks registered. */ +static atomic_t barrier_cbs_invoked;	/* Barrier callbacks invoked. */ +static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ +static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);  /* Mediate rmmod and system shutdown.  Concurrent rmmod & shutdown illegal! 
*/ @@ -327,6 +339,7 @@ struct rcu_torture_ops {  	int (*completed)(void);  	void (*deferred_free)(struct rcu_torture *p);  	void (*sync)(void); +	void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));  	void (*cb_barrier)(void);  	void (*fqs)(void);  	int (*stats)(char *page); @@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = {  	.completed	= rcu_torture_completed,  	.deferred_free	= rcu_torture_deferred_free,  	.sync		= synchronize_rcu, +	.call		= call_rcu,  	.cb_barrier	= rcu_barrier,  	.fqs		= rcu_force_quiescent_state,  	.stats		= NULL, @@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = {  	.completed	= rcu_torture_completed,  	.deferred_free	= rcu_sync_torture_deferred_free,  	.sync		= synchronize_rcu, +	.call		= NULL,  	.cb_barrier	= NULL,  	.fqs		= rcu_force_quiescent_state,  	.stats		= NULL, @@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {  	.completed	= rcu_no_completed,  	.deferred_free	= rcu_sync_torture_deferred_free,  	.sync		= synchronize_rcu_expedited, +	.call		= NULL,  	.cb_barrier	= NULL,  	.fqs		= rcu_force_quiescent_state,  	.stats		= NULL, @@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = {  	.completed	= rcu_bh_torture_completed,  	.deferred_free	= rcu_bh_torture_deferred_free,  	.sync		= synchronize_rcu_bh, +	.call		= call_rcu_bh,  	.cb_barrier	= rcu_barrier_bh,  	.fqs		= rcu_bh_force_quiescent_state,  	.stats		= NULL, @@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {  	.completed	= rcu_bh_torture_completed,  	.deferred_free	= rcu_sync_torture_deferred_free,  	.sync		= synchronize_rcu_bh, +	.call		= NULL,  	.cb_barrier	= NULL,  	.fqs		= rcu_bh_force_quiescent_state,  	.stats		= NULL, @@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {  	.completed	= rcu_bh_torture_completed,  	.deferred_free	= rcu_sync_torture_deferred_free,  	.sync		= synchronize_rcu_bh_expedited, +	.call		= NULL,  	.cb_barrier	= NULL,  	.fqs		= rcu_bh_force_quiescent_state,  	.stats		= NULL, @@ -606,6 +625,11 @@ static int srcu_torture_completed(void)  	return srcu_batches_completed(&srcu_ctl);  } +static void srcu_torture_deferred_free(struct rcu_torture *rp) +{ +	call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); +} +  static void srcu_torture_synchronize(void)  {  	synchronize_srcu(&srcu_ctl); @@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page)  	cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",  		       torture_type, TORTURE_FLAG, idx);  	for_each_possible_cpu(cpu) { -		cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, +		cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,  			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],  			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);  	} @@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = {  	.read_delay	= srcu_read_delay,  	.readunlock	= srcu_torture_read_unlock,  	.completed	= srcu_torture_completed, -	.deferred_free	= rcu_sync_torture_deferred_free, +	.deferred_free	= srcu_torture_deferred_free,  	.sync		= srcu_torture_synchronize, +	.call		= NULL,  	.cb_barrier	= NULL,  	.stats		= srcu_torture_stats,  	.name		= "srcu"  }; +static struct rcu_torture_ops srcu_sync_ops = { +	.init		= srcu_torture_init, +	.cleanup	= srcu_torture_cleanup, +	.readlock	= srcu_torture_read_lock, +	.read_delay	= srcu_read_delay, +	.readunlock	= srcu_torture_read_unlock, +	.completed	= srcu_torture_completed, +	.deferred_free	= rcu_sync_torture_deferred_free, +	.sync		= srcu_torture_synchronize, +	.call		= NULL, +	
.cb_barrier	= NULL, +	.stats		= srcu_torture_stats, +	.name		= "srcu_sync" +}; +  static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)  {  	return srcu_read_lock_raw(&srcu_ctl); @@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = {  	.read_delay	= srcu_read_delay,  	.readunlock	= srcu_torture_read_unlock_raw,  	.completed	= srcu_torture_completed, -	.deferred_free	= rcu_sync_torture_deferred_free, +	.deferred_free	= srcu_torture_deferred_free,  	.sync		= srcu_torture_synchronize, +	.call		= NULL,  	.cb_barrier	= NULL,  	.stats		= srcu_torture_stats,  	.name		= "srcu_raw"  }; +static struct rcu_torture_ops srcu_raw_sync_ops = { +	.init		= srcu_torture_init, +	.cleanup	= srcu_torture_cleanup, +	.readlock	= srcu_torture_read_lock_raw, +	.read_delay	= srcu_read_delay, +	.readunlock	= srcu_torture_read_unlock_raw, +	.completed	= srcu_torture_completed, +	.deferred_free	= rcu_sync_torture_deferred_free, +	.sync		= srcu_torture_synchronize, +	.call		= NULL, +	.cb_barrier	= NULL, +	.stats		= srcu_torture_stats, +	.name		= "srcu_raw_sync" +}; +  static void srcu_torture_synchronize_expedited(void)  {  	synchronize_srcu_expedited(&srcu_ctl); @@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = {  	.completed	= srcu_torture_completed,  	.deferred_free	= rcu_sync_torture_deferred_free,  	.sync		= srcu_torture_synchronize_expedited, +	.call		= NULL,  	.cb_barrier	= NULL,  	.stats		= srcu_torture_stats,  	.name		= "srcu_expedited" @@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page)  		       "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "  		       "rtmbe: %d rtbke: %ld rtbre: %ld "  		       "rtbf: %ld rtb: %ld nt: %ld " -		       "onoff: %ld/%ld:%ld/%ld", +		       "onoff: %ld/%ld:%ld/%ld " +		       "barrier: %ld/%ld:%ld",  		       rcu_torture_current,  		       rcu_torture_current_version,  		       list_empty(&rcu_torture_freelist), @@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page)  		       n_online_successes,  		       n_online_attempts,  		       n_offline_successes, -		       n_offline_attempts); +		       n_offline_attempts, +		       n_barrier_successes, +		       n_barrier_attempts, +		       n_rcu_torture_barrier_error); +	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);  	if (atomic_read(&n_rcu_torture_mberror) != 0 || +	    n_rcu_torture_barrier_error != 0 ||  	    n_rcu_torture_boost_ktrerror != 0 ||  	    n_rcu_torture_boost_rterror != 0 || -	    n_rcu_torture_boost_failure != 0) -		cnt += sprintf(&page[cnt], " !!!"); -	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); -	if (i > 1) { +	    n_rcu_torture_boost_failure != 0 || +	    i > 1) {  		cnt += sprintf(&page[cnt], "!!! ");  		atomic_inc(&n_rcu_torture_error);  		WARN_ON_ONCE(1); @@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu)  	/* This must be outside of the mutex, otherwise deadlock! 
*/  	kthread_stop(t); +	boost_tasks[cpu] = NULL;  }  static int rcutorture_booster_init(int cpu) @@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void)  		return;  	VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");  	kthread_stop(onoff_task); +	onoff_task = NULL;  }  #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void +static int  rcu_torture_onoff_init(void)  { +	return 0;  }  static void rcu_torture_onoff_cleanup(void) @@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void)  		return;  	VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");  	kthread_stop(stall_task); +	stall_task = NULL; +} + +/* Callback function for RCU barrier testing. */ +void rcu_torture_barrier_cbf(struct rcu_head *rcu) +{ +	atomic_inc(&barrier_cbs_invoked); +} + +/* kthread function to register callbacks used to test RCU barriers. */ +static int rcu_torture_barrier_cbs(void *arg) +{ +	long myid = (long)arg; +	struct rcu_head rcu; + +	init_rcu_head_on_stack(&rcu); +	VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); +	set_user_nice(current, 19); +	do { +		wait_event(barrier_cbs_wq[myid], +			   atomic_read(&barrier_cbs_count) == n_barrier_cbs || +			   kthread_should_stop() || +			   fullstop != FULLSTOP_DONTSTOP); +		if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) +			break; +		cur_ops->call(&rcu, rcu_torture_barrier_cbf); +		if (atomic_dec_and_test(&barrier_cbs_count)) +			wake_up(&barrier_wq); +	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); +	VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); +	rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); +	while (!kthread_should_stop()) +		schedule_timeout_interruptible(1); +	cur_ops->cb_barrier(); +	destroy_rcu_head_on_stack(&rcu); +	return 0; +} + +/* kthread function to drive and coordinate RCU barrier testing. */ +static int rcu_torture_barrier(void *arg) +{ +	int i; + +	VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); +	do { +		atomic_set(&barrier_cbs_invoked, 0); +		atomic_set(&barrier_cbs_count, n_barrier_cbs); +		/* wake_up() path contains the required barriers. */ +		for (i = 0; i < n_barrier_cbs; i++) +			wake_up(&barrier_cbs_wq[i]); +		wait_event(barrier_wq, +			   atomic_read(&barrier_cbs_count) == 0 || +			   kthread_should_stop() || +			   fullstop != FULLSTOP_DONTSTOP); +		if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) +			break; +		n_barrier_attempts++; +		cur_ops->cb_barrier(); +		if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { +			n_rcu_torture_barrier_error++; +			WARN_ON_ONCE(1); +		} +		n_barrier_successes++; +		schedule_timeout_interruptible(HZ / 10); +	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); +	VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); +	rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); +	while (!kthread_should_stop()) +		schedule_timeout_interruptible(1); +	return 0; +} + +/* Initialize RCU barrier testing. 
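What the new kthreads check is the rcu_barrier() guarantee itself: once it returns, every callback posted before the call must already have been invoked (rcutorture exercises this per flavor through cur_ops->call and cur_ops->cb_barrier, e.g. when loaded with something like 'modprobe rcutorture n_barrier_cbs=4'). Reduced to a kernel-code fragment that is not the test itself, only the invariant it exercises (count_cb() and check_barrier() are invented names):

static atomic_t invoked;

static void count_cb(struct rcu_head *rhp)
{
	atomic_inc(&invoked);
}

static void check_barrier(struct rcu_head *heads, int n)
{
	int i;

	atomic_set(&invoked, 0);
	for (i = 0; i < n; i++)
		call_rcu(&heads[i], count_cb);
	rcu_barrier();		/* must wait for all callbacks posted above */
	WARN_ON(atomic_read(&invoked) != n);
}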
*/ +static int rcu_torture_barrier_init(void) +{ +	int i; +	int ret; + +	if (n_barrier_cbs == 0) +		return 0; +	if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { +		printk(KERN_ALERT "%s" TORTURE_FLAG +		       " Call or barrier ops missing for %s,\n", +		       torture_type, cur_ops->name); +		printk(KERN_ALERT "%s" TORTURE_FLAG +		       " RCU barrier testing omitted from run.\n", +		       torture_type); +		return 0; +	} +	atomic_set(&barrier_cbs_count, 0); +	atomic_set(&barrier_cbs_invoked, 0); +	barrier_cbs_tasks = +		kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), +			GFP_KERNEL); +	barrier_cbs_wq = +		kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), +			GFP_KERNEL); +	if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) +		return -ENOMEM; +	for (i = 0; i < n_barrier_cbs; i++) { +		init_waitqueue_head(&barrier_cbs_wq[i]); +		barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, +						   (void *)(long)i, +						   "rcu_torture_barrier_cbs"); +		if (IS_ERR(barrier_cbs_tasks[i])) { +			ret = PTR_ERR(barrier_cbs_tasks[i]); +			VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); +			barrier_cbs_tasks[i] = NULL; +			return ret; +		} +	} +	barrier_task = kthread_run(rcu_torture_barrier, NULL, +				   "rcu_torture_barrier"); +	if (IS_ERR(barrier_task)) { +		ret = PTR_ERR(barrier_task); +		VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); +		barrier_task = NULL; +	} +	return 0; +} + +/* Clean up after RCU barrier testing. */ +static void rcu_torture_barrier_cleanup(void) +{ +	int i; + +	if (barrier_task != NULL) { +		VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); +		kthread_stop(barrier_task); +		barrier_task = NULL; +	} +	if (barrier_cbs_tasks != NULL) { +		for (i = 0; i < n_barrier_cbs; i++) { +			if (barrier_cbs_tasks[i] != NULL) { +				VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); +				kthread_stop(barrier_cbs_tasks[i]); +				barrier_cbs_tasks[i] = NULL; +			} +		} +		kfree(barrier_cbs_tasks); +		barrier_cbs_tasks = NULL; +	} +	if (barrier_cbs_wq != NULL) { +		kfree(barrier_cbs_wq); +		barrier_cbs_wq = NULL; +	}  }  static int rcutorture_cpu_notify(struct notifier_block *self, @@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void)  	fullstop = FULLSTOP_RMMOD;  	mutex_unlock(&fullstop_mutex);  	unregister_reboot_notifier(&rcutorture_shutdown_nb); +	rcu_torture_barrier_cleanup();  	rcu_torture_stall_cleanup();  	if (stutter_task) {  		VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); @@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void)  		VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");  		kthread_stop(shutdown_task);  	} +	shutdown_task = NULL;  	rcu_torture_onoff_cleanup();  	/* Wait for all RCU callbacks to fire.  
*/ @@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void)  	if (cur_ops->cleanup)  		cur_ops->cleanup(); -	if (atomic_read(&n_rcu_torture_error)) +	if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)  		rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");  	else if (n_online_successes != n_online_attempts ||  		 n_offline_successes != n_offline_attempts) @@ -1692,10 +1904,12 @@ rcu_torture_init(void)  	int i;  	int cpu;  	int firsterr = 0; +	int retval;  	static struct rcu_torture_ops *torture_ops[] =  		{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,  		  &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, -		  &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, +		  &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, +		  &srcu_raw_sync_ops, &srcu_expedited_ops,  		  &sched_ops, &sched_sync_ops, &sched_expedited_ops, };  	mutex_lock(&fullstop_mutex); @@ -1749,6 +1963,7 @@ rcu_torture_init(void)  	atomic_set(&n_rcu_torture_free, 0);  	atomic_set(&n_rcu_torture_mberror, 0);  	atomic_set(&n_rcu_torture_error, 0); +	n_rcu_torture_barrier_error = 0;  	n_rcu_torture_boost_ktrerror = 0;  	n_rcu_torture_boost_rterror = 0;  	n_rcu_torture_boost_failure = 0; @@ -1872,7 +2087,6 @@ rcu_torture_init(void)  		test_boost_duration = 2;  	if ((test_boost == 1 && cur_ops->can_boost) ||  	    test_boost == 2) { -		int retval;  		boost_starttime = jiffies + test_boost_interval * HZ;  		register_cpu_notifier(&rcutorture_cpu_nb); @@ -1897,9 +2111,22 @@ rcu_torture_init(void)  			goto unwind;  		}  	} -	rcu_torture_onoff_init(); +	i = rcu_torture_onoff_init(); +	if (i != 0) { +		firsterr = i; +		goto unwind; +	}  	register_reboot_notifier(&rcutorture_shutdown_nb); -	rcu_torture_stall_init(); +	i = rcu_torture_stall_init(); +	if (i != 0) { +		firsterr = i; +		goto unwind; +	} +	retval = rcu_torture_barrier_init(); +	if (retval != 0) { +		firsterr = retval; +		goto unwind; +	}  	rcutorture_record_test_transition();  	mutex_unlock(&fullstop_mutex);  	return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d0c5baf1ab1..0da7b88d92d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];  	.gpnum = -300, \  	.completed = -300, \  	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ +	.orphan_nxttail = &structname##_state.orphan_nxtlist, \ +	.orphan_donetail = &structname##_state.orphan_donelist, \  	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \  	.n_force_qs = 0, \  	.n_force_qs_ngp = 0, \ @@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);  unsigned long rcutorture_testseq;  unsigned long rcutorture_vernum; +/* State information for rcu_barrier() and friends. */ + +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; +static atomic_t rcu_barrier_cpu_count; +static DEFINE_MUTEX(rcu_barrier_mutex); +static struct completion rcu_barrier_completion; +  /*   * Return true if an RCU grace period is in progress.  
The ACCESS_ONCE()s   * permit this function to be invoked without holding the root rcu_node @@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu)  {  	trace_rcu_utilization("Start context switch");  	rcu_sched_qs(cpu); -	rcu_preempt_note_context_switch(cpu);  	trace_rcu_utilization("End context switch");  }  EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)  #ifdef CONFIG_HOTPLUG_CPU  /* - * Move a dying CPU's RCU callbacks to online CPU's callback list. - * Also record a quiescent state for this CPU for the current grace period. - * Synchronization and interrupt disabling are not required because - * this function executes in stop_machine() context.  Therefore, cleanup - * operations that might block must be done later from the CPU_DEAD - * notifier. - * - * Note that the outgoing CPU's bit has already been cleared in the - * cpu_online_mask.  This allows us to randomly pick a callback - * destination from the bits set in that mask. + * Send the specified CPU's RCU callbacks to the orphanage.  The + * specified CPU must be offline, and the caller must hold the + * ->onofflock.   */ -static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) +static void +rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, +			  struct rcu_node *rnp, struct rcu_data *rdp)  {  	int i; -	unsigned long mask; -	int receive_cpu = cpumask_any(cpu_online_mask); -	struct rcu_data *rdp = this_cpu_ptr(rsp->rda); -	struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); -	RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ -	/* First, adjust the counts. */ +	/* +	 * Orphan the callbacks.  First adjust the counts.  This is safe +	 * because ->onofflock excludes _rcu_barrier()'s adoption of +	 * the callbacks, thus no memory barrier is required. +	 */  	if (rdp->nxtlist != NULL) { -		receive_rdp->qlen_lazy += rdp->qlen_lazy; -		receive_rdp->qlen += rdp->qlen; +		rsp->qlen_lazy += rdp->qlen_lazy; +		rsp->qlen += rdp->qlen; +		rdp->n_cbs_orphaned += rdp->qlen;  		rdp->qlen_lazy = 0;  		rdp->qlen = 0;  	}  	/* -	 * Next, move ready-to-invoke callbacks to be invoked on some -	 * other CPU.  These will not be required to pass through another -	 * grace period:  They are done, regardless of CPU. +	 * Next, move those callbacks still needing a grace period to +	 * the orphanage, where some other CPU will pick them up. +	 * Some of the callbacks might have gone partway through a grace +	 * period, but that is too bad.  They get to start over because we +	 * cannot assume that grace periods are synchronized across CPUs. +	 * We don't bother updating the ->nxttail[] array yet, instead +	 * we just reset the whole thing later on.  	 
*/ -	if (rdp->nxtlist != NULL && -	    rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { -		struct rcu_head *oldhead; -		struct rcu_head **oldtail; -		struct rcu_head **newtail; - -		oldhead = rdp->nxtlist; -		oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; -		rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; -		*rdp->nxttail[RCU_DONE_TAIL] = *oldtail; -		*receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; -		newtail = rdp->nxttail[RCU_DONE_TAIL]; -		for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { -			if (receive_rdp->nxttail[i] == oldtail) -				receive_rdp->nxttail[i] = newtail; -			if (rdp->nxttail[i] == newtail) -				rdp->nxttail[i] = &rdp->nxtlist; -		} +	if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { +		*rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; +		rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; +		*rdp->nxttail[RCU_DONE_TAIL] = NULL;  	}  	/* -	 * Finally, put the rest of the callbacks at the end of the list. -	 * The ones that made it partway through get to start over:  We -	 * cannot assume that grace periods are synchronized across CPUs. -	 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but -	 * this does not seem compelling.  Not yet, anyway.) +	 * Then move the ready-to-invoke callbacks to the orphanage, +	 * where some other CPU will pick them up.  These will not be +	 * required to pass though another grace period: They are done.  	 */  	if (rdp->nxtlist != NULL) { -		*receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; -		receive_rdp->nxttail[RCU_NEXT_TAIL] = -				rdp->nxttail[RCU_NEXT_TAIL]; -		receive_rdp->n_cbs_adopted += rdp->qlen; -		rdp->n_cbs_orphaned += rdp->qlen; - -		rdp->nxtlist = NULL; -		for (i = 0; i < RCU_NEXT_SIZE; i++) -			rdp->nxttail[i] = &rdp->nxtlist; +		*rsp->orphan_donetail = rdp->nxtlist; +		rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];  	} +	/* Finally, initialize the rcu_data structure's list to empty.  */ +	rdp->nxtlist = NULL; +	for (i = 0; i < RCU_NEXT_SIZE; i++) +		rdp->nxttail[i] = &rdp->nxtlist; +} + +/* + * Adopt the RCU callbacks from the specified rcu_state structure's + * orphanage.  The caller must hold the ->onofflock. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ +	int i; +	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); +  	/* -	 * Record a quiescent state for the dying CPU.  This is safe -	 * only because we have already cleared out the callbacks. -	 * (Otherwise, the RCU core might try to schedule the invocation -	 * of callbacks on this now-offline CPU, which would be bad.) +	 * If there is an rcu_barrier() operation in progress, then +	 * only the task doing that operation is permitted to adopt +	 * callbacks.  To do otherwise breaks rcu_barrier() and friends +	 * by causing them to fail to wait for the callbacks in the +	 * orphanage.  	 */ -	mask = rdp->grpmask;	/* rnp->grplo is constant. */ +	if (rsp->rcu_barrier_in_progress && +	    rsp->rcu_barrier_in_progress != current) +		return; + +	/* Do the accounting first. */ +	rdp->qlen_lazy += rsp->qlen_lazy; +	rdp->qlen += rsp->qlen; +	rdp->n_cbs_adopted += rsp->qlen; +	rsp->qlen_lazy = 0; +	rsp->qlen = 0; + +	/* +	 * We do not need a memory barrier here because the only way we +	 * can get here if there is an rcu_barrier() in flight is if +	 * we are the task doing the rcu_barrier(). +	 */ + +	/* First adopt the ready-to-invoke callbacks. 
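Both orphanage helpers above rely on the convention, used throughout rcu_data and now also rcu_state, that a callback list is a head pointer plus a tail pointer that points at the terminating NULL (->nxtlist with ->nxttail[], ->orphan_nxtlist with ->orphan_nxttail, and so on), so an entire list can be spliced in constant time. The idiom in isolation, outside any RCU specifics (all names invented):

#include <stddef.h>

struct node {
	struct node *next;
};

struct list {
	struct node *head;
	struct node **tail;	/* points at head, or at the last node's next */
};

static void list_init(struct list *l)
{
	l->head = NULL;
	l->tail = &l->head;
}

static void list_splice_tail(struct list *dst, struct list *src)
{
	if (src->head == NULL)
		return;
	*dst->tail = src->head;	/* hook the source list onto the tail */
	dst->tail = src->tail;	/* the source's tail is now the tail */
	list_init(src);		/* the source list becomes empty again */
}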
*/ +	if (rsp->orphan_donelist != NULL) { +		*rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; +		*rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; +		for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) +			if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) +				rdp->nxttail[i] = rsp->orphan_donetail; +		rsp->orphan_donelist = NULL; +		rsp->orphan_donetail = &rsp->orphan_donelist; +	} + +	/* And then adopt the callbacks that still need a grace period. */ +	if (rsp->orphan_nxtlist != NULL) { +		*rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; +		rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; +		rsp->orphan_nxtlist = NULL; +		rsp->orphan_nxttail = &rsp->orphan_nxtlist; +	} +} + +/* + * Trace the fact that this CPU is going offline. + */ +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) +{ +	RCU_TRACE(unsigned long mask); +	RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); +	RCU_TRACE(struct rcu_node *rnp = rdp->mynode); + +	RCU_TRACE(mask = rdp->grpmask);  	trace_rcu_grace_period(rsp->name,  			       rnp->gpnum + 1 - !!(rnp->qsmask & mask),  			       "cpuofl"); -	rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); -	/* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */  }  /*   * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context.  Do the remainder of the cleanup. + * this fact from process context.  Do the remainder of the cleanup, + * including orphaning the outgoing CPU's RCU callbacks, and also + * adopting them, if there is no _rcu_barrier() instance running.   * There can only be one CPU hotplug operation at a time, so no other   * CPU can be attempting to update rcu_cpu_kthread_task.   */ @@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  	unsigned long mask;  	int need_report = 0;  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); -	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rnp. */ +	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */  	/* Adjust any no-longer-needed kthreads. */  	rcu_stop_cpu_kthread(cpu);  	rcu_node_kthread_setaffinity(rnp, -1); -	/* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ +	/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */  	/* Exclude any attempts to start a new grace period. */  	raw_spin_lock_irqsave(&rsp->onofflock, flags); +	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ +	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); +	rcu_adopt_orphan_cbs(rsp); +  	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */  	mask = rdp->grpmask;	/* rnp->grplo is constant. */  	do { @@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  #else /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ +} +  static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)  {  } @@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  			    rcu_is_callbacks_kthread());  	/* Update count, and requeue any remaining callbacks. */ -	rdp->qlen_lazy -= count_lazy; -	rdp->qlen -= count; -	rdp->n_cbs_invoked += count;  	if (list != NULL) {  		*tail = rdp->nxtlist;  		rdp->nxtlist = list; @@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  			else  				break;  	} +	smp_mb(); /* List handling before counting for rcu_barrier(). 
*/ +	rdp->qlen_lazy -= count_lazy; +	rdp->qlen -= count; +	rdp->n_cbs_invoked += count;  	/* Reinstate batch limit if we have worked down the excess. */  	if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) @@ -1823,11 +1878,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),  	rdp = this_cpu_ptr(rsp->rda);  	/* Add the callback to our list. */ -	*rdp->nxttail[RCU_NEXT_TAIL] = head; -	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;  	rdp->qlen++;  	if (lazy)  		rdp->qlen_lazy++; +	else +		rcu_idle_count_callbacks_posted(); +	smp_mb();  /* Count before adding callback for rcu_barrier(). */ +	*rdp->nxttail[RCU_NEXT_TAIL] = head; +	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;  	if (__is_kfree_rcu_offset((unsigned long)func))  		trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, @@ -1893,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))  }  EXPORT_SYMBOL_GPL(call_rcu_bh); +/* + * Because a context switch is a grace period for RCU-sched and RCU-bh, + * any blocking grace-period wait automatically implies a grace period + * if there is only one CPU online at any point time during execution + * of either synchronize_sched() or synchronize_rcu_bh().  It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds + * some overhead: RCU still operates correctly. + * + * Of course, sampling num_online_cpus() with preemption enabled can + * give erroneous results if there are concurrent CPU-hotplug operations. + * For example, given a demonic sequence of preemptions in num_online_cpus() + * and CPU-hotplug operations, there could be two or more CPUs online at + * all times, but num_online_cpus() might well return one (or even zero). + * + * However, all such demonic sequences require at least one CPU-offline + * operation.  Furthermore, rcu_blocking_is_gp() giving the wrong answer + * is only a problem if there is an RCU read-side critical section executing + * throughout.  But RCU-sched and RCU-bh read-side critical sections + * disable either preemption or bh, which prevents a CPU from going offline. + * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return + * that there is only one CPU when in fact there was more than one throughout + * is when there were no RCU readers in the system.  If there are no + * RCU readers, the grace period by definition can be of zero length, + * regardless of the number of online CPUs. + */ +static inline int rcu_blocking_is_gp(void) +{ +	might_sleep();  /* Check for RCU read-side critical section. */ +	return num_online_cpus() <= 1; +} +  /**   * synchronize_sched - wait until an rcu-sched grace period has elapsed.   * @@ -2166,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu)  	       rcu_preempt_cpu_has_callbacks(cpu);  } -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; - +/* + * RCU callback function for _rcu_barrier().  If we are last, wake + * up the task executing _rcu_barrier(). 
+ */  static void rcu_barrier_callback(struct rcu_head *notused)  {  	if (atomic_dec_and_test(&rcu_barrier_cpu_count)) @@ -2200,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp,  			 void (*call_rcu_func)(struct rcu_head *head,  					       void (*func)(struct rcu_head *head)))  { -	BUG_ON(in_interrupt()); +	int cpu; +	unsigned long flags; +	struct rcu_data *rdp; +	struct rcu_head rh; + +	init_rcu_head_on_stack(&rh); +  	/* Take mutex to serialize concurrent rcu_barrier() requests. */  	mutex_lock(&rcu_barrier_mutex); -	init_completion(&rcu_barrier_completion); + +	smp_mb();  /* Prevent any prior operations from leaking in. */ +  	/* -	 * Initialize rcu_barrier_cpu_count to 1, then invoke -	 * rcu_barrier_func() on each CPU, so that each CPU also has -	 * incremented rcu_barrier_cpu_count.  Only then is it safe to -	 * decrement rcu_barrier_cpu_count -- otherwise the first CPU -	 * might complete its grace period before all of the other CPUs -	 * did their increment, causing this function to return too -	 * early.  Note that on_each_cpu() disables irqs, which prevents -	 * any CPUs from coming online or going offline until each online -	 * CPU has queued its RCU-barrier callback. +	 * Initialize the count to one rather than to zero in order to +	 * avoid a too-soon return to zero in case of a short grace period +	 * (or preemption of this task).  Also flag this task as doing +	 * an rcu_barrier().  This will prevent anyone else from adopting +	 * orphaned callbacks, which could cause otherwise failure if a +	 * CPU went offline and quickly came back online.  To see this, +	 * consider the following sequence of events: +	 * +	 * 1.	We cause CPU 0 to post an rcu_barrier_callback() callback. +	 * 2.	CPU 1 goes offline, orphaning its callbacks. +	 * 3.	CPU 0 adopts CPU 1's orphaned callbacks. +	 * 4.	CPU 1 comes back online. +	 * 5.	We cause CPU 1 to post an rcu_barrier_callback() callback. +	 * 6.	Both rcu_barrier_callback() callbacks are invoked, awakening +	 *	us -- but before CPU 1's orphaned callbacks are invoked!!!  	 */ +	init_completion(&rcu_barrier_completion);  	atomic_set(&rcu_barrier_cpu_count, 1); -	on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); +	raw_spin_lock_irqsave(&rsp->onofflock, flags); +	rsp->rcu_barrier_in_progress = current; +	raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + +	/* +	 * Force every CPU with callbacks to register a new callback +	 * that will tell us when all the preceding callbacks have +	 * been invoked.  If an offline CPU has callbacks, wait for +	 * it to either come back online or to finish orphaning those +	 * callbacks. +	 */ +	for_each_possible_cpu(cpu) { +		preempt_disable(); +		rdp = per_cpu_ptr(rsp->rda, cpu); +		if (cpu_is_offline(cpu)) { +			preempt_enable(); +			while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) +				schedule_timeout_interruptible(1); +		} else if (ACCESS_ONCE(rdp->qlen)) { +			smp_call_function_single(cpu, rcu_barrier_func, +						 (void *)call_rcu_func, 1); +			preempt_enable(); +		} else { +			preempt_enable(); +		} +	} + +	/* +	 * Now that all online CPUs have rcu_barrier_callback() callbacks +	 * posted, we can adopt all of the orphaned callbacks and place +	 * an rcu_barrier_callback() callback after them.  When that is done, +	 * we are guaranteed to have an rcu_barrier_callback() callback +	 * following every callback that could possibly have been +	 * registered before _rcu_barrier() was called. 
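
The counting scheme described in the comment above is the classic "start the count at one" pattern. A small userspace sketch (illustrative names, C11 atomics and a spin-wait standing in for atomic_t and struct completion) shows why the waiter's own initial reference prevents a too-early wakeup when a callback completes before all of them have been posted:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int pending;
static atomic_bool done;

static void barrier_callback(void)
{
	if (atomic_fetch_sub(&pending, 1) == 1)
		atomic_store(&done, true);	/* last reference: wake the waiter */
}

/* Stand-in for queueing a callback on a CPU; here it may run immediately,
 * which is exactly the "short grace period" case the initial count guards against. */
static void post_callback(void)
{
	atomic_fetch_add(&pending, 1);
	barrier_callback();
}

static void barrier(int nr_cpus_with_cbs)
{
	atomic_store(&done, false);
	atomic_store(&pending, 1);		/* the waiter's own reference */
	for (int i = 0; i < nr_cpus_with_cbs; i++)
		post_callback();
	barrier_callback();			/* drop the initial reference */
	while (!atomic_load(&done))
		;				/* the kernel waits on rcu_barrier_completion instead */
}

int main(void)
{
	barrier(3);
	puts("all callbacks accounted for");
	return 0;
}

Without the initial count of one, the very first post_callback() above would drive the counter to zero and signal completion while later callbacks were still being registered.
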
+	 */ +	raw_spin_lock_irqsave(&rsp->onofflock, flags); +	rcu_adopt_orphan_cbs(rsp); +	rsp->rcu_barrier_in_progress = NULL; +	raw_spin_unlock_irqrestore(&rsp->onofflock, flags); +	atomic_inc(&rcu_barrier_cpu_count); +	smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ +	call_rcu_func(&rh, rcu_barrier_callback); + +	/* +	 * Now that we have an rcu_barrier_callback() callback on each +	 * CPU, and thus each counted, remove the initial count. +	 */  	if (atomic_dec_and_test(&rcu_barrier_cpu_count))  		complete(&rcu_barrier_completion); + +	/* Wait for all rcu_barrier_callback() callbacks to be invoked. */  	wait_for_completion(&rcu_barrier_completion); + +	/* Other rcu_barrier() invocations can now safely proceed. */  	mutex_unlock(&rcu_barrier_mutex); + +	destroy_rcu_head_on_stack(&rh);  }  /** @@ -2417,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)  	for (i = NUM_RCU_LVLS - 1; i > 0; i--)  		rsp->levelspread[i] = CONFIG_RCU_FANOUT; -	rsp->levelspread[0] = RCU_FANOUT_LEAF; +	rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;  }  #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */  static void __init rcu_init_levelspread(struct rcu_state *rsp) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a407..7f5d138dedf 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -29,18 +29,14 @@  #include <linux/seqlock.h>  /* - * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. + * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and + * CONFIG_RCU_FANOUT_LEAF.   * In theory, it should be possible to add more levels straightforwardly.   * In practice, this did work well going from three levels to four.   * Of course, your mileage may vary.   */  #define MAX_RCU_LVLS 4 -#if CONFIG_RCU_FANOUT > 16 -#define RCU_FANOUT_LEAF       16 -#else /* #if CONFIG_RCU_FANOUT > 16 */ -#define RCU_FANOUT_LEAF       (CONFIG_RCU_FANOUT) -#endif /* #else #if CONFIG_RCU_FANOUT > 16 */ -#define RCU_FANOUT_1	      (RCU_FANOUT_LEAF) +#define RCU_FANOUT_1	      (CONFIG_RCU_FANOUT_LEAF)  #define RCU_FANOUT_2	      (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)  #define RCU_FANOUT_3	      (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)  #define RCU_FANOUT_4	      (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) @@ -371,6 +367,17 @@ struct rcu_state {  	raw_spinlock_t onofflock;		/* exclude on/offline and */  						/*  starting new GP. */ +	struct rcu_head *orphan_nxtlist;	/* Orphaned callbacks that */ +						/*  need a grace period. */ +	struct rcu_head **orphan_nxttail;	/* Tail of above. */ +	struct rcu_head *orphan_donelist;	/* Orphaned callbacks that */ +						/*  are ready to invoke. */ +	struct rcu_head **orphan_donetail;	/* Tail of above. */ +	long qlen_lazy;				/* Number of lazy callbacks. */ +	long qlen;				/* Total number of callbacks. */ +	struct task_struct *rcu_barrier_in_progress; +						/* Task doing rcu_barrier(), */ +						/*  or NULL if no barrier. */  	raw_spinlock_t fqslock;			/* Only one task forcing */  						/*  quiescent states. 
*/  	unsigned long jiffies_force_qs;		/* Time at which to invoke */ @@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);  /* Forward declarations for rcutree_plugin.h */  static void rcu_bootup_announce(void);  long rcu_batches_completed(void); -static void rcu_preempt_note_context_switch(int cpu);  static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);  #ifdef CONFIG_HOTPLUG_CPU  static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, @@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);  static void rcu_prepare_for_idle_init(int cpu);  static void rcu_cleanup_after_idle(int cpu);  static void rcu_prepare_for_idle(int cpu); +static void rcu_idle_count_callbacks_posted(void);  static void print_cpu_stall_info_begin(void);  static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);  static void print_cpu_stall_info_end(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464816b..2411000d986 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)   *   * Caller must disable preemption.   */ -static void rcu_preempt_note_context_switch(int cpu) +void rcu_preempt_note_context_switch(void)  {  	struct task_struct *t = current;  	unsigned long flags; @@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu)  	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {  		/* Possibly blocking in an RCU read-side critical section. */ -		rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); +		rdp = __this_cpu_ptr(rcu_preempt_state.rda);  		rnp = rdp->mynode;  		raw_spin_lock_irqsave(&rnp->lock, flags);  		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu)  	 * means that we continue to block the current grace period.  	 */  	local_irq_save(flags); -	rcu_preempt_qs(cpu); +	rcu_preempt_qs(smp_processor_id());  	local_irq_restore(flags);  } @@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void)  	rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);  } -/* - * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so.  No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ -	struct task_struct *t = current; - -	if (t->rcu_read_lock_nesting == 0) -		return; -	t->rcu_read_lock_nesting = 1; -	__rcu_read_unlock(); -} -  #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */  static struct rcu_state *rcu_state = &rcu_sched_state; @@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void)  EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);  /* - * Because preemptible RCU does not exist, we never have to check for - * CPUs being in quiescent states. - */ -static void rcu_preempt_note_context_switch(int cpu) -{ -} - -/*   * Because preemptible RCU does not exist, there are never any preempted   * RCU readers.   */ @@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu)  {  } +/* + * Don't bother keeping a running count of the number of RCU callbacks + * posted because CONFIG_RCU_FAST_NO_HZ=n. + */ +static void rcu_idle_count_callbacks_posted(void) +{ +} +  #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */  /* @@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu)  #define RCU_IDLE_GP_DELAY 6		/* Roughly one grace period. */  #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */ +/* Loop counter for rcu_prepare_for_idle(). 
*/  static DEFINE_PER_CPU(int, rcu_dyntick_drain); +/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */  static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); -static ktime_t rcu_idle_gp_wait;	/* If some non-lazy callbacks. */ -static ktime_t rcu_idle_lazy_gp_wait;	/* If only lazy callbacks. */ +/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ +static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); +/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ +static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); +/* Enable special processing on first attempt to enter dyntick-idle mode. */ +static DEFINE_PER_CPU(bool, rcu_idle_first_pass); +/* Running count of non-lazy callbacks posted, never decremented. */ +static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); +/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ +static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);  /*   * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait;	/* If only lazy callbacks. */   */  int rcu_needs_cpu(int cpu)  { +	/* Flag a new idle sojourn to the idle-entry state machine. */ +	per_cpu(rcu_idle_first_pass, cpu) = 1;  	/* If no callbacks, RCU doesn't need the CPU. */  	if (!rcu_cpu_has_callbacks(cpu))  		return 0; @@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)  }  /* + * Handler for smp_call_function_single().  The only point of this + * handler is to wake the CPU up, so the handler does only tracing. + */ +void rcu_idle_demigrate(void *unused) +{ +	trace_rcu_prep_idle("Demigrate"); +} + +/*   * Timer handler used to force CPU to start pushing its remaining RCU   * callbacks in the case where it entered dyntick-idle mode with callbacks   * pending.  The hander doesn't really need to do anything because the   * real work is done upon re-entry to idle, or by the next scheduling-clock   * interrupt should idle not be re-entered. + * + * One special case: the timer gets migrated without awakening the CPU + * on which the timer was scheduled on.  In this case, we must wake up + * that CPU.  We do so with smp_call_function_single().   */ -static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) +static void rcu_idle_gp_timer_func(unsigned long cpu_in)  { +	int cpu = (int)cpu_in; +  	trace_rcu_prep_idle("Timer"); -	return HRTIMER_NORESTART; +	if (cpu != smp_processor_id()) +		smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); +	else +		WARN_ON_ONCE(1); /* Getting here can hang the system... 
*/  }  /* @@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)   */  static void rcu_prepare_for_idle_init(int cpu)  { -	static int firsttime = 1; -	struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); - -	hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -	hrtp->function = rcu_idle_gp_timer_func; -	if (firsttime) { -		unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); - -		rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); -		upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); -		rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); -		firsttime = 0; -	} +	per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; +	setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), +		    rcu_idle_gp_timer_func, cpu); +	per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; +	per_cpu(rcu_idle_first_pass, cpu) = 1;  }  /* @@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu)   */  static void rcu_cleanup_after_idle(int cpu)  { -	hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); +	del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); +	trace_rcu_prep_idle("Cleanup after idle");  }  /* @@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu)   */  static void rcu_prepare_for_idle(int cpu)  { +	struct timer_list *tp; + +	/* +	 * If this is an idle re-entry, for example, due to use of +	 * RCU_NONIDLE() or the new idle-loop tracing API within the idle +	 * loop, then don't take any state-machine actions, unless the +	 * momentary exit from idle queued additional non-lazy callbacks. +	 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks +	 * pending. +	 */ +	if (!per_cpu(rcu_idle_first_pass, cpu) && +	    (per_cpu(rcu_nonlazy_posted, cpu) == +	     per_cpu(rcu_nonlazy_posted_snap, cpu))) { +		if (rcu_cpu_has_callbacks(cpu)) { +			tp = &per_cpu(rcu_idle_gp_timer, cpu); +			mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); +		} +		return; +	} +	per_cpu(rcu_idle_first_pass, cpu) = 0; +	per_cpu(rcu_nonlazy_posted_snap, cpu) = +		per_cpu(rcu_nonlazy_posted, cpu) - 1; +  	/*  	 * If there are no callbacks on this CPU, enter dyntick-idle mode.  	 * Also reset state to avoid prejudicing later attempts. @@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu)  		per_cpu(rcu_dyntick_drain, cpu) = 0;  		per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;  		if (rcu_cpu_has_nonlazy_callbacks(cpu)) -			hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), -				      rcu_idle_gp_wait, HRTIMER_MODE_REL); +			per_cpu(rcu_idle_gp_timer_expires, cpu) = +					   jiffies + RCU_IDLE_GP_DELAY;  		else -			hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), -				      rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); +			per_cpu(rcu_idle_gp_timer_expires, cpu) = +					   jiffies + RCU_IDLE_LAZY_GP_DELAY; +		tp = &per_cpu(rcu_idle_gp_timer, cpu); +		mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); +		per_cpu(rcu_nonlazy_posted_snap, cpu) = +			per_cpu(rcu_nonlazy_posted, cpu);  		return; /* Nothing more to do immediately. */  	} else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {  		/* We have hit the limit, so time to give up. */ @@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu)  		trace_rcu_prep_idle("Callbacks drained");  } +/* + * Keep a running count of the number of non-lazy callbacks posted + * on this CPU.  This running counter (which is never decremented) allows + * rcu_prepare_for_idle() to detect when something out of the idle loop + * posts a callback, even if an equal number of callbacks are invoked. 
+ * Of course, callbacks should only be posted from within a trace event + * designed to be called from idle or from within RCU_NONIDLE(). + */ +static void rcu_idle_count_callbacks_posted(void) +{ +	__this_cpu_add(rcu_nonlazy_posted, 1); +} +  #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */  #ifdef CONFIG_RCU_CPU_STALL_INFO @@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu)  static void print_cpu_stall_fast_no_hz(char *cp, int cpu)  { -	struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); +	struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); -	sprintf(cp, "drain=%d %c timer=%lld", +	sprintf(cp, "drain=%d %c timer=%lu",  		per_cpu(rcu_dyntick_drain, cpu),  		per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', -		hrtimer_active(hrtp) -			? ktime_to_us(hrtimer_get_remaining(hrtp)) -			: -1); +		timer_pending(tltp) ? tltp->expires - jiffies : -1);  }  #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ed459edeff4..d4bc16ddd1d 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)  	gpnum = rsp->gpnum;  	seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " -		      "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", +		      "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",  		   rsp->completed, gpnum, rsp->fqs_state,  		   (long)(rsp->jiffies_force_qs - jiffies),  		   (int)(jiffies & 0xffff),  		   rsp->n_force_qs, rsp->n_force_qs_ngp,  		   rsp->n_force_qs - rsp->n_force_qs_ngp, -		   rsp->n_force_qs_lh); +		   rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);  	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {  		if (rnp->level != level) {  			seq_puts(m, "\n"); diff --git a/kernel/res_counter.c b/kernel/res_counter.c index d508363858b..bebe2b170d4 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)  	counter->parent = parent;  } -int res_counter_charge_locked(struct res_counter *counter, unsigned long val) +int res_counter_charge_locked(struct res_counter *counter, unsigned long val, +			      bool force)  { +	int ret = 0; +  	if (counter->usage + val > counter->limit) {  		counter->failcnt++; -		return -ENOMEM; +		ret = -ENOMEM; +		if (!force) +			return ret;  	}  	counter->usage += val;  	if (counter->usage > counter->max_usage)  		counter->max_usage = counter->usage; -	return 0; +	return ret;  } -int res_counter_charge(struct res_counter *counter, unsigned long val, -			struct res_counter **limit_fail_at) +static int __res_counter_charge(struct res_counter *counter, unsigned long val, +				struct res_counter **limit_fail_at, bool force)  { -	int ret; +	int ret, r;  	unsigned long flags;  	struct res_counter *c, *u; +	r = ret = 0;  	*limit_fail_at = NULL;  	local_irq_save(flags);  	for (c = counter; c != NULL; c = c->parent) {  		spin_lock(&c->lock); -		ret = res_counter_charge_locked(c, val); +		r = res_counter_charge_locked(c, val, force);  		spin_unlock(&c->lock); -		if (ret < 0) { +		if (r < 0 && !ret) { +			ret = r;  			*limit_fail_at = c; -			goto undo; +			if (!force) +				break;  		}  	} -	ret = 0; -	goto done; -undo: -	for (u = counter; u != c; u = u->parent) { -		spin_lock(&u->lock); -		res_counter_uncharge_locked(u, val); -		spin_unlock(&u->lock); + +	if (ret < 0 && !force) { +		for (u = counter; u != c; u = u->parent) { +			spin_lock(&u->lock); +			
res_counter_uncharge_locked(u, val); +			spin_unlock(&u->lock); +		}  	} -done:  	local_irq_restore(flags); +  	return ret;  } +int res_counter_charge(struct res_counter *counter, unsigned long val, +			struct res_counter **limit_fail_at) +{ +	return __res_counter_charge(counter, val, limit_fail_at, false); +} +  int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,  			      struct res_counter **limit_fail_at)  { -	int ret, r; -	unsigned long flags; -	struct res_counter *c; - -	r = ret = 0; -	*limit_fail_at = NULL; -	local_irq_save(flags); -	for (c = counter; c != NULL; c = c->parent) { -		spin_lock(&c->lock); -		r = res_counter_charge_locked(c, val); -		if (r) -			c->usage += val; -		spin_unlock(&c->lock); -		if (r < 0 && ret == 0) { -			*limit_fail_at = c; -			ret = r; -		} -	} -	local_irq_restore(flags); - -	return ret; +	return __res_counter_charge(counter, val, limit_fail_at, true);  } +  void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)  {  	if (WARN_ON(counter->usage < val)) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a7dd35102a..173ea52f3af 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o  obj-$(CONFIG_SCHEDSTATS) += stats.o  obj-$(CONFIG_SCHED_DEBUG) += debug.o - - diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e5212ae294f..d833cc94eed 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -83,6 +83,7 @@  #include "sched.h"  #include "../workqueue_sched.h" +#include "../smpboot.h"  #define CREATE_TRACE_POINTS  #include <trace/events/sched.h> @@ -692,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data)  }  #endif -void update_cpu_load(struct rq *this_rq); -  static void set_load_weight(struct task_struct *p)  {  	int prio = p->static_prio - MAX_RT_PRIO; @@ -1913,7 +1912,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,  		    struct task_struct *next)  {  	sched_info_switch(prev, next); -	perf_event_task_sched_out(prev, next); +	perf_event_task_sched(prev, next);  	fire_sched_out_preempt_notifiers(prev, next);  	prepare_lock_switch(rq, next);  	prepare_arch_switch(next); @@ -1956,13 +1955,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	 */  	prev_state = prev->state;  	finish_arch_switch(prev); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -	local_irq_disable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -	perf_event_task_sched_in(prev, current); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -	local_irq_enable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */  	finish_lock_switch(rq, prev);  	finish_arch_post_lock_switch(); @@ -2083,6 +2075,7 @@ context_switch(struct rq *rq, struct task_struct *prev,  #endif  	/* Here we just switch the register state and the stack. */ +	rcu_switch_from(prev);  	switch_to(prev, next, prev);  	barrier(); @@ -2486,22 +2479,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)   * scheduler tick (TICK_NSEC). With tickless idle this will not be called   * every tick. We fix it up based on jiffies.   
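
For reference, the catch-up performed for missed ticks amounts to decaying each cpu_load[] index once per missed tick toward the new (idle, hence zero) load; decay_load_missed(), visible in the hunk context below, does this with precomputed factors. A naive sketch of the same idea, illustrative only and not the kernel helper:

/* Index 0 is the "fasttrack" index and simply takes the new load directly. */
static unsigned long decay_idle_load(unsigned long load,
				     unsigned long missed_ticks, int idx)
{
	while (missed_ticks--)
		load -= load >> idx;	/* one tick's worth of decay toward a zero load */
	return load;
}
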
*/ -void update_cpu_load(struct rq *this_rq) +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, +			      unsigned long pending_updates)  { -	unsigned long this_load = this_rq->load.weight; -	unsigned long curr_jiffies = jiffies; -	unsigned long pending_updates;  	int i, scale;  	this_rq->nr_load_updates++; -	/* Avoid repeated calls on same jiffy, when moving in and out of idle */ -	if (curr_jiffies == this_rq->last_load_update_tick) -		return; - -	pending_updates = curr_jiffies - this_rq->last_load_update_tick; -	this_rq->last_load_update_tick = curr_jiffies; -  	/* Update our load: */  	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */  	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { @@ -2526,9 +2510,45 @@ void update_cpu_load(struct rq *this_rq)  	sched_avg_update(this_rq);  } +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +void update_idle_cpu_load(struct rq *this_rq) +{ +	unsigned long curr_jiffies = jiffies; +	unsigned long load = this_rq->load.weight; +	unsigned long pending_updates; + +	/* +	 * Bloody broken means of dealing with nohz, but better than nothing.. +	 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy +	 * update and see 0 difference the one time and 2 the next, even though +	 * we ticked at roughtly the same rate. +	 * +	 * Hence we only use this from nohz_idle_balance() and skip this +	 * nonsense when called from the scheduler_tick() since that's +	 * guaranteed a stable rate. +	 */ +	if (load || curr_jiffies == this_rq->last_load_update_tick) +		return; + +	pending_updates = curr_jiffies - this_rq->last_load_update_tick; +	this_rq->last_load_update_tick = curr_jiffies; + +	__update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from scheduler_tick() + */  static void update_cpu_load_active(struct rq *this_rq)  { -	update_cpu_load(this_rq); +	/* +	 * See the mess in update_idle_cpu_load(). +	 */ +	this_rq->last_load_update_tick = jiffies; +	__update_cpu_load(this_rq, this_rq->load.weight, 1);  	calc_load_account_active(this_rq);  } @@ -3113,6 +3133,7 @@ static noinline void __schedule_bug(struct task_struct *prev)  	if (irqs_disabled())  		print_irqtrace_events(prev);  	dump_stack(); +	add_taint(TAINT_WARN);  }  /* @@ -5560,7 +5581,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,  			break;  		} -		if (cpumask_intersects(groupmask, sched_group_cpus(group))) { +		if (!(sd->flags & SD_OVERLAP) && +		    cpumask_intersects(groupmask, sched_group_cpus(group))) {  			printk(KERN_CONT "\n");  			printk(KERN_ERR "ERROR: repeated CPUs\n");  			break; @@ -5898,99 +5920,11 @@ static int __init isolated_cpu_setup(char *str)  __setup("isolcpus=", isolated_cpu_setup); -#ifdef CONFIG_NUMA - -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. 
- */ -static int find_next_best_node(int node, nodemask_t *used_nodes) -{ -	int i, n, val, min_val, best_node = -1; - -	min_val = INT_MAX; - -	for (i = 0; i < nr_node_ids; i++) { -		/* Start at @node */ -		n = (node + i) % nr_node_ids; - -		if (!nr_cpus_node(n)) -			continue; - -		/* Skip already used nodes */ -		if (node_isset(n, *used_nodes)) -			continue; - -		/* Simple min distance search */ -		val = node_distance(node, n); - -		if (val < min_val) { -			min_val = val; -			best_node = n; -		} -	} - -	if (best_node != -1) -		node_set(best_node, *used_nodes); -	return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @span: resulting cpumask - * - * Given a node, construct a good cpumask for its sched_domain to span. It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. - */ -static void sched_domain_node_span(int node, struct cpumask *span) -{ -	nodemask_t used_nodes; -	int i; - -	cpumask_clear(span); -	nodes_clear(used_nodes); - -	cpumask_or(span, span, cpumask_of_node(node)); -	node_set(node, used_nodes); - -	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { -		int next_node = find_next_best_node(node, &used_nodes); -		if (next_node < 0) -			break; -		cpumask_or(span, span, cpumask_of_node(next_node)); -	} -} - -static const struct cpumask *cpu_node_mask(int cpu) -{ -	lockdep_assert_held(&sched_domains_mutex); - -	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); - -	return sched_domains_tmpmask; -} - -static const struct cpumask *cpu_allnodes_mask(int cpu) -{ -	return cpu_possible_mask; -} -#endif /* CONFIG_NUMA */ -  static const struct cpumask *cpu_cpu_mask(int cpu)  {  	return cpumask_of_node(cpu_to_node(cpu));  } -int sched_smt_power_savings = 0, sched_mc_power_savings = 0; -  struct sd_data {  	struct sched_domain **__percpu sd;  	struct sched_group **__percpu sg; @@ -6020,6 +5954,7 @@ struct sched_domain_topology_level {  	sched_domain_init_f init;  	sched_domain_mask_f mask;  	int		    flags; +	int		    numa_level;  	struct sd_data      data;  }; @@ -6211,10 +6146,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\  }  SD_INIT_FUNC(CPU) -#ifdef CONFIG_NUMA - SD_INIT_FUNC(ALLNODES) - SD_INIT_FUNC(NODE) -#endif  #ifdef CONFIG_SCHED_SMT   SD_INIT_FUNC(SIBLING)  #endif @@ -6336,15 +6267,184 @@ static struct sched_domain_topology_level default_topology[] = {  	{ sd_init_BOOK, cpu_book_mask, },  #endif  	{ sd_init_CPU, cpu_cpu_mask, }, -#ifdef CONFIG_NUMA -	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, -	{ sd_init_ALLNODES, cpu_allnodes_mask, }, -#endif  	{ NULL, },  };  static struct sched_domain_topology_level *sched_domain_topology = default_topology; +#ifdef CONFIG_NUMA + +static int sched_domains_numa_levels; +static int sched_domains_numa_scale; +static int *sched_domains_numa_distance; +static struct cpumask ***sched_domains_numa_masks; +static int sched_domains_curr_level; + +static inline int sd_local_flags(int level) +{ +	if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) +		return 0; + +	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; +} + +static struct sched_domain * +sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +{ +	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); +	int level = tl->numa_level; +	int sd_weight = cpumask_weight( +			sched_domains_numa_masks[level][cpu_to_node(cpu)]); + +	*sd = (struct sched_domain){ +		.min_interval		= sd_weight, +		
.max_interval		= 2*sd_weight, +		.busy_factor		= 32, +		.imbalance_pct		= 125, +		.cache_nice_tries	= 2, +		.busy_idx		= 3, +		.idle_idx		= 2, +		.newidle_idx		= 0, +		.wake_idx		= 0, +		.forkexec_idx		= 0, + +		.flags			= 1*SD_LOAD_BALANCE +					| 1*SD_BALANCE_NEWIDLE +					| 0*SD_BALANCE_EXEC +					| 0*SD_BALANCE_FORK +					| 0*SD_BALANCE_WAKE +					| 0*SD_WAKE_AFFINE +					| 0*SD_PREFER_LOCAL +					| 0*SD_SHARE_CPUPOWER +					| 0*SD_SHARE_PKG_RESOURCES +					| 1*SD_SERIALIZE +					| 0*SD_PREFER_SIBLING +					| sd_local_flags(level) +					, +		.last_balance		= jiffies, +		.balance_interval	= sd_weight, +	}; +	SD_INIT_NAME(sd, NUMA); +	sd->private = &tl->data; + +	/* +	 * Ugly hack to pass state to sd_numa_mask()... +	 */ +	sched_domains_curr_level = tl->numa_level; + +	return sd; +} + +static const struct cpumask *sd_numa_mask(int cpu) +{ +	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; +} + +static void sched_init_numa(void) +{ +	int next_distance, curr_distance = node_distance(0, 0); +	struct sched_domain_topology_level *tl; +	int level = 0; +	int i, j, k; + +	sched_domains_numa_scale = curr_distance; +	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); +	if (!sched_domains_numa_distance) +		return; + +	/* +	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the +	 * unique distances in the node_distance() table. +	 * +	 * Assumes node_distance(0,j) includes all distances in +	 * node_distance(i,j) in order to avoid cubic time. +	 * +	 * XXX: could be optimized to O(n log n) by using sort() +	 */ +	next_distance = curr_distance; +	for (i = 0; i < nr_node_ids; i++) { +		for (j = 0; j < nr_node_ids; j++) { +			int distance = node_distance(0, j); +			if (distance > curr_distance && +					(distance < next_distance || +					 next_distance == curr_distance)) +				next_distance = distance; +		} +		if (next_distance != curr_distance) { +			sched_domains_numa_distance[level++] = next_distance; +			sched_domains_numa_levels = level; +			curr_distance = next_distance; +		} else break; +	} +	/* +	 * 'level' contains the number of unique distances, excluding the +	 * identity distance node_distance(i,i). +	 * +	 * The sched_domains_nume_distance[] array includes the actual distance +	 * numbers. +	 */ + +	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); +	if (!sched_domains_numa_masks) +		return; + +	/* +	 * Now for each level, construct a mask per node which contains all +	 * cpus of nodes that are that many hops away from us. +	 */ +	for (i = 0; i < level; i++) { +		sched_domains_numa_masks[i] = +			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); +		if (!sched_domains_numa_masks[i]) +			return; + +		for (j = 0; j < nr_node_ids; j++) { +			struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); +			if (!mask) +				return; + +			sched_domains_numa_masks[i][j] = mask; + +			for (k = 0; k < nr_node_ids; k++) { +				if (node_distance(j, k) > sched_domains_numa_distance[i]) +					continue; + +				cpumask_or(mask, mask, cpumask_of_node(k)); +			} +		} +	} + +	tl = kzalloc((ARRAY_SIZE(default_topology) + level) * +			sizeof(struct sched_domain_topology_level), GFP_KERNEL); +	if (!tl) +		return; + +	/* +	 * Copy the default topology bits.. +	 */ +	for (i = 0; default_topology[i].init; i++) +		tl[i] = default_topology[i]; + +	/* +	 * .. and append 'j' levels of NUMA goodness. 
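
To make the distance deduplication above concrete, here is a toy userspace program with an assumed 4-node SLIT-style table (not real hardware data) that collects the distinct off-node distances seen from node 0; each distinct value corresponds to one appended NUMA topology level:

#include <stdio.h>

#define NR_NODES 4

static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES];
	int nlevels = 0;

	for (int j = 0; j < NR_NODES; j++) {
		int d = dist[0][j];
		int seen = 0;

		if (d == dist[0][0])		/* skip the identity distance */
			continue;
		for (int i = 0; i < nlevels; i++)
			if (levels[i] == d)
				seen = 1;
		if (!seen)
			levels[nlevels++] = d;
	}
	/* Here levels = {20, 30}, so two NUMA sched-domain levels get appended. */
	printf("%d NUMA levels\n", nlevels);
	return 0;
}
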
+	 */ +	for (j = 0; j < level; i++, j++) { +		tl[i] = (struct sched_domain_topology_level){ +			.init = sd_numa_init, +			.mask = sd_numa_mask, +			.flags = SDTL_OVERLAP, +			.numa_level = j, +		}; +	} + +	sched_domain_topology = tl; +} +#else +static inline void sched_init_numa(void) +{ +} +#endif /* CONFIG_NUMA */ +  static int __sdt_alloc(const struct cpumask *cpu_map)  {  	struct sched_domain_topology_level *tl; @@ -6712,97 +6812,6 @@ match2:  	mutex_unlock(&sched_domains_mutex);  } -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void reinit_sched_domains(void) -{ -	get_online_cpus(); - -	/* Destroy domains first to force the rebuild */ -	partition_sched_domains(0, NULL, NULL); - -	rebuild_sched_domains(); -	put_online_cpus(); -} - -static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) -{ -	unsigned int level = 0; - -	if (sscanf(buf, "%u", &level) != 1) -		return -EINVAL; - -	/* -	 * level is always be positive so don't check for -	 * level < POWERSAVINGS_BALANCE_NONE which is 0 -	 * What happens on 0 or 1 byte write, -	 * need to check for count as well? -	 */ - -	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) -		return -EINVAL; - -	if (smt) -		sched_smt_power_savings = level; -	else -		sched_mc_power_savings = level; - -	reinit_sched_domains(); - -	return count; -} - -#ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct device *dev, -					   struct device_attribute *attr, -					   char *buf) -{ -	return sprintf(buf, "%u\n", sched_mc_power_savings); -} -static ssize_t sched_mc_power_savings_store(struct device *dev, -					    struct device_attribute *attr, -					    const char *buf, size_t count) -{ -	return sched_power_savings_store(buf, count, 0); -} -static DEVICE_ATTR(sched_mc_power_savings, 0644, -		   sched_mc_power_savings_show, -		   sched_mc_power_savings_store); -#endif - -#ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct device *dev, -					    struct device_attribute *attr, -					    char *buf) -{ -	return sprintf(buf, "%u\n", sched_smt_power_savings); -} -static ssize_t sched_smt_power_savings_store(struct device *dev, -					    struct device_attribute *attr, -					     const char *buf, size_t count) -{ -	return sched_power_savings_store(buf, count, 1); -} -static DEVICE_ATTR(sched_smt_power_savings, 0644, -		   sched_smt_power_savings_show, -		   sched_smt_power_savings_store); -#endif - -int __init sched_create_sysfs_power_savings_entries(struct device *dev) -{ -	int err = 0; - -#ifdef CONFIG_SCHED_SMT -	if (smt_capable()) -		err = device_create_file(dev, &dev_attr_sched_smt_power_savings); -#endif -#ifdef CONFIG_SCHED_MC -	if (!err && mc_capable()) -		err = device_create_file(dev, &dev_attr_sched_mc_power_savings); -#endif -	return err; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -  /*   * Update cpusets according to cpu_active mask.  
If cpusets are   * disabled, cpuset_update_active_cpus() becomes a simple wrapper @@ -6840,6 +6849,8 @@ void __init sched_init_smp(void)  	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);  	alloc_cpumask_var(&fallback_doms, GFP_KERNEL); +	sched_init_numa(); +  	get_online_cpus();  	mutex_lock(&sched_domains_mutex);  	init_sched_domains(cpu_active_mask); @@ -7061,6 +7072,7 @@ void __init sched_init(void)  	/* May be allocated at isolcpus cmdline parse time */  	if (cpu_isolated_map == NULL)  		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); +	idle_thread_set_boot_cpu();  #endif  	init_sched_fair_class(); @@ -7982,13 +7994,9 @@ static struct cftype cpu_files[] = {  		.write_u64 = cpu_rt_period_write_uint,  	},  #endif +	{ }	/* terminate */  }; -static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ -	return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); -} -  struct cgroup_subsys cpu_cgroup_subsys = {  	.name		= "cpu",  	.create		= cpu_cgroup_create, @@ -7996,8 +8004,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {  	.can_attach	= cpu_cgroup_can_attach,  	.attach		= cpu_cgroup_attach,  	.exit		= cpu_cgroup_exit, -	.populate	= cpu_cgroup_populate,  	.subsys_id	= cpu_cgroup_subsys_id, +	.base_cftypes	= cpu_files,  	.early_init	= 1,  }; @@ -8182,13 +8190,9 @@ static struct cftype files[] = {  		.name = "stat",  		.read_map = cpuacct_stats_show,  	}, +	{ }	/* terminate */  }; -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ -	return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); -} -  /*   * charge this task's execution time to its accounting group.   * @@ -8220,7 +8224,7 @@ struct cgroup_subsys cpuacct_subsys = {  	.name = "cpuacct",  	.create = cpuacct_create,  	.destroy = cpuacct_destroy, -	.populate = cpuacct_populate,  	.subsys_id = cpuacct_subsys_id, +	.base_cftypes = files,  };  #endif	/* CONFIG_CGROUP_CPUACCT */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 09acaa15161..6f79596e0ea 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)  			SPLIT_NS(spread0));  	SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",  			cfs_rq->nr_spread_over); -	SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); +	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);  	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);  #ifdef CONFIG_FAIR_GROUP_SCHED  #ifdef CONFIG_SMP @@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu)  	SEQ_printf(m, "\ncpu#%d\n", cpu);  #endif -#define P(x) \ -	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define P(x)								\ +do {									\ +	if (sizeof(rq->x) == 4)						\ +		SEQ_printf(m, "  .%-30s: %ld\n", #x, (long)(rq->x));	\ +	else								\ +		SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rq->x));\ +} while (0) +  #define PN(x) \  	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9553640c1c..940e6d17cf9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)  		 * If power savings logic is enabled for a domain, see if we  		 * are not overloaded, if so, don't balance wider.  		 
*/ -		if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { +		if (tmp->flags & (SD_PREFER_LOCAL)) {  			unsigned long power = 0;  			unsigned long nr_running = 0;  			unsigned long capacity; @@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)  			capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); -			if (tmp->flags & SD_POWERSAVINGS_BALANCE) -				nr_running /= 2; -  			if (nr_running < capacity)  				want_sd = 0;  		} @@ -3082,7 +3079,7 @@ struct lb_env {  	struct rq		*dst_rq;  	enum cpu_idle_type	idle; -	long			load_move; +	long			imbalance;  	unsigned int		flags;  	unsigned int		loop; @@ -3218,7 +3215,7 @@ static unsigned long task_h_load(struct task_struct *p);  static const unsigned int sched_nr_migrate_break = 32;  /* - * move_tasks tries to move up to load_move weighted load from busiest to + * move_tasks tries to move up to imbalance weighted load from busiest to   * this_rq, as part of a balancing operation within domain "sd".   * Returns 1 if successful and 0 otherwise.   * @@ -3231,7 +3228,7 @@ static int move_tasks(struct lb_env *env)  	unsigned long load;  	int pulled = 0; -	if (env->load_move <= 0) +	if (env->imbalance <= 0)  		return 0;  	while (!list_empty(tasks)) { @@ -3257,7 +3254,7 @@ static int move_tasks(struct lb_env *env)  		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)  			goto next; -		if ((load / 2) > env->load_move) +		if ((load / 2) > env->imbalance)  			goto next;  		if (!can_migrate_task(p, env)) @@ -3265,7 +3262,7 @@ static int move_tasks(struct lb_env *env)  		move_task(p, env);  		pulled++; -		env->load_move -= load; +		env->imbalance -= load;  #ifdef CONFIG_PREEMPT  		/* @@ -3281,7 +3278,7 @@ static int move_tasks(struct lb_env *env)  		 * We only want to steal up to the prescribed amount of  		 * weighted load.  		 */ -		if (env->load_move <= 0) +		if (env->imbalance <= 0)  			break;  		continue; @@ -3435,14 +3432,6 @@ struct sd_lb_stats {  	unsigned int  busiest_group_weight;  	int group_imb; /* Is there imbalance in this sd */ -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -	int power_savings_balance; /* Is powersave balance needed for this sd */ -	struct sched_group *group_min; /* Least loaded group in sd */ -	struct sched_group *group_leader; /* Group which relieves group_min */ -	unsigned long min_load_per_task; /* load_per_task in group_min */ -	unsigned long leader_nr_running; /* Nr running of group_leader */ -	unsigned long min_nr_running; /* Nr running of group_min */ -#endif  };  /* @@ -3486,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,  	return load_idx;  } - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * init_sd_power_savings_stats - Initialize power savings statistics for - * the given sched_domain, during load balancing. - * - * @sd: Sched domain whose power-savings statistics are to be initialized. - * @sds: Variable containing the statistics for sd. - * @idle: Idle status of the CPU at which we're performing load-balancing. - */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, -	struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ -	/* -	 * Busy processors will not participate in power savings -	 * balance. 
-	 */ -	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) -		sds->power_savings_balance = 0; -	else { -		sds->power_savings_balance = 1; -		sds->min_nr_running = ULONG_MAX; -		sds->leader_nr_running = 0; -	} -} - -/** - * update_sd_power_savings_stats - Update the power saving stats for a - * sched_domain while performing load balancing. - * - * @group: sched_group belonging to the sched_domain under consideration. - * @sds: Variable containing the statistics of the sched_domain - * @local_group: Does group contain the CPU for which we're performing - * 		load balancing ? - * @sgs: Variable containing the statistics of the group. - */ -static inline void update_sd_power_savings_stats(struct sched_group *group, -	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - -	if (!sds->power_savings_balance) -		return; - -	/* -	 * If the local group is idle or completely loaded -	 * no need to do power savings balance at this domain -	 */ -	if (local_group && (sds->this_nr_running >= sgs->group_capacity || -				!sds->this_nr_running)) -		sds->power_savings_balance = 0; - -	/* -	 * If a group is already running at full capacity or idle, -	 * don't include that group in power savings calculations -	 */ -	if (!sds->power_savings_balance || -		sgs->sum_nr_running >= sgs->group_capacity || -		!sgs->sum_nr_running) -		return; - -	/* -	 * Calculate the group which has the least non-idle load. -	 * This is the group from where we need to pick up the load -	 * for saving power -	 */ -	if ((sgs->sum_nr_running < sds->min_nr_running) || -	    (sgs->sum_nr_running == sds->min_nr_running && -	     group_first_cpu(group) > group_first_cpu(sds->group_min))) { -		sds->group_min = group; -		sds->min_nr_running = sgs->sum_nr_running; -		sds->min_load_per_task = sgs->sum_weighted_load / -						sgs->sum_nr_running; -	} - -	/* -	 * Calculate the group which is almost near its -	 * capacity but still has some space to pick up some load -	 * from other group and save more power -	 */ -	if (sgs->sum_nr_running + 1 > sgs->group_capacity) -		return; - -	if (sgs->sum_nr_running > sds->leader_nr_running || -	    (sgs->sum_nr_running == sds->leader_nr_running && -	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) { -		sds->group_leader = group; -		sds->leader_nr_running = sgs->sum_nr_running; -	} -} - -/** - * check_power_save_busiest_group - see if there is potential for some power-savings balance - * @sds: Variable containing the statistics of the sched_domain - *	under consideration. - * @this_cpu: Cpu at which we're currently performing load-balancing. - * @imbalance: Variable to store the imbalance. - * - * Description: - * Check if we have potential to perform some power-savings balance. - * If yes, set the busiest group to be the least loaded group in the - * sched_domain, so that it's CPUs can be put to idle. - * - * Returns 1 if there is potential to perform power-savings balance. - * Else returns 0. 
- */ -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, -					int this_cpu, unsigned long *imbalance) -{ -	if (!sds->power_savings_balance) -		return 0; - -	if (sds->this != sds->group_leader || -			sds->group_leader == sds->group_min) -		return 0; - -	*imbalance = sds->min_load_per_task; -	sds->busiest = sds->group_min; - -	return 1; - -} -#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, -	struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ -	return; -} - -static inline void update_sd_power_savings_stats(struct sched_group *group, -	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ -	return; -} - -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, -					int this_cpu, unsigned long *imbalance) -{ -	return 0; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - -  unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)  {  	return SCHED_POWER_SCALE; @@ -3765,24 +3612,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)   * update_sg_lb_stats - Update sched_group's statistics for load balancing.   * @sd: The sched_domain whose statistics are to be updated.   * @group: sched_group whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu   * @load_idx: Load index of sched_domain of this_cpu for load calc.   * @local_group: Does group contain this_cpu.   * @cpus: Set of cpus considered for load balancing.   * @balance: Should we balance.   * @sgs: variable to hold the statistics for this group.   */ -static inline void update_sg_lb_stats(struct sched_domain *sd, -			struct sched_group *group, int this_cpu, -			enum cpu_idle_type idle, int load_idx, +static inline void update_sg_lb_stats(struct lb_env *env, +			struct sched_group *group, int load_idx,  			int local_group, const struct cpumask *cpus,  			int *balance, struct sg_lb_stats *sgs)  { -	unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; -	int i; +	unsigned long nr_running, max_nr_running, min_nr_running; +	unsigned long load, max_cpu_load, min_cpu_load;  	unsigned int balance_cpu = -1, first_idle_cpu = 0;  	unsigned long avg_load_per_task = 0; +	int i;  	if (local_group)  		balance_cpu = group_first_cpu(group); @@ -3791,10 +3636,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,  	max_cpu_load = 0;  	min_cpu_load = ~0UL;  	max_nr_running = 0; +	min_nr_running = ~0UL;  	for_each_cpu_and(i, sched_group_cpus(group), cpus) {  		struct rq *rq = cpu_rq(i); +		nr_running = rq->nr_running; +  		/* Bias balancing toward cpus of our domain */  		if (local_group) {  			if (idle_cpu(i) && !first_idle_cpu) { @@ -3805,16 +3653,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,  			load = target_load(i, load_idx);  		} else {  			load = source_load(i, load_idx); -			if (load > max_cpu_load) { +			if (load > max_cpu_load)  				max_cpu_load = load; -				max_nr_running = rq->nr_running; -			}  			if (min_cpu_load > load)  				min_cpu_load = load; + +			if (nr_running > max_nr_running) +				max_nr_running = nr_running; +			if (min_nr_running > nr_running) +				min_nr_running = nr_running;  		}  		sgs->group_load += load; -		sgs->sum_nr_running += rq->nr_running; +		sgs->sum_nr_running += nr_running;  		sgs->sum_weighted_load += weighted_cpuload(i);  		if (idle_cpu(i))  			sgs->idle_cpus++; @@ -3827,14 +3678,14 @@ static inline void 
update_sg_lb_stats(struct sched_domain *sd,  	 * to do the newly idle load balance.  	 */  	if (local_group) { -		if (idle != CPU_NEWLY_IDLE) { -			if (balance_cpu != this_cpu) { +		if (env->idle != CPU_NEWLY_IDLE) { +			if (balance_cpu != env->dst_cpu) {  				*balance = 0;  				return;  			} -			update_group_power(sd, this_cpu); +			update_group_power(env->sd, env->dst_cpu);  		} else if (time_after_eq(jiffies, group->sgp->next_update)) -			update_group_power(sd, this_cpu); +			update_group_power(env->sd, env->dst_cpu);  	}  	/* Adjust by relative CPU power of the group */ @@ -3852,13 +3703,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,  	if (sgs->sum_nr_running)  		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; -	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) +	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && +	    (max_nr_running - min_nr_running) > 1)  		sgs->group_imb = 1;  	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,  						SCHED_POWER_SCALE);  	if (!sgs->group_capacity) -		sgs->group_capacity = fix_small_capacity(sd, group); +		sgs->group_capacity = fix_small_capacity(env->sd, group);  	sgs->group_weight = group->group_weight;  	if (sgs->group_capacity > sgs->sum_nr_running) @@ -3876,11 +3728,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,   * Determine if @sg is a busier group than the previously selected   * busiest group.   */ -static bool update_sd_pick_busiest(struct sched_domain *sd, +static bool update_sd_pick_busiest(struct lb_env *env,  				   struct sd_lb_stats *sds,  				   struct sched_group *sg, -				   struct sg_lb_stats *sgs, -				   int this_cpu) +				   struct sg_lb_stats *sgs)  {  	if (sgs->avg_load <= sds->max_load)  		return false; @@ -3896,8 +3747,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,  	 * numbered CPUs in the group, therefore mark all groups  	 * higher than ourself as busy.  	 */ -	if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && -	    this_cpu < group_first_cpu(sg)) { +	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && +	    env->dst_cpu < group_first_cpu(sg)) {  		if (!sds->busiest)  			return true; @@ -3917,28 +3768,27 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,   * @balance: Should we balance.   * @sds: variable to hold the statistics for this sched_domain.   
*/ -static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, -			enum cpu_idle_type idle, const struct cpumask *cpus, -			int *balance, struct sd_lb_stats *sds) +static inline void update_sd_lb_stats(struct lb_env *env, +				      const struct cpumask *cpus, +				      int *balance, struct sd_lb_stats *sds)  { -	struct sched_domain *child = sd->child; -	struct sched_group *sg = sd->groups; +	struct sched_domain *child = env->sd->child; +	struct sched_group *sg = env->sd->groups;  	struct sg_lb_stats sgs;  	int load_idx, prefer_sibling = 0;  	if (child && child->flags & SD_PREFER_SIBLING)  		prefer_sibling = 1; -	init_sd_power_savings_stats(sd, sds, idle); -	load_idx = get_sd_load_idx(sd, idle); +	load_idx = get_sd_load_idx(env->sd, env->idle);  	do {  		int local_group; -		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); +		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));  		memset(&sgs, 0, sizeof(sgs)); -		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, -				local_group, cpus, balance, &sgs); +		update_sg_lb_stats(env, sg, load_idx, local_group, +				   cpus, balance, &sgs);  		if (local_group && !(*balance))  			return; @@ -3966,7 +3816,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,  			sds->this_load_per_task = sgs.sum_weighted_load;  			sds->this_has_capacity = sgs.group_has_capacity;  			sds->this_idle_cpus = sgs.idle_cpus; -		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { +		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {  			sds->max_load = sgs.avg_load;  			sds->busiest = sg;  			sds->busiest_nr_running = sgs.sum_nr_running; @@ -3978,9 +3828,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,  			sds->group_imb = sgs.group_imb;  		} -		update_sd_power_savings_stats(sg, sds, local_group, &sgs);  		sg = sg->next; -	} while (sg != sd->groups); +	} while (sg != env->sd->groups);  }  /** @@ -4008,24 +3857,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,   * @this_cpu: The cpu at whose sched_domain we're performing load-balance.   * @imbalance: returns amount of imbalanced due to packing.   */ -static int check_asym_packing(struct sched_domain *sd, -			      struct sd_lb_stats *sds, -			      int this_cpu, unsigned long *imbalance) +static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)  {  	int busiest_cpu; -	if (!(sd->flags & SD_ASYM_PACKING)) +	if (!(env->sd->flags & SD_ASYM_PACKING))  		return 0;  	if (!sds->busiest)  		return 0;  	busiest_cpu = group_first_cpu(sds->busiest); -	if (this_cpu > busiest_cpu) +	if (env->dst_cpu > busiest_cpu)  		return 0; -	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, -				       SCHED_POWER_SCALE); +	env->imbalance = DIV_ROUND_CLOSEST( +		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); +  	return 1;  } @@ -4037,8 +3885,8 @@ static int check_asym_packing(struct sched_domain *sd,   * @this_cpu: The cpu at whose sched_domain we're performing load-balance.   * @imbalance: Variable to store the imbalance.   
*/ -static inline void fix_small_imbalance(struct sd_lb_stats *sds, -				int this_cpu, unsigned long *imbalance) +static inline +void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)  {  	unsigned long tmp, pwr_now = 0, pwr_move = 0;  	unsigned int imbn = 2; @@ -4049,9 +3897,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,  		if (sds->busiest_load_per_task >  				sds->this_load_per_task)  			imbn = 1; -	} else +	} else {  		sds->this_load_per_task = -			cpu_avg_load_per_task(this_cpu); +			cpu_avg_load_per_task(env->dst_cpu); +	}  	scaled_busy_load_per_task = sds->busiest_load_per_task  					 * SCHED_POWER_SCALE; @@ -4059,7 +3908,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,  	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=  			(scaled_busy_load_per_task * imbn)) { -		*imbalance = sds->busiest_load_per_task; +		env->imbalance = sds->busiest_load_per_task;  		return;  	} @@ -4096,18 +3945,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,  	/* Move if we gain throughput */  	if (pwr_move > pwr_now) -		*imbalance = sds->busiest_load_per_task; +		env->imbalance = sds->busiest_load_per_task;  }  /**   * calculate_imbalance - Calculate the amount of imbalance present within the   *			 groups of a given sched_domain during load balance. + * @env: load balance environment   * @sds: statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: Cpu for which currently load balance is being performed. - * @imbalance: The variable to store the imbalance.   */ -static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, -		unsigned long *imbalance) +static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)  {  	unsigned long max_pull, load_above_capacity = ~0UL; @@ -4123,8 +3970,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,  	 * its cpu_power, while calculating max_load..)  	 */  	if (sds->max_load < sds->avg_load) { -		*imbalance = 0; -		return fix_small_imbalance(sds, this_cpu, imbalance); +		env->imbalance = 0; +		return fix_small_imbalance(env, sds);  	}  	if (!sds->group_imb) { @@ -4152,7 +3999,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,  	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);  	/* How much load to actually move to equalise the imbalance */ -	*imbalance = min(max_pull * sds->busiest->sgp->power, +	env->imbalance = min(max_pull * sds->busiest->sgp->power,  		(sds->avg_load - sds->this_load) * sds->this->sgp->power)  			/ SCHED_POWER_SCALE; @@ -4162,8 +4009,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,  	 * a think about bumping its value to force at least one task to be  	 * moved  	 */ -	if (*imbalance < sds->busiest_load_per_task) -		return fix_small_imbalance(sds, this_cpu, imbalance); +	if (env->imbalance < sds->busiest_load_per_task) +		return fix_small_imbalance(env, sds);  } @@ -4194,9 +4041,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,   *		   put to idle by rebalancing its tasks onto our group.   
*/  static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, -		   unsigned long *imbalance, enum cpu_idle_type idle, -		   const struct cpumask *cpus, int *balance) +find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)  {  	struct sd_lb_stats sds; @@ -4206,7 +4051,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,  	 * Compute the various statistics relavent for load balancing at  	 * this level.  	 */ -	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); +	update_sd_lb_stats(env, cpus, balance, &sds);  	/*  	 * this_cpu is not the appropriate cpu to perform load balancing at @@ -4215,8 +4060,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,  	if (!(*balance))  		goto ret; -	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && -	    check_asym_packing(sd, &sds, this_cpu, imbalance)) +	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && +	    check_asym_packing(env, &sds))  		return sds.busiest;  	/* There is no busy sibling group to pull tasks from */ @@ -4234,7 +4079,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,  		goto force_balance;  	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ -	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && +	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&  			!sds.busiest_has_capacity)  		goto force_balance; @@ -4252,7 +4097,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,  	if (sds.this_load >= sds.avg_load)  		goto out_balanced; -	if (idle == CPU_IDLE) { +	if (env->idle == CPU_IDLE) {  		/*  		 * This cpu is idle. If the busiest group load doesn't  		 * have more tasks than the number of available cpu's and @@ -4267,34 +4112,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,  		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use  		 * imbalance_pct to be conservative.  		 */ -		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) +		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)  			goto out_balanced;  	}  force_balance:  	/* Looks like there is an imbalance. Compute it */ -	calculate_imbalance(&sds, this_cpu, imbalance); +	calculate_imbalance(env, &sds);  	return sds.busiest;  out_balanced: -	/* -	 * There is no obvious imbalance. But check if we can do some balancing -	 * to save power. -	 */ -	if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) -		return sds.busiest;  ret: -	*imbalance = 0; +	env->imbalance = 0;  	return NULL;  }  /*   * find_busiest_queue - find the busiest runqueue among the cpus in group.   */ -static struct rq * -find_busiest_queue(struct sched_domain *sd, struct sched_group *group, -		   enum cpu_idle_type idle, unsigned long imbalance, -		   const struct cpumask *cpus) +static struct rq *find_busiest_queue(struct lb_env *env, +				     struct sched_group *group, +				     const struct cpumask *cpus)  {  	struct rq *busiest = NULL, *rq;  	unsigned long max_load = 0; @@ -4307,7 +4145,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,  		unsigned long wl;  		if (!capacity) -			capacity = fix_small_capacity(sd, group); +			capacity = fix_small_capacity(env->sd, group);  		if (!cpumask_test_cpu(i, cpus))  			continue; @@ -4319,7 +4157,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,  		 * When comparing with imbalance, use weighted_cpuload()  		 * which is not scaled with the cpu power.  		 
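The conversions above replace the long sd/this_cpu/imbalance/idle argument lists with a single load-balancing environment threaded through the whole path. The struct definition itself sits outside these hunks; judging only from the members the new code dereferences (env->sd, env->dst_cpu, env->src_cpu, env->src_rq, env->idle, env->imbalance, env->flags, env->loop_max), it is roughly the sketch below, which is inferred from usage rather than the authoritative definition:

struct lb_env {
	struct sched_domain	*sd;		/* domain being balanced */

	int			src_cpu;	/* busiest cpu */
	struct rq		*src_rq;	/* busiest runqueue */

	int			dst_cpu;	/* cpu doing the balancing */
	struct rq		*dst_rq;	/* its runqueue (assumed) */

	enum cpu_idle_type	idle;		/* idle state of dst_cpu */
	long			imbalance;	/* replaces the *imbalance out-param */

	unsigned int		flags;		/* e.g. LBF_ALL_PINNED */
	unsigned int		loop_max;	/* cap on tasks scanned per pass */
};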
*/ -		if (capacity && rq->nr_running == 1 && wl > imbalance) +		if (capacity && rq->nr_running == 1 && wl > env->imbalance)  			continue;  		/* @@ -4348,40 +4186,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,  /* Working cpumask for load_balance and load_balance_newidle. */  DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int idle, -			       int busiest_cpu, int this_cpu) +static int need_active_balance(struct lb_env *env)  { -	if (idle == CPU_NEWLY_IDLE) { +	struct sched_domain *sd = env->sd; + +	if (env->idle == CPU_NEWLY_IDLE) {  		/*  		 * ASYM_PACKING needs to force migrate tasks from busy but  		 * higher numbered CPUs in order to pack all tasks in the  		 * lowest numbered CPUs.  		 */ -		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) +		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)  			return 1; - -		/* -		 * The only task running in a non-idle cpu can be moved to this -		 * cpu in an attempt to completely freeup the other CPU -		 * package. -		 * -		 * The package power saving logic comes from -		 * find_busiest_group(). If there are no imbalance, then -		 * f_b_g() will return NULL. However when sched_mc={1,2} then -		 * f_b_g() will select a group from which a running task may be -		 * pulled to this cpu in order to make the other package idle. -		 * If there is no opportunity to make a package idle and if -		 * there are no imbalance, then f_b_g() will return NULL and no -		 * action will be taken in load_balance_newidle(). -		 * -		 * Under normal task pull operation due to imbalance, there -		 * will be more than one task in the source run queue and -		 * move_tasks() will succeed.  ld_moved will be true and this -		 * active balance code will not be triggered. -		 */ -		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) -			return 0;  	}  	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); @@ -4399,7 +4216,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,  {  	int ld_moved, active_balance = 0;  	struct sched_group *group; -	unsigned long imbalance;  	struct rq *busiest;  	unsigned long flags;  	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); @@ -4417,8 +4233,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,  	schedstat_inc(sd, lb_count[idle]);  redo: -	group = find_busiest_group(sd, this_cpu, &imbalance, idle, -				   cpus, balance); +	group = find_busiest_group(&env, cpus, balance);  	if (*balance == 0)  		goto out_balanced; @@ -4428,7 +4243,7 @@ redo:  		goto out_balanced;  	} -	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); +	busiest = find_busiest_queue(&env, group, cpus);  	if (!busiest) {  		schedstat_inc(sd, lb_nobusyq[idle]);  		goto out_balanced; @@ -4436,7 +4251,7 @@ redo:  	BUG_ON(busiest == this_rq); -	schedstat_add(sd, lb_imbalance[idle], imbalance); +	schedstat_add(sd, lb_imbalance[idle], env.imbalance);  	ld_moved = 0;  	if (busiest->nr_running > 1) { @@ -4447,10 +4262,9 @@ redo:  		 * correctly treated as an imbalance.  		 
*/  		env.flags |= LBF_ALL_PINNED; -		env.load_move	= imbalance; -		env.src_cpu	= busiest->cpu; -		env.src_rq	= busiest; -		env.loop_max	= min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running); +		env.src_cpu   = busiest->cpu; +		env.src_rq    = busiest; +		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);  more_balance:  		local_irq_save(flags); @@ -4492,7 +4306,7 @@ more_balance:  		if (idle != CPU_NEWLY_IDLE)  			sd->nr_balance_failed++; -		if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { +		if (need_active_balance(&env)) {  			raw_spin_lock_irqsave(&busiest->lock, flags);  			/* don't kick the active_load_balance_cpu_stop, @@ -4519,10 +4333,11 @@ more_balance:  			}  			raw_spin_unlock_irqrestore(&busiest->lock, flags); -			if (active_balance) +			if (active_balance) {  				stop_one_cpu_nowait(cpu_of(busiest),  					active_load_balance_cpu_stop, busiest,  					&busiest->active_balance_work); +			}  			/*  			 * We've kicked active balancing, reset the failure @@ -4703,104 +4518,15 @@ static struct {  	unsigned long next_balance;     /* in jiffy units */  } nohz ____cacheline_aligned; -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * lowest_flag_domain - Return lowest sched_domain containing flag. - * @cpu:	The cpu whose lowest level of sched domain is to - *		be returned. - * @flag:	The flag to check for the lowest sched_domain - *		for the given cpu. - * - * Returns the lowest sched_domain of a cpu which contains the given flag. - */ -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -{ -	struct sched_domain *sd; - -	for_each_domain(cpu, sd) -		if (sd->flags & flag) -			break; - -	return sd; -} - -/** - * for_each_flag_domain - Iterates over sched_domains containing the flag. - * @cpu:	The cpu whose domains we're iterating over. - * @sd:		variable holding the value of the power_savings_sd - *		for cpu. - * @flag:	The flag to filter the sched_domains to be iterated. - * - * Iterates over all the scheduler domains for a given cpu that has the 'flag' - * set, starting from the lowest sched_domain to the highest. - */ -#define for_each_flag_domain(cpu, sd, flag) \ -	for (sd = lowest_flag_domain(cpu, flag); \ -		(sd && (sd->flags & flag)); sd = sd->parent) - -/** - * find_new_ilb - Finds the optimum idle load balancer for nomination. - * @cpu:	The cpu which is nominating a new idle_load_balancer. - * - * Returns:	Returns the id of the idle load balancer if it exists, - *		Else, returns >= nr_cpu_ids. - * - * This algorithm picks the idle load balancer such that it belongs to a - * semi-idle powersavings sched_domain. The idea is to try and avoid - * completely idle packages/cores just for the purpose of idle load balancing - * when there are other idle cpu's which are better suited for that job. - */ -static int find_new_ilb(int cpu) +static inline int find_new_ilb(int call_cpu)  {  	int ilb = cpumask_first(nohz.idle_cpus_mask); -	struct sched_group *ilbg; -	struct sched_domain *sd; - -	/* -	 * Have idle load balancer selection from semi-idle packages only -	 * when power-aware load balancing is enabled -	 */ -	if (!(sched_smt_power_savings || sched_mc_power_savings)) -		goto out_done; - -	/* -	 * Optimize for the case when we have no idle CPUs or only one -	 * idle CPU. 
Don't walk the sched_domain hierarchy in such cases -	 */ -	if (cpumask_weight(nohz.idle_cpus_mask) < 2) -		goto out_done; - -	rcu_read_lock(); -	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { -		ilbg = sd->groups; - -		do { -			if (ilbg->group_weight != -				atomic_read(&ilbg->sgp->nr_busy_cpus)) { -				ilb = cpumask_first_and(nohz.idle_cpus_mask, -							sched_group_cpus(ilbg)); -				goto unlock; -			} - -			ilbg = ilbg->next; - -		} while (ilbg != sd->groups); -	} -unlock: -	rcu_read_unlock(); -out_done:  	if (ilb < nr_cpu_ids && idle_cpu(ilb))  		return ilb;  	return nr_cpu_ids;  } -#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ -static inline int find_new_ilb(int call_cpu) -{ -	return nr_cpu_ids; -} -#endif  /*   * Kick a CPU to do the nohz balancing, if it is time for it. We pick the @@ -5023,7 +4749,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)  		raw_spin_lock_irq(&this_rq->lock);  		update_rq_clock(this_rq); -		update_cpu_load(this_rq); +		update_idle_cpu_load(this_rq);  		raw_spin_unlock_irq(&this_rq->lock);  		rebalance_domains(balance_cpu, CPU_IDLE); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 91b4c957f28..b44d604b35d 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -4,7 +4,7 @@   * idle-task scheduling class.   *   * (NOTE: these are not related to SCHED_IDLE tasks which are - *  handled in sched_fair.c) + *  handled in sched/fair.c)   */  #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 44af55e6d5d..c5565c3c515 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1803,44 +1803,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)  static void set_cpus_allowed_rt(struct task_struct *p,  				const struct cpumask *new_mask)  { -	int weight = cpumask_weight(new_mask); +	struct rq *rq; +	int weight;  	BUG_ON(!rt_task(p)); -	/* -	 * Update the migration status of the RQ if we have an RT task -	 * which is running AND changing its weight value. -	 */ -	if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { -		struct rq *rq = task_rq(p); - -		if (!task_current(rq, p)) { -			/* -			 * Make sure we dequeue this task from the pushable list -			 * before going further.  It will either remain off of -			 * the list because we are no longer pushable, or it -			 * will be requeued. -			 */ -			if (p->rt.nr_cpus_allowed > 1) -				dequeue_pushable_task(rq, p); +	if (!p->on_rq) +		return; -			/* -			 * Requeue if our weight is changing and still > 1 -			 */ -			if (weight > 1) -				enqueue_pushable_task(rq, p); +	weight = cpumask_weight(new_mask); -		} +	/* +	 * Only update if the process changes its state from whether it +	 * can migrate or not. 
+	 */ +	if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) +		return; -		if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { -			rq->rt.rt_nr_migratory++; -		} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { -			BUG_ON(!rq->rt.rt_nr_migratory); -			rq->rt.rt_nr_migratory--; -		} +	rq = task_rq(p); -		update_rt_migration(&rq->rt); +	/* +	 * The process used to be able to migrate OR it can now migrate +	 */ +	if (weight <= 1) { +		if (!task_current(rq, p)) +			dequeue_pushable_task(rq, p); +		BUG_ON(!rq->rt.rt_nr_migratory); +		rq->rt.rt_nr_migratory--; +	} else { +		if (!task_current(rq, p)) +			enqueue_pushable_task(rq, p); +		rq->rt.rt_nr_migratory++;  	} + +	update_rt_migration(&rq->rt);  }  /* Assumes rq->lock is held */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb3acba4d52..ba9dccfd24c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -201,7 +201,7 @@ struct cfs_bandwidth { };  /* CFS-related fields in a runqueue */  struct cfs_rq {  	struct load_weight load; -	unsigned long nr_running, h_nr_running; +	unsigned int nr_running, h_nr_running;  	u64 exec_clock;  	u64 min_vruntime; @@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void)  /* Real-Time classes' related field in a runqueue: */  struct rt_rq {  	struct rt_prio_array active; -	unsigned long rt_nr_running; +	unsigned int rt_nr_running;  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED  	struct {  		int curr; /* highest queued rt task prio */ @@ -353,7 +353,7 @@ struct rq {  	 * nr_running and cpu_load should be in the same cacheline because  	 * remote CPUs use both these fields when doing load calculation.  	 */ -	unsigned long nr_running; +	unsigned int nr_running;  	#define CPU_LOAD_IDX_MAX 5  	unsigned long cpu_load[CPU_LOAD_IDX_MAX];  	unsigned long last_load_update_tick; @@ -876,7 +876,7 @@ extern void resched_cpu(int cpu);  extern struct rt_bandwidth def_rt_bandwidth;  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); -extern void update_cpu_load(struct rq *this_rq); +extern void update_idle_cpu_load(struct rq *this_rq);  #ifdef CONFIG_CGROUP_CPUACCT  #include <linux/cgroup.h> diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e8d76c5895e..ee376beedaf 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -3,16 +3,357 @@   *   * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>   * - * This defines a simple but solid secure-computing mode. + * Copyright (C) 2012 Google, Inc. + * Will Drewry <wad@chromium.org> + * + * This defines a simple but solid secure-computing facility. + * + * Mode 1 uses a fixed list of allowed system calls. + * Mode 2 allows user-defined system call filters in the form + *        of Berkeley Packet Filters/Linux Socket Filters.   */ +#include <linux/atomic.h>  #include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/sched.h>  #include <linux/compat.h> +#include <linux/sched.h> +#include <linux/seccomp.h>  /* #define SECCOMP_DEBUG 1 */ -#define NR_SECCOMP_MODES 1 + +#ifdef CONFIG_SECCOMP_FILTER +#include <asm/syscall.h> +#include <linux/filter.h> +#include <linux/ptrace.h> +#include <linux/security.h> +#include <linux/slab.h> +#include <linux/tracehook.h> +#include <linux/uaccess.h> + +/** + * struct seccomp_filter - container for seccomp BPF programs + * + * @usage: reference count to manage the object lifetime. + *         get/put helpers should be used when accessing an instance + *         outside of a lifetime-guarded section.  
In general, this + *         is only needed for handling filters shared across tasks. + * @prev: points to a previously installed, or inherited, filter + * @len: the number of instructions in the program + * @insns: the BPF program instructions to evaluate + * + * seccomp_filter objects are organized in a tree linked via the @prev + * pointer.  For any task, it appears to be a singly-linked list starting + * with current->seccomp.filter, the most recently attached or inherited filter. + * However, multiple filters may share a @prev node, by way of fork(), which + * results in a unidirectional tree existing in memory.  This is similar to + * how namespaces work. + * + * seccomp_filter objects should never be modified after being attached + * to a task_struct (other than @usage). + */ +struct seccomp_filter { +	atomic_t usage; +	struct seccomp_filter *prev; +	unsigned short len;  /* Instruction count */ +	struct sock_filter insns[]; +}; + +/* Limit any path through the tree to 256KB worth of instructions. */ +#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) + +/** + * get_u32 - returns a u32 offset into data + * @data: a unsigned 64 bit value + * @index: 0 or 1 to return the first or second 32-bits + * + * This inline exists to hide the length of unsigned long.  If a 32-bit + * unsigned long is passed in, it will be extended and the top 32-bits will be + * 0. If it is a 64-bit unsigned long, then whatever data is resident will be + * properly returned. + * + * Endianness is explicitly ignored and left for BPF program authors to manage + * as per the specific architecture. + */ +static inline u32 get_u32(u64 data, int index) +{ +	return ((u32 *)&data)[index]; +} + +/* Helper for bpf_load below. */ +#define BPF_DATA(_name) offsetof(struct seccomp_data, _name) +/** + * bpf_load: checks and returns a pointer to the requested offset + * @off: offset into struct seccomp_data to load from + * + * Returns the requested 32-bits of data. + * seccomp_check_filter() should assure that @off is 32-bit aligned + * and not out of bounds.  Failure to do so is a BUG. + */ +u32 seccomp_bpf_load(int off) +{ +	struct pt_regs *regs = task_pt_regs(current); +	if (off == BPF_DATA(nr)) +		return syscall_get_nr(current, regs); +	if (off == BPF_DATA(arch)) +		return syscall_get_arch(current, regs); +	if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { +		unsigned long value; +		int arg = (off - BPF_DATA(args[0])) / sizeof(u64); +		int index = !!(off % sizeof(u64)); +		syscall_get_arguments(current, regs, arg, 1, &value); +		return get_u32(value, index); +	} +	if (off == BPF_DATA(instruction_pointer)) +		return get_u32(KSTK_EIP(current), 0); +	if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) +		return get_u32(KSTK_EIP(current), 1); +	/* seccomp_check_filter should make this impossible. */ +	BUG(); +} + +/** + *	seccomp_check_filter - verify seccomp filter code + *	@filter: filter to verify + *	@flen: length of filter + * + * Takes a previously checked filter (by sk_chk_filter) and + * redirects all filter code that loads struct sk_buff data + * and related data through seccomp_bpf_load.  It also + * enforces length and alignment checking of those loads. + * + * Returns 0 if the rule set is legal or -EINVAL if not. 
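For reference while reading seccomp_bpf_load() and seccomp_check_filter() above: the layout this series exposes to filters (defined elsewhere in the series, not in this hunk) is reproduced below, together with the kind of absolute load that gets rewritten to BPF_S_ANC_SECCOMP_LD_W:

#include <linux/filter.h>
#include <linux/stddef.h>
#include <linux/types.h>

/* Data made available to seccomp BPF programs (reference copy). */
struct seccomp_data {
	int	nr;			/* system call number */
	__u32	arch;			/* AUDIT_ARCH_* value */
	__u64	instruction_pointer;	/* IP at syscall entry */
	__u64	args[6];		/* syscall arguments, zero-extended */
};

/* A 32-bit aligned, in-bounds load that passes seccomp_check_filter(). */
static const struct sock_filter load_arch =
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, arch));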
+ */ +static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) +{ +	int pc; +	for (pc = 0; pc < flen; pc++) { +		struct sock_filter *ftest = &filter[pc]; +		u16 code = ftest->code; +		u32 k = ftest->k; + +		switch (code) { +		case BPF_S_LD_W_ABS: +			ftest->code = BPF_S_ANC_SECCOMP_LD_W; +			/* 32-bit aligned and not out of bounds. */ +			if (k >= sizeof(struct seccomp_data) || k & 3) +				return -EINVAL; +			continue; +		case BPF_S_LD_W_LEN: +			ftest->code = BPF_S_LD_IMM; +			ftest->k = sizeof(struct seccomp_data); +			continue; +		case BPF_S_LDX_W_LEN: +			ftest->code = BPF_S_LDX_IMM; +			ftest->k = sizeof(struct seccomp_data); +			continue; +		/* Explicitly include allowed calls. */ +		case BPF_S_RET_K: +		case BPF_S_RET_A: +		case BPF_S_ALU_ADD_K: +		case BPF_S_ALU_ADD_X: +		case BPF_S_ALU_SUB_K: +		case BPF_S_ALU_SUB_X: +		case BPF_S_ALU_MUL_K: +		case BPF_S_ALU_MUL_X: +		case BPF_S_ALU_DIV_X: +		case BPF_S_ALU_AND_K: +		case BPF_S_ALU_AND_X: +		case BPF_S_ALU_OR_K: +		case BPF_S_ALU_OR_X: +		case BPF_S_ALU_LSH_K: +		case BPF_S_ALU_LSH_X: +		case BPF_S_ALU_RSH_K: +		case BPF_S_ALU_RSH_X: +		case BPF_S_ALU_NEG: +		case BPF_S_LD_IMM: +		case BPF_S_LDX_IMM: +		case BPF_S_MISC_TAX: +		case BPF_S_MISC_TXA: +		case BPF_S_ALU_DIV_K: +		case BPF_S_LD_MEM: +		case BPF_S_LDX_MEM: +		case BPF_S_ST: +		case BPF_S_STX: +		case BPF_S_JMP_JA: +		case BPF_S_JMP_JEQ_K: +		case BPF_S_JMP_JEQ_X: +		case BPF_S_JMP_JGE_K: +		case BPF_S_JMP_JGE_X: +		case BPF_S_JMP_JGT_K: +		case BPF_S_JMP_JGT_X: +		case BPF_S_JMP_JSET_K: +		case BPF_S_JMP_JSET_X: +			continue; +		default: +			return -EINVAL; +		} +	} +	return 0; +} + +/** + * seccomp_run_filters - evaluates all seccomp filters against @syscall + * @syscall: number of the current system call + * + * Returns valid seccomp BPF response codes. + */ +static u32 seccomp_run_filters(int syscall) +{ +	struct seccomp_filter *f; +	u32 ret = SECCOMP_RET_ALLOW; + +	/* Ensure unexpected behavior doesn't result in failing open. */ +	if (WARN_ON(current->seccomp.filter == NULL)) +		return SECCOMP_RET_KILL; + +	/* +	 * All filters in the list are evaluated and the lowest BPF return +	 * value always takes priority (ignoring the DATA). +	 */ +	for (f = current->seccomp.filter; f; f = f->prev) { +		u32 cur_ret = sk_run_filter(NULL, f->insns); +		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) +			ret = cur_ret; +	} +	return ret; +} + +/** + * seccomp_attach_filter: Attaches a seccomp filter to current. + * @fprog: BPF program to install + * + * Returns 0 on success or an errno on failure. + */ +static long seccomp_attach_filter(struct sock_fprog *fprog) +{ +	struct seccomp_filter *filter; +	unsigned long fp_size = fprog->len * sizeof(struct sock_filter); +	unsigned long total_insns = fprog->len; +	long ret; + +	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) +		return -EINVAL; + +	for (filter = current->seccomp.filter; filter; filter = filter->prev) +		total_insns += filter->len + 4;  /* include a 4 instr penalty */ +	if (total_insns > MAX_INSNS_PER_PATH) +		return -ENOMEM; + +	/* +	 * Installing a seccomp filter requires that the task have +	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. +	 * This avoids scenarios where unprivileged tasks can affect the +	 * behavior of privileged children. 
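One detail of seccomp_run_filters() above that is easy to miss: with several filters installed, the verdict whose action field is numerically lowest wins, which orders the actions from most to least severe (SECCOMP_RET_KILL, then _TRAP, _ERRNO, _TRACE, _ALLOW), and the data bits of losing verdicts are discarded. A one-line equivalent of that reduction step, for illustration only:

#include <linux/seccomp.h>
#include <linux/types.h>

/* Keep whichever of two filter results has the more severe action. */
static u32 seccomp_pick_verdict(u32 a, u32 b)
{
	return ((a & SECCOMP_RET_ACTION) < (b & SECCOMP_RET_ACTION)) ? a : b;
}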
+	 */ +	if (!current->no_new_privs && +	    security_capable_noaudit(current_cred(), current_user_ns(), +				     CAP_SYS_ADMIN) != 0) +		return -EACCES; + +	/* Allocate a new seccomp_filter */ +	filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, +			 GFP_KERNEL|__GFP_NOWARN); +	if (!filter) +		return -ENOMEM; +	atomic_set(&filter->usage, 1); +	filter->len = fprog->len; + +	/* Copy the instructions from fprog. */ +	ret = -EFAULT; +	if (copy_from_user(filter->insns, fprog->filter, fp_size)) +		goto fail; + +	/* Check and rewrite the fprog via the skb checker */ +	ret = sk_chk_filter(filter->insns, filter->len); +	if (ret) +		goto fail; + +	/* Check and rewrite the fprog for seccomp use */ +	ret = seccomp_check_filter(filter->insns, filter->len); +	if (ret) +		goto fail; + +	/* +	 * If there is an existing filter, make it the prev and don't drop its +	 * task reference. +	 */ +	filter->prev = current->seccomp.filter; +	current->seccomp.filter = filter; +	return 0; +fail: +	kfree(filter); +	return ret; +} + +/** + * seccomp_attach_user_filter - attaches a user-supplied sock_fprog + * @user_filter: pointer to the user data containing a sock_fprog. + * + * Returns 0 on success and non-zero otherwise. + */ +long seccomp_attach_user_filter(char __user *user_filter) +{ +	struct sock_fprog fprog; +	long ret = -EFAULT; + +#ifdef CONFIG_COMPAT +	if (is_compat_task()) { +		struct compat_sock_fprog fprog32; +		if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) +			goto out; +		fprog.len = fprog32.len; +		fprog.filter = compat_ptr(fprog32.filter); +	} else /* falls through to the if below. */ +#endif +	if (copy_from_user(&fprog, user_filter, sizeof(fprog))) +		goto out; +	ret = seccomp_attach_filter(&fprog); +out: +	return ret; +} + +/* get_seccomp_filter - increments the reference count of the filter on @tsk */ +void get_seccomp_filter(struct task_struct *tsk) +{ +	struct seccomp_filter *orig = tsk->seccomp.filter; +	if (!orig) +		return; +	/* Reference count is bounded by the number of total processes. */ +	atomic_inc(&orig->usage); +} + +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ +	struct seccomp_filter *orig = tsk->seccomp.filter; +	/* Clean up single-reference branches iteratively. */ +	while (orig && atomic_dec_and_test(&orig->usage)) { +		struct seccomp_filter *freeme = orig; +		orig = orig->prev; +		kfree(freeme); +	} +} + +/** + * seccomp_send_sigsys - signals the task to allow in-process syscall emulation + * @syscall: syscall number to send to userland + * @reason: filter-supplied reason code to send to userland (via si_errno) + * + * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. + */ +static void seccomp_send_sigsys(int syscall, int reason) +{ +	struct siginfo info; +	memset(&info, 0, sizeof(info)); +	info.si_signo = SIGSYS; +	info.si_code = SYS_SECCOMP; +	info.si_call_addr = (void __user *)KSTK_EIP(current); +	info.si_errno = reason; +	info.si_arch = syscall_get_arch(current, task_pt_regs(current)); +	info.si_syscall = syscall; +	force_sig_info(SIGSYS, &info, current); +} +#endif	/* CONFIG_SECCOMP_FILTER */  /*   * Secure computing mode 1 allows only read/write/exit/sigreturn. 
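On the receiving end of seccomp_send_sigsys() above, a SECCOMP_RET_TRAP verdict shows up in user space as SIGSYS with si_code == SYS_SECCOMP, the filter's 16 data bits in si_errno and the faulting call site in si_call_addr (see the copy_siginfo_to_user() hunk further down). A minimal handler sketch follows; it sticks to si_code and si_errno because whether si_call_addr/si_syscall/si_arch are visible depends on the libc headers in use:

#include <signal.h>
#include <stdio.h>
#include <string.h>

#ifndef SYS_SECCOMP
#define SYS_SECCOMP 1	/* si_code value seen by user space for seccomp SIGSYS */
#endif

static void sigsys_handler(int sig, siginfo_t *info, void *ucontext)
{
	/* fprintf() is not async-signal-safe; used here purely for illustration */
	if (info->si_code == SYS_SECCOMP)
		fprintf(stderr, "seccomp SIGSYS, filter reason=%d\n",
			info->si_errno);
}

static int install_sigsys_handler(void)
{
	struct sigaction act;

	memset(&act, 0, sizeof(act));
	act.sa_sigaction = sigsys_handler;
	act.sa_flags = SA_SIGINFO;
	return sigaction(SIGSYS, &act, NULL);
}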
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = {  };  #endif -void __secure_computing(int this_syscall) +int __secure_computing(int this_syscall)  {  	int mode = current->seccomp.mode; -	int * syscall; +	int exit_sig = 0; +	int *syscall; +	u32 ret;  	switch (mode) { -	case 1: +	case SECCOMP_MODE_STRICT:  		syscall = mode1_syscalls;  #ifdef CONFIG_COMPAT  		if (is_compat_task()) @@ -45,9 +388,54 @@ void __secure_computing(int this_syscall)  #endif  		do {  			if (*syscall == this_syscall) -				return; +				return 0;  		} while (*++syscall); +		exit_sig = SIGKILL; +		ret = SECCOMP_RET_KILL; +		break; +#ifdef CONFIG_SECCOMP_FILTER +	case SECCOMP_MODE_FILTER: { +		int data; +		ret = seccomp_run_filters(this_syscall); +		data = ret & SECCOMP_RET_DATA; +		ret &= SECCOMP_RET_ACTION; +		switch (ret) { +		case SECCOMP_RET_ERRNO: +			/* Set the low-order 16-bits as a errno. */ +			syscall_set_return_value(current, task_pt_regs(current), +						 -data, 0); +			goto skip; +		case SECCOMP_RET_TRAP: +			/* Show the handler the original registers. */ +			syscall_rollback(current, task_pt_regs(current)); +			/* Let the filter pass back 16 bits of data. */ +			seccomp_send_sigsys(this_syscall, data); +			goto skip; +		case SECCOMP_RET_TRACE: +			/* Skip these calls if there is no tracer. */ +			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) +				goto skip; +			/* Allow the BPF to provide the event message */ +			ptrace_event(PTRACE_EVENT_SECCOMP, data); +			/* +			 * The delivery of a fatal signal during event +			 * notification may silently skip tracer notification. +			 * Terminating the task now avoids executing a system +			 * call that may not be intended. +			 */ +			if (fatal_signal_pending(current)) +				break; +			return 0; +		case SECCOMP_RET_ALLOW: +			return 0; +		case SECCOMP_RET_KILL: +		default: +			break; +		} +		exit_sig = SIGSYS;  		break; +	} +#endif  	default:  		BUG();  	} @@ -55,8 +443,13 @@ void __secure_computing(int this_syscall)  #ifdef SECCOMP_DEBUG  	dump_stack();  #endif -	audit_seccomp(this_syscall); -	do_exit(SIGKILL); +	audit_seccomp(this_syscall, exit_sig, ret); +	do_exit(exit_sig); +#ifdef CONFIG_SECCOMP_FILTER +skip: +	audit_seccomp(this_syscall, exit_sig, ret); +#endif +	return -1;  }  long prctl_get_seccomp(void) @@ -64,25 +457,48 @@ long prctl_get_seccomp(void)  	return current->seccomp.mode;  } -long prctl_set_seccomp(unsigned long seccomp_mode) +/** + * prctl_set_seccomp: configures current->seccomp.mode + * @seccomp_mode: requested mode to use + * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * + * This function may be called repeatedly with a @seccomp_mode of + * SECCOMP_MODE_FILTER to install additional filters.  Every filter + * successfully installed will be evaluated (in reverse order) for each system + * call the task makes. + * + * Once current->seccomp.mode is non-zero, it may not be changed. + * + * Returns 0 on success or -EINVAL on failure. 
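Putting the filter pieces above together from user space: build a classic BPF program against struct seccomp_data, set no_new_privs (or hold CAP_SYS_ADMIN), and install the program through the extended prctl(). The sketch below is an illustration, not code from the patch; the syscall it singles out (__NR_uname) and the ENOSYS it returns are arbitrary choices, and it assumes headers that carry the SECCOMP_* and PR_SET_NO_NEW_PRIVS definitions added by this series:

#include <errno.h>
#include <stddef.h>
#include <sys/prctl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <linux/unistd.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38	/* value added to prctl.h by this series */
#endif

static int install_example_filter(void)
{
	struct sock_filter insns[] = {
		/* A = seccomp_data.nr */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* if (A == __NR_uname) return ERRNO(ENOSYS) */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_uname, 0, 1),
		BPF_STMT(BPF_RET | BPF_K,
			 SECCOMP_RET_ERRNO | (ENOSYS & SECCOMP_RET_DATA)),
		/* everything else is allowed */
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};

	/* satisfies the no_new_privs-or-CAP_SYS_ADMIN check in the attach path */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}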
+ */ +long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)  { -	long ret; +	long ret = -EINVAL; -	/* can set it only once to be even more secure */ -	ret = -EPERM; -	if (unlikely(current->seccomp.mode)) +	if (current->seccomp.mode && +	    current->seccomp.mode != seccomp_mode)  		goto out; -	ret = -EINVAL; -	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { -		current->seccomp.mode = seccomp_mode; -		set_thread_flag(TIF_SECCOMP); +	switch (seccomp_mode) { +	case SECCOMP_MODE_STRICT: +		ret = 0;  #ifdef TIF_NOTSC  		disable_TSC();  #endif -		ret = 0; +		break; +#ifdef CONFIG_SECCOMP_FILTER +	case SECCOMP_MODE_FILTER: +		ret = seccomp_attach_user_filter(filter); +		if (ret) +			goto out; +		break; +#endif +	default: +		goto out;  	} - out: +	current->seccomp.mode = seccomp_mode; +	set_thread_flag(TIF_SECCOMP); +out:  	return ret;  } diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 60636a4e25c..4567fc020fe 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable);   * down_trylock - try to acquire the semaphore, without waiting   * @sem: the semaphore to be acquired   * - * Try to acquire the semaphore atomically.  Returns 0 if the mutex has + * Try to acquire the semaphore atomically.  Returns 0 if the semaphore has   * been acquired successfully or 1 if it it cannot be acquired.   *   * NOTE: This return value is inverted from both spin_trylock and diff --git a/kernel/signal.c b/kernel/signal.c index 17afcaf582d..1a006b5d9d9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -160,7 +160,7 @@ void recalc_sigpending(void)  #define SYNCHRONOUS_MASK \  	(sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ -	 sigmask(SIGTRAP) | sigmask(SIGFPE)) +	 sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))  int next_signal(struct sigpending *pending, sigset_t *mask)  { @@ -2706,6 +2706,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)  		err |= __put_user(from->si_uid, &to->si_uid);  		err |= __put_user(from->si_ptr, &to->si_ptr);  		break; +#ifdef __ARCH_SIGSYS +	case __SI_SYS: +		err |= __put_user(from->si_call_addr, &to->si_call_addr); +		err |= __put_user(from->si_syscall, &to->si_syscall); +		err |= __put_user(from->si_arch, &to->si_arch); +		break; +#endif  	default: /* this is just in case for now ... */  		err |= __put_user(from->si_pid, &to->si_pid);  		err |= __put_user(from->si_uid, &to->si_uid); diff --git a/kernel/smp.c b/kernel/smp.c index 2f8b10ecf75..d0ae5b24875 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -13,6 +13,8 @@  #include <linux/smp.h>  #include <linux/cpu.h> +#include "smpboot.h" +  #ifdef CONFIG_USE_GENERIC_SMP_HELPERS  static struct {  	struct list_head	queue; @@ -669,6 +671,8 @@ void __init smp_init(void)  {  	unsigned int cpu; +	idle_threads_init(); +  	/* FIXME: This should be done in userspace --RR */  	for_each_present_cpu(cpu) {  		if (num_online_cpus() >= setup_max_cpus) @@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),  	}  }  EXPORT_SYMBOL(on_each_cpu_cond); + +static void do_nothing(void *unused) +{ +} + +/** + * kick_all_cpus_sync - Force all cpus out of idle + * + * Used to synchronize the update of pm_idle function pointer. It's + * called after the pointer is updated and returns after the dummy + * callback function has been executed on all cpus. The execution of + * the function can only happen on the remote cpus after they have + * left the idle function which had been called via pm_idle function + * pointer. 
So it's guaranteed that nothing uses the previous pointer + * anymore. + */ +void kick_all_cpus_sync(void) +{ +	/* Make sure the change is visible before we kick the cpus */ +	smp_mb(); +	smp_call_function(do_nothing, NULL, 1); +} +EXPORT_SYMBOL_GPL(kick_all_cpus_sync); diff --git a/kernel/smpboot.c b/kernel/smpboot.c new file mode 100644 index 00000000000..e1a797e028a --- /dev/null +++ b/kernel/smpboot.c @@ -0,0 +1,62 @@ +/* + * Common SMP CPU bringup/teardown functions + */ +#include <linux/err.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/percpu.h> + +#include "smpboot.h" + +#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD +/* + * For the hotplug case we keep the task structs around and reuse + * them. + */ +static DEFINE_PER_CPU(struct task_struct *, idle_threads); + +struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) +{ +	struct task_struct *tsk = per_cpu(idle_threads, cpu); + +	if (!tsk) +		return ERR_PTR(-ENOMEM); +	init_idle(tsk, cpu); +	return tsk; +} + +void __init idle_thread_set_boot_cpu(void) +{ +	per_cpu(idle_threads, smp_processor_id()) = current; +} + +static inline void idle_init(unsigned int cpu) +{ +	struct task_struct *tsk = per_cpu(idle_threads, cpu); + +	if (!tsk) { +		tsk = fork_idle(cpu); +		if (IS_ERR(tsk)) +			pr_err("SMP: fork_idle() failed for CPU %u\n", cpu); +		else +			per_cpu(idle_threads, cpu) = tsk; +	} +} + +/** + * idle_thread_init - Initialize the idle thread for a cpu + * @cpu:	The cpu for which the idle thread should be initialized + * + * Creates the thread if it does not exist. + */ +void __init idle_threads_init(void) +{ +	unsigned int cpu; + +	for_each_possible_cpu(cpu) { +		if (cpu != smp_processor_id()) +			idle_init(cpu); +	} +} +#endif diff --git a/kernel/smpboot.h b/kernel/smpboot.h new file mode 100644 index 00000000000..80c0acfb847 --- /dev/null +++ b/kernel/smpboot.h @@ -0,0 +1,18 @@ +#ifndef SMPBOOT_H +#define SMPBOOT_H + +struct task_struct; + +int smpboot_prepare(unsigned int cpu); + +#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD +struct task_struct *idle_thread_get(unsigned int cpu); +void idle_thread_set_boot_cpu(void); +void idle_threads_init(void); +#else +static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; } +static inline void idle_thread_set_boot_cpu(void) { } +static inline void idle_threads_init(void) { } +#endif + +#endif diff --git a/kernel/srcu.c b/kernel/srcu.c index ba35f3a4a1f..2095be3318d 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -34,10 +34,77 @@  #include <linux/delay.h>  #include <linux/srcu.h> +/* + * Initialize an rcu_batch structure to empty. + */ +static inline void rcu_batch_init(struct rcu_batch *b) +{ +	b->head = NULL; +	b->tail = &b->head; +} + +/* + * Enqueue a callback onto the tail of the specified rcu_batch structure. + */ +static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) +{ +	*b->tail = head; +	b->tail = &head->next; +} + +/* + * Is the specified rcu_batch structure empty? + */ +static inline bool rcu_batch_empty(struct rcu_batch *b) +{ +	return b->tail == &b->head; +} + +/* + * Remove the callback at the head of the specified rcu_batch structure + * and return a pointer to it, or return NULL if the structure is empty. 
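The rcu_batch helpers above form a minimal singly linked tail queue of rcu_head callbacks: enqueue appends at ->tail, dequeue pops from ->head and re-initialises the batch once it empties. A trivial usage sketch (illustration only; assumes the caller already ran rcu_batch_init() on the batch):

static void rcu_batch_example(struct rcu_batch *b,
			      struct rcu_head *h1, struct rcu_head *h2)
{
	struct rcu_head *head;

	rcu_batch_queue(b, h1);			/* b: h1 */
	rcu_batch_queue(b, h2);			/* b: h1 -> h2 */

	while ((head = rcu_batch_dequeue(b)) != NULL)
		head->func(head);		/* callbacks run in FIFO order */
}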
+ */ +static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) +{ +	struct rcu_head *head; + +	if (rcu_batch_empty(b)) +		return NULL; + +	head = b->head; +	b->head = head->next; +	if (b->tail == &head->next) +		rcu_batch_init(b); + +	return head; +} + +/* + * Move all callbacks from the rcu_batch structure specified by "from" to + * the structure specified by "to". + */ +static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) +{ +	if (!rcu_batch_empty(from)) { +		*to->tail = from->head; +		to->tail = from->tail; +		rcu_batch_init(from); +	} +} + +/* single-thread state-machine */ +static void process_srcu(struct work_struct *work); +  static int init_srcu_struct_fields(struct srcu_struct *sp)  {  	sp->completed = 0; -	mutex_init(&sp->mutex); +	spin_lock_init(&sp->queue_lock); +	sp->running = false; +	rcu_batch_init(&sp->batch_queue); +	rcu_batch_init(&sp->batch_check0); +	rcu_batch_init(&sp->batch_check1); +	rcu_batch_init(&sp->batch_done); +	INIT_DELAYED_WORK(&sp->work, process_srcu);  	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);  	return sp->per_cpu_ref ? 0 : -ENOMEM;  } @@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);  #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */  /* - * srcu_readers_active_idx -- returns approximate number of readers - *	active on the specified rank of per-CPU counters. + * Returns approximate total of the readers' ->seq[] values for the + * rank of per-CPU counters specified by idx.   */ +static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) +{ +	int cpu; +	unsigned long sum = 0; +	unsigned long t; -static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) +	for_each_possible_cpu(cpu) { +		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); +		sum += t; +	} +	return sum; +} + +/* + * Returns approximate number of readers active on the specified rank + * of the per-CPU ->c[] counters. + */ +static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)  {  	int cpu; -	int sum; +	unsigned long sum = 0; +	unsigned long t; -	sum = 0; -	for_each_possible_cpu(cpu) -		sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; +	for_each_possible_cpu(cpu) { +		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); +		sum += t; +	}  	return sum;  } +/* + * Return true if the number of pre-existing readers is determined to + * be stably zero.  An example unstable zero can occur if the call + * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, + * but due to task migration, sees the corresponding __srcu_read_unlock() + * decrement.  This can happen because srcu_readers_active_idx() takes + * time to sum the array, and might in fact be interrupted or preempted + * partway through the summation. + */ +static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +{ +	unsigned long seq; + +	seq = srcu_readers_seq_idx(sp, idx); + +	/* +	 * The following smp_mb() A pairs with the smp_mb() B located in +	 * __srcu_read_lock().  This pairing ensures that if an +	 * __srcu_read_lock() increments its counter after the summation +	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side +	 * critical section will see any changes made prior to the start +	 * of the current SRCU grace period. +	 * +	 * Also, if the above call to srcu_readers_seq_idx() saw the +	 * increment of ->seq[], then the call to srcu_readers_active_idx() +	 * must see the increment of ->c[]. 
+	 */ +	smp_mb(); /* A */ + +	/* +	 * Note that srcu_readers_active_idx() can incorrectly return +	 * zero even though there is a pre-existing reader throughout. +	 * To see this, suppose that task A is in a very long SRCU +	 * read-side critical section that started on CPU 0, and that +	 * no other reader exists, so that the sum of the counters +	 * is equal to one.  Then suppose that task B starts executing +	 * srcu_readers_active_idx(), summing up to CPU 1, and then that +	 * task C starts reading on CPU 0, so that its increment is not +	 * summed, but finishes reading on CPU 2, so that its decrement +	 * -is- summed.  Then when task B completes its sum, it will +	 * incorrectly get zero, despite the fact that task A has been +	 * in its SRCU read-side critical section the whole time. +	 * +	 * We therefore do a validation step should srcu_readers_active_idx() +	 * return zero. +	 */ +	if (srcu_readers_active_idx(sp, idx) != 0) +		return false; + +	/* +	 * The remainder of this function is the validation step. +	 * The following smp_mb() D pairs with the smp_mb() C in +	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen +	 * by srcu_readers_active_idx() above, then any destructive +	 * operation performed after the grace period will happen after +	 * the corresponding SRCU read-side critical section. +	 * +	 * Note that there can be at most NR_CPUS worth of readers using +	 * the old index, which is not enough to overflow even a 32-bit +	 * integer.  (Yes, this does mean that systems having more than +	 * a billion or so CPUs need to be 64-bit systems.)  Therefore, +	 * the sum of the ->seq[] counters cannot possibly overflow. +	 * Therefore, the only way that the return values of the two +	 * calls to srcu_readers_seq_idx() can be equal is if there were +	 * no increments of the corresponding rank of ->seq[] counts +	 * in the interim.  But the missed-increment scenario laid out +	 * above includes an increment of the ->seq[] counter by +	 * the corresponding __srcu_read_lock().  Therefore, if this +	 * scenario occurs, the return values from the two calls to +	 * srcu_readers_seq_idx() will differ, and thus the validation +	 * step below suffices. +	 */ +	smp_mb(); /* D */ + +	return srcu_readers_seq_idx(sp, idx) == seq; +} +  /**   * srcu_readers_active - returns approximate number of readers.   * @sp: which srcu_struct to count active readers (holding srcu_read_lock). @@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)   */  static int srcu_readers_active(struct srcu_struct *sp)  { -	return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); +	int cpu; +	unsigned long sum = 0; + +	for_each_possible_cpu(cpu) { +		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); +		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); +	} +	return sum;  }  /** @@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp)  	int idx;  	preempt_disable(); -	idx = sp->completed & 0x1; -	barrier();  /* ensure compiler looks -once- at sp->completed. */ -	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; -	srcu_barrier();  /* ensure compiler won't misorder critical section. */ +	idx = rcu_dereference_index_check(sp->completed, +					  rcu_read_lock_sched_held()) & 0x1; +	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; +	smp_mb(); /* B */  /* Avoid leaking the critical section. 
*/ +	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;  	preempt_enable();  	return idx;  } @@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);  void __srcu_read_unlock(struct srcu_struct *sp, int idx)  {  	preempt_disable(); -	srcu_barrier();  /* ensure compiler won't misorder critical section. */ -	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; +	smp_mb(); /* C */  /* Avoid leaking the critical section. */ +	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;  	preempt_enable();  }  EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);   * we repeatedly block for 1-millisecond time periods.  This approach   * has done well in testing, so there is no need for a config parameter.   */ -#define SYNCHRONIZE_SRCU_READER_DELAY 10 +#define SRCU_RETRY_CHECK_DELAY		5 +#define SYNCHRONIZE_SRCU_TRYCOUNT	2 +#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT	12  /* - * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). + * @@@ Wait until all pre-existing readers complete.  Such readers + * will have used the index specified by "idx". + * the caller should ensures the ->completed is not changed while checking + * and idx = (->completed & 1) ^ 1   */ -static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) +static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)  { -	int idx; - -	rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && -			   !lock_is_held(&rcu_bh_lock_map) && -			   !lock_is_held(&rcu_lock_map) && -			   !lock_is_held(&rcu_sched_lock_map), -			   "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); +	for (;;) { +		if (srcu_readers_active_idx_check(sp, idx)) +			return true; +		if (--trycount <= 0) +			return false; +		udelay(SRCU_RETRY_CHECK_DELAY); +	} +} -	idx = sp->completed; -	mutex_lock(&sp->mutex); +/* + * Increment the ->completed counter so that future SRCU readers will + * use the other rank of the ->c[] and ->seq[] arrays.  This allows + * us to wait for pre-existing readers in a starvation-free manner. + */ +static void srcu_flip(struct srcu_struct *sp) +{ +	sp->completed++; +} -	/* -	 * Check to see if someone else did the work for us while we were -	 * waiting to acquire the lock.  We need -two- advances of -	 * the counter, not just one.  If there was but one, we might have -	 * shown up -after- our helper's first synchronize_sched(), thus -	 * having failed to prevent CPU-reordering races with concurrent -	 * srcu_read_unlock()s on other CPUs (see comment below).  So we -	 * either (1) wait for two or (2) supply the second ourselves. -	 */ +/* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. + */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *head, +		void (*func)(struct rcu_head *head)) +{ +	unsigned long flags; -	if ((sp->completed - idx) >= 2) { -		mutex_unlock(&sp->mutex); -		return; +	head->next = NULL; +	head->func = func; +	spin_lock_irqsave(&sp->queue_lock, flags); +	rcu_batch_queue(&sp->batch_queue, head); +	if (!sp->running) { +		sp->running = true; +		queue_delayed_work(system_nrt_wq, &sp->work, 0);  	} +	spin_unlock_irqrestore(&sp->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(call_srcu); -	sync_func();  /* Force memory barrier on all CPUs. 
*/ +struct rcu_synchronize { +	struct rcu_head head; +	struct completion completion; +}; -	/* -	 * The preceding synchronize_sched() ensures that any CPU that -	 * sees the new value of sp->completed will also see any preceding -	 * changes to data structures made by this CPU.  This prevents -	 * some other CPU from reordering the accesses in its SRCU -	 * read-side critical section to precede the corresponding -	 * srcu_read_lock() -- ensuring that such references will in -	 * fact be protected. -	 * -	 * So it is now safe to do the flip. -	 */ +/* + * Awaken the corresponding synchronize_srcu() instance now that a + * grace period has elapsed. + */ +static void wakeme_after_rcu(struct rcu_head *head) +{ +	struct rcu_synchronize *rcu; -	idx = sp->completed & 0x1; -	sp->completed++; +	rcu = container_of(head, struct rcu_synchronize, head); +	complete(&rcu->completion); +} -	sync_func();  /* Force memory barrier on all CPUs. */ +static void srcu_advance_batches(struct srcu_struct *sp, int trycount); +static void srcu_reschedule(struct srcu_struct *sp); -	/* -	 * At this point, because of the preceding synchronize_sched(), -	 * all srcu_read_lock() calls using the old counters have completed. -	 * Their corresponding critical sections might well be still -	 * executing, but the srcu_read_lock() primitives themselves -	 * will have finished executing.  We initially give readers -	 * an arbitrarily chosen 10 microseconds to get out of their -	 * SRCU read-side critical sections, then loop waiting 1/HZ -	 * seconds per iteration.  The 10-microsecond value has done -	 * very well in testing. -	 */ +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). + */ +static void __synchronize_srcu(struct srcu_struct *sp, int trycount) +{ +	struct rcu_synchronize rcu; +	struct rcu_head *head = &rcu.head; +	bool done = false; -	if (srcu_readers_active_idx(sp, idx)) -		udelay(SYNCHRONIZE_SRCU_READER_DELAY); -	while (srcu_readers_active_idx(sp, idx)) -		schedule_timeout_interruptible(1); +	rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && +			   !lock_is_held(&rcu_bh_lock_map) && +			   !lock_is_held(&rcu_lock_map) && +			   !lock_is_held(&rcu_sched_lock_map), +			   "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); -	sync_func();  /* Force memory barrier on all CPUs. */ +	init_completion(&rcu.completion); -	/* -	 * The preceding synchronize_sched() forces all srcu_read_unlock() -	 * primitives that were executing concurrently with the preceding -	 * for_each_possible_cpu() loop to have completed by this point. -	 * More importantly, it also forces the corresponding SRCU read-side -	 * critical sections to have also completed, and the corresponding -	 * references to SRCU-protected data items to be dropped. -	 * -	 * Note: -	 * -	 *	Despite what you might think at first glance, the -	 *	preceding synchronize_sched() -must- be within the -	 *	critical section ended by the following mutex_unlock(). -	 *	Otherwise, a task taking the early exit can race -	 *	with a srcu_read_unlock(), which might have executed -	 *	just before the preceding srcu_readers_active() check, -	 *	and whose CPU might have reordered the srcu_read_unlock() -	 *	with the preceding critical section.  In this case, there -	 *	is nothing preventing the synchronize_sched() task that is -	 *	taking the early exit from freeing a data structure that -	 *	is still being referenced (out of order) by the task -	 *	doing the srcu_read_unlock(). 
-	 * -	 *	Alternatively, the comparison with "2" on the early exit -	 *	could be changed to "3", but this increases synchronize_srcu() -	 *	latency for bulk loads.  So the current code is preferred. -	 */ +	head->next = NULL; +	head->func = wakeme_after_rcu; +	spin_lock_irq(&sp->queue_lock); +	if (!sp->running) { +		/* steal the processing owner */ +		sp->running = true; +		rcu_batch_queue(&sp->batch_check0, head); +		spin_unlock_irq(&sp->queue_lock); -	mutex_unlock(&sp->mutex); +		srcu_advance_batches(sp, trycount); +		if (!rcu_batch_empty(&sp->batch_done)) { +			BUG_ON(sp->batch_done.head != head); +			rcu_batch_dequeue(&sp->batch_done); +			done = true; +		} +		/* give the processing owner to work_struct */ +		srcu_reschedule(sp); +	} else { +		rcu_batch_queue(&sp->batch_queue, head); +		spin_unlock_irq(&sp->queue_lock); +	} + +	if (!done) +		wait_for_completion(&rcu.completion);  }  /** @@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))   */  void synchronize_srcu(struct srcu_struct *sp)  { -	__synchronize_srcu(sp, synchronize_sched); +	__synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);  }  EXPORT_SYMBOL_GPL(synchronize_srcu); @@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);   * synchronize_srcu_expedited - Brute-force SRCU grace period   * @sp: srcu_struct with which to synchronize.   * - * Wait for an SRCU grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly.  This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code.  In fact, - * if you are using synchronize_srcu_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_srcu() instead. + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting.   *   * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal - * to call this function from a CPU-hotplug notifier.  Failing to observe - * these restriction will result in deadlock.  It is also illegal to call + * that is acquired by a CPU-hotplug notifier.  It is also illegal to call   * synchronize_srcu_expedited() from the corresponding SRCU read-side   * critical section; doing so will result in deadlock.  However, it is   * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct @@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);   */  void synchronize_srcu_expedited(struct srcu_struct *sp)  { -	__synchronize_srcu(sp, synchronize_sched_expedited); +	__synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);  }  EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);  /** + * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + */ +void srcu_barrier(struct srcu_struct *sp) +{ +	synchronize_srcu(sp); +} +EXPORT_SYMBOL_GPL(srcu_barrier); + +/**   * srcu_batches_completed - return batches completed.   * @sp: srcu_struct on which to report batch completion.   *   * Report the number of batches, correlated with, but not necessarily   * precisely the same as, the number of grace periods that have elapsed.   
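With call_srcu() and srcu_barrier() added above, SRCU now offers the same update-side choices as the other RCU flavours. A usage sketch follows; the srcu_struct, object type and function names are illustrative, not from the patch, and init_srcu_struct(&my_srcu) is assumed to have run at setup time:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct my_obj {
	struct rcu_head rcu;
	int payload;
};

static struct srcu_struct my_srcu;

static void my_obj_free(struct rcu_head *head)
{
	kfree(container_of(head, struct my_obj, rcu));
}

/* Non-blocking update side: the callback runs after an SRCU grace period. */
static void my_obj_retire(struct my_obj *obj)
{
	call_srcu(&my_srcu, &obj->rcu, my_obj_free);
}

/* Blocking alternative using the existing synchronous primitive. */
static void my_obj_retire_sync(struct my_obj *obj)
{
	synchronize_srcu(&my_srcu);
	kfree(obj);
}

/* Read side is unchanged by this series. */
static int my_obj_read(struct my_obj __rcu **slot)
{
	struct my_obj *obj;
	int idx, val = -1;

	idx = srcu_read_lock(&my_srcu);
	obj = srcu_dereference(*slot, &my_srcu);
	if (obj)
		val = obj->payload;
	srcu_read_unlock(&my_srcu, idx);
	return val;
}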
*/ -  long srcu_batches_completed(struct srcu_struct *sp)  {  	return sp->completed;  }  EXPORT_SYMBOL_GPL(srcu_batches_completed); + +#define SRCU_CALLBACK_BATCH	10 +#define SRCU_INTERVAL		1 + +/* + * Move any new SRCU callbacks to the first stage of the SRCU grace + * period pipeline. + */ +static void srcu_collect_new(struct srcu_struct *sp) +{ +	if (!rcu_batch_empty(&sp->batch_queue)) { +		spin_lock_irq(&sp->queue_lock); +		rcu_batch_move(&sp->batch_check0, &sp->batch_queue); +		spin_unlock_irq(&sp->queue_lock); +	} +} + +/* + * Core SRCU state machine.  Advance callbacks from ->batch_check0 to + * ->batch_check1 and then to ->batch_done as readers drain. + */ +static void srcu_advance_batches(struct srcu_struct *sp, int trycount) +{ +	int idx = 1 ^ (sp->completed & 1); + +	/* +	 * Because readers might be delayed for an extended period after +	 * fetching ->completed for their index, at any point in time there +	 * might well be readers using both idx=0 and idx=1.  We therefore +	 * need to wait for readers to clear from both index values before +	 * invoking a callback. +	 */ + +	if (rcu_batch_empty(&sp->batch_check0) && +	    rcu_batch_empty(&sp->batch_check1)) +		return; /* no callbacks need to be advanced */ + +	if (!try_check_zero(sp, idx, trycount)) +		return; /* failed to advance, will try after SRCU_INTERVAL */ + +	/* +	 * The callbacks in ->batch_check1 have already done with their +	 * first zero check and flip back when they were enqueued on +	 * ->batch_check0 in a previous invocation of srcu_advance_batches(). +	 * (Presumably try_check_zero() returned false during that +	 * invocation, leaving the callbacks stranded on ->batch_check1.) +	 * They are therefore ready to invoke, so move them to ->batch_done. +	 */ +	rcu_batch_move(&sp->batch_done, &sp->batch_check1); + +	if (rcu_batch_empty(&sp->batch_check0)) +		return; /* no callbacks need to be advanced */ +	srcu_flip(sp); + +	/* +	 * The callbacks in ->batch_check0 just finished their +	 * first check zero and flip, so move them to ->batch_check1 +	 * for future checking on the other idx. +	 */ +	rcu_batch_move(&sp->batch_check1, &sp->batch_check0); + +	/* +	 * SRCU read-side critical sections are normally short, so check +	 * at least twice in quick succession after a flip. +	 */ +	trycount = trycount < 2 ? 2 : trycount; +	if (!try_check_zero(sp, idx^1, trycount)) +		return; /* failed to advance, will try after SRCU_INTERVAL */ + +	/* +	 * The callbacks in ->batch_check1 have now waited for all +	 * pre-existing readers using both idx values.  They are therefore +	 * ready to invoke, so move them to ->batch_done. +	 */ +	rcu_batch_move(&sp->batch_done, &sp->batch_check1); +} + +/* + * Invoke a limited number of SRCU callbacks that have passed through + * their grace period.  If there are more to do, SRCU will reschedule + * the workqueue. + */ +static void srcu_invoke_callbacks(struct srcu_struct *sp) +{ +	int i; +	struct rcu_head *head; + +	for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { +		head = rcu_batch_dequeue(&sp->batch_done); +		if (!head) +			break; +		local_bh_disable(); +		head->func(head); +		local_bh_enable(); +	} +} + +/* + * Finished one round of SRCU grace period.  Start another if there are + * more SRCU callbacks queued, otherwise put SRCU into not-running state. 
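/*
 * Editor's note: an illustrative model, not part of the patch above and
 * not the file's actual definitions.  The rcu_batch_* helpers used by
 * srcu_collect_new() and srcu_advance_batches() behave like a simple
 * tail-queue of rcu_head callbacks; my_batch and the my_batch_*()
 * functions below are invented names sketching that behaviour.
 */
#include <linux/rcupdate.h>
#include <linux/types.h>

struct my_batch {
	struct rcu_head *head, **tail;
};

static void my_batch_init(struct my_batch *b)
{
	b->head = NULL;
	b->tail = &b->head;
}

static bool my_batch_empty(struct my_batch *b)
{
	return b->tail == &b->head;
}

static void my_batch_queue(struct my_batch *b, struct rcu_head *rhp)
{
	rhp->next = NULL;
	*b->tail = rhp;		/* append at the tail ... */
	b->tail = &rhp->next;	/* ... and remember the new tail */
}

static struct rcu_head *my_batch_dequeue(struct my_batch *b)
{
	struct rcu_head *rhp = b->head;

	if (rhp) {
		b->head = rhp->next;
		if (!b->head)
			b->tail = &b->head;	/* queue is empty again */
	}
	return rhp;
}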
+ */ +static void srcu_reschedule(struct srcu_struct *sp) +{ +	bool pending = true; + +	if (rcu_batch_empty(&sp->batch_done) && +	    rcu_batch_empty(&sp->batch_check1) && +	    rcu_batch_empty(&sp->batch_check0) && +	    rcu_batch_empty(&sp->batch_queue)) { +		spin_lock_irq(&sp->queue_lock); +		if (rcu_batch_empty(&sp->batch_done) && +		    rcu_batch_empty(&sp->batch_check1) && +		    rcu_batch_empty(&sp->batch_check0) && +		    rcu_batch_empty(&sp->batch_queue)) { +			sp->running = false; +			pending = false; +		} +		spin_unlock_irq(&sp->queue_lock); +	} + +	if (pending) +		queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); +} + +/* + * This is the work-queue function that handles SRCU grace periods. + */ +static void process_srcu(struct work_struct *work) +{ +	struct srcu_struct *sp; + +	sp = container_of(work, struct srcu_struct, work.work); + +	srcu_collect_new(sp); +	srcu_advance_batches(sp, 1); +	srcu_invoke_callbacks(sp); +	srcu_reschedule(sp); +} diff --git a/kernel/sys.c b/kernel/sys.c index e7006eb6c1e..ba0ae8eea6f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  			error = prctl_get_seccomp();  			break;  		case PR_SET_SECCOMP: -			error = prctl_set_seccomp(arg2); +			error = prctl_set_seccomp(arg2, (char __user *)arg3);  			break;  		case PR_GET_TSC:  			error = GET_TSC_CTL(arg2); @@ -1979,6 +1979,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  			error = put_user(me->signal->is_child_subreaper,  					 (int __user *) arg2);  			break; +		case PR_SET_NO_NEW_PRIVS: +			if (arg2 != 1 || arg3 || arg4 || arg5) +				return -EINVAL; + +			current->no_new_privs = 1; +			break; +		case PR_GET_NO_NEW_PRIVS: +			if (arg2 || arg3 || arg4 || arg5) +				return -EINVAL; +			return current->no_new_privs ? 1 : 0;  		default:  			error = -EINVAL;  			break; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a538c55fc7..aa27d391bfc 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock);   * If one has not already been chosen, it checks to see if a   * functional rtc device is available.   */ -static struct rtc_device *alarmtimer_get_rtcdev(void) +struct rtc_device *alarmtimer_get_rtcdev(void)  {  	unsigned long flags;  	struct rtc_device *ret; @@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void)  	class_interface_unregister(&alarmtimer_rtc_interface);  }  #else -static inline struct rtc_device *alarmtimer_get_rtcdev(void) +struct rtc_device *alarmtimer_get_rtcdev(void)  {  	return NULL;  } diff --git a/kernel/timer.c b/kernel/timer.c index a297ffcf888..09de9a941cd 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer);   *   * mod_timer_pinned() is a way to update the expire field of an   * active timer (if the timer is inactive it will be activated) - * and not allow the timer to be migrated to a different CPU. + * and to ensure that the timer is scheduled on the current CPU. + * + * Note that this does not prevent the timer from being migrated + * when the current CPU goes offline.  If this is a problem for + * you, use CPU-hotplug notifiers to handle it correctly, for + * example, cancelling the timer when the corresponding CPU goes + * offline.   
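/*
 * Editor's note: an illustrative user-space sketch, not part of the
 * patch above.  It exercises the PR_SET_NO_NEW_PRIVS and
 * PR_GET_NO_NEW_PRIVS options added to sys_prctl() in the kernel/sys.c
 * hunk above.  The fallback #defines match include/linux/prctl.h in
 * this series; older installed headers may not have them yet.
 */
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS	38
#define PR_GET_NO_NEW_PRIVS	39
#endif

int main(void)
{
	/* All unused arguments must be zero, as checked in sys_prctl(). */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		perror("PR_SET_NO_NEW_PRIVS");

	/* Returns 1 once set; there is no interface to clear the flag. */
	printf("no_new_privs = %d\n", prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
	return 0;
}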
*   * mod_timer_pinned(timer, expires) is equivalent to:   * @@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),  	 * warnings as well as problems when looking into  	 * timer->lockdep_map, make a copy and use that here.  	 */ -	struct lockdep_map lockdep_map = timer->lockdep_map; +	struct lockdep_map lockdep_map; + +	lockdep_copy_map(&lockdep_map, &timer->lockdep_map);  #endif  	/*  	 * Couple the lock chain with the lock chain at diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a1d2849f247..f347ac91292 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -141,7 +141,6 @@ if FTRACE  config FUNCTION_TRACER  	bool "Kernel Function Tracer"  	depends on HAVE_FUNCTION_TRACER -	select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE  	select KALLSYMS  	select GENERIC_TRACER  	select CONTEXT_SWITCH_TRACER @@ -272,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES  	bool "Trace likely/unlikely profiler"  	select TRACE_BRANCH_PROFILING  	help -	  This tracer profiles all the the likely and unlikely macros +	  This tracer profiles all likely and unlikely macros  	  in the kernel. It will display the results in:  	  /sys/kernel/debug/tracing/trace_stat/branch_annotated diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5f39a07fe5e..b3afe0e76f7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o  obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o  obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o  obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o  ifeq ($(CONFIG_BLOCK),y)  obj-$(CONFIG_EVENT_TRACING) += blktrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0fa92f677c9..a008663d86c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)  static int ftrace_cmp_recs(const void *a, const void *b)  { -	const struct dyn_ftrace *reca = a; -	const struct dyn_ftrace *recb = b; +	const struct dyn_ftrace *key = a; +	const struct dyn_ftrace *rec = b; -	if (reca->ip > recb->ip) -		return 1; -	if (reca->ip < recb->ip) +	if (key->flags < rec->ip)  		return -1; +	if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) +		return 1;  	return 0;  } -/** - * ftrace_location - return true if the ip giving is a traced location - * @ip: the instruction pointer to check - * - * Returns 1 if @ip given is a pointer to a ftrace location. - * That is, the instruction that is either a NOP or call to - * the function tracer. It checks the ftrace internal tables to - * determine if the address belongs or not. 
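/*
 * Editor's note: an illustrative sketch, not part of the patch above.
 * It follows the advice in the updated mod_timer_pinned() comment:
 * keep the timer on the CPU that armed it, and cancel it from a
 * CPU-hotplug notifier when that CPU goes down.  my_timer, my_cpu and
 * the my_*() functions are invented names; my_timer is assumed to have
 * been set up with setup_timer() during initialisation.
 */
#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/smp.h>
#include <linux/cpu.h>

static struct timer_list my_timer;
static int my_cpu = -1;

static void my_arm_on_this_cpu(void)
{
	/* Caller is assumed to have preemption disabled. */
	my_cpu = smp_processor_id();
	mod_timer_pinned(&my_timer, jiffies + HZ);
}

static int my_cpu_callback(struct notifier_block *nb,
			   unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	if (action == CPU_DOWN_PREPARE && cpu == my_cpu)
		del_timer_sync(&my_timer);	/* don't let the timer migrate */
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
	.notifier_call = my_cpu_callback,
};
/* register_cpu_notifier(&my_cpu_nb) at init time */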
- */ -int ftrace_location(unsigned long ip) +static unsigned long ftrace_location_range(unsigned long start, unsigned long end)  {  	struct ftrace_page *pg;  	struct dyn_ftrace *rec;  	struct dyn_ftrace key; -	key.ip = ip; +	key.ip = start; +	key.flags = end;	/* overload flags, as it is unsigned long */  	for (pg = ftrace_pages_start; pg; pg = pg->next) { +		if (end < pg->records[0].ip || +		    start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) +			continue;  		rec = bsearch(&key, pg->records, pg->index,  			      sizeof(struct dyn_ftrace),  			      ftrace_cmp_recs);  		if (rec) -			return 1; +			return rec->ip;  	}  	return 0;  } +/** + * ftrace_location - return true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns rec->ip if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +unsigned long ftrace_location(unsigned long ip) +{ +	return ftrace_location_range(ip, ip); +} + +/** + * ftrace_text_reserved - return true if range contains an ftrace location + * @start: start of range to search + * @end: end of range to search (inclusive). @end points to the last byte to check. + * + * Returns 1 if @start and @end contains a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +int ftrace_text_reserved(void *start, void *end) +{ +	unsigned long ret; + +	ret = ftrace_location_range((unsigned long)start, +				    (unsigned long)end); + +	return (int)!!ret; +} +  static void __ftrace_hash_rec_update(struct ftrace_ops *ops,  				     int filter_hash,  				     bool inc) @@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,  	__ftrace_hash_rec_update(ops, filter_hash, 1);  } -static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) -{ -	if (ftrace_pages->index == ftrace_pages->size) { -		/* We should have allocated enough */ -		if (WARN_ON(!ftrace_pages->next)) -			return NULL; -		ftrace_pages = ftrace_pages->next; -	} - -	return &ftrace_pages->records[ftrace_pages->index++]; -} - -static struct dyn_ftrace * -ftrace_record_ip(unsigned long ip) -{ -	struct dyn_ftrace *rec; - -	if (ftrace_disabled) -		return NULL; - -	rec = ftrace_alloc_dyn_node(ip); -	if (!rec) -		return NULL; - -	rec->ip = ip; - -	return rec; -} -  static void print_ip_ins(const char *fmt, unsigned char *p)  {  	int i; @@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip)  	}  } - -/* Return 1 if the address range is reserved for ftrace */ -int ftrace_text_reserved(void *start, void *end) -{ -	struct dyn_ftrace *rec; -	struct ftrace_page *pg; - -	do_for_each_ftrace_rec(pg, rec) { -		if (rec->ip <= (unsigned long)end && -		    rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) -			return 1; -	} while_for_each_ftrace_rec(); -	return 0; -} -  static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)  {  	unsigned long flag = 0UL; @@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)  	return -1; /* unknow ftrace bug */  } -static void ftrace_replace_code(int update) +void __weak ftrace_replace_code(int enable)  {  	struct dyn_ftrace *rec;  	struct ftrace_page *pg; @@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update)  		return;  	do_for_each_ftrace_rec(pg, rec) { -		
failed = __ftrace_replace_code(rec, update); +		failed = __ftrace_replace_code(rec, enable);  		if (failed) {  			ftrace_bug(failed, rec->ip);  			/* Stop processing */ @@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void)  	return 0;  } -static int __ftrace_modify_code(void *data) +void ftrace_modify_all_code(int command)  { -	int *command = data; - -	if (*command & FTRACE_UPDATE_CALLS) +	if (command & FTRACE_UPDATE_CALLS)  		ftrace_replace_code(1); -	else if (*command & FTRACE_DISABLE_CALLS) +	else if (command & FTRACE_DISABLE_CALLS)  		ftrace_replace_code(0); -	if (*command & FTRACE_UPDATE_TRACE_FUNC) +	if (command & FTRACE_UPDATE_TRACE_FUNC)  		ftrace_update_ftrace_func(ftrace_trace_function); -	if (*command & FTRACE_START_FUNC_RET) +	if (command & FTRACE_START_FUNC_RET)  		ftrace_enable_ftrace_graph_caller(); -	else if (*command & FTRACE_STOP_FUNC_RET) +	else if (command & FTRACE_STOP_FUNC_RET)  		ftrace_disable_ftrace_graph_caller(); +} + +static int __ftrace_modify_code(void *data) +{ +	int *command = data; + +	ftrace_modify_all_code(*command);  	return 0;  } @@ -2469,57 +2459,35 @@ static int  ftrace_avail_open(struct inode *inode, struct file *file)  {  	struct ftrace_iterator *iter; -	int ret;  	if (unlikely(ftrace_disabled))  		return -ENODEV; -	iter = kzalloc(sizeof(*iter), GFP_KERNEL); -	if (!iter) -		return -ENOMEM; - -	iter->pg = ftrace_pages_start; -	iter->ops = &global_ops; - -	ret = seq_open(file, &show_ftrace_seq_ops); -	if (!ret) { -		struct seq_file *m = file->private_data; - -		m->private = iter; -	} else { -		kfree(iter); +	iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); +	if (iter) { +		iter->pg = ftrace_pages_start; +		iter->ops = &global_ops;  	} -	return ret; +	return iter ? 0 : -ENOMEM;  }  static int  ftrace_enabled_open(struct inode *inode, struct file *file)  {  	struct ftrace_iterator *iter; -	int ret;  	if (unlikely(ftrace_disabled))  		return -ENODEV; -	iter = kzalloc(sizeof(*iter), GFP_KERNEL); -	if (!iter) -		return -ENOMEM; - -	iter->pg = ftrace_pages_start; -	iter->flags = FTRACE_ITER_ENABLED; -	iter->ops = &global_ops; - -	ret = seq_open(file, &show_ftrace_seq_ops); -	if (!ret) { -		struct seq_file *m = file->private_data; - -		m->private = iter; -	} else { -		kfree(iter); +	iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); +	if (iter) { +		iter->pg = ftrace_pages_start; +		iter->flags = FTRACE_ITER_ENABLED; +		iter->ops = &global_ops;  	} -	return ret; +	return iter ? 
0 : -ENOMEM;  }  static void ftrace_filter_reset(struct ftrace_hash *hash) @@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)  	return 0;  } -static void ftrace_swap_recs(void *a, void *b, int size) +static int ftrace_cmp_ips(const void *a, const void *b) +{ +	const unsigned long *ipa = a; +	const unsigned long *ipb = b; + +	if (*ipa > *ipb) +		return 1; +	if (*ipa < *ipb) +		return -1; +	return 0; +} + +static void ftrace_swap_ips(void *a, void *b, int size)  { -	struct dyn_ftrace *reca = a; -	struct dyn_ftrace *recb = b; -	struct dyn_ftrace t; +	unsigned long *ipa = a; +	unsigned long *ipb = b; +	unsigned long t; -	t = *reca; -	*reca = *recb; -	*recb = t; +	t = *ipa; +	*ipa = *ipb; +	*ipb = t;  }  static int ftrace_process_locs(struct module *mod,  			       unsigned long *start,  			       unsigned long *end)  { +	struct ftrace_page *start_pg;  	struct ftrace_page *pg; +	struct dyn_ftrace *rec;  	unsigned long count;  	unsigned long *p;  	unsigned long addr; @@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod,  	if (!count)  		return 0; -	pg = ftrace_allocate_pages(count); -	if (!pg) +	sort(start, count, sizeof(*start), +	     ftrace_cmp_ips, ftrace_swap_ips); + +	start_pg = ftrace_allocate_pages(count); +	if (!start_pg)  		return -ENOMEM;  	mutex_lock(&ftrace_lock); @@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod,  	if (!mod) {  		WARN_ON(ftrace_pages || ftrace_pages_start);  		/* First initialization */ -		ftrace_pages = ftrace_pages_start = pg; +		ftrace_pages = ftrace_pages_start = start_pg;  	} else {  		if (!ftrace_pages)  			goto out; @@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod,  				ftrace_pages = ftrace_pages->next;  		} -		ftrace_pages->next = pg; -		ftrace_pages = pg; +		ftrace_pages->next = start_pg;  	}  	p = start; +	pg = start_pg;  	while (p < end) {  		addr = ftrace_call_adjust(*p++);  		/* @@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod,  		 */  		if (!addr)  			continue; -		if (!ftrace_record_ip(addr)) -			break; + +		if (pg->index == pg->size) { +			/* We should have allocated enough */ +			if (WARN_ON(!pg->next)) +				break; +			pg = pg->next; +		} + +		rec = &pg->records[pg->index++]; +		rec->ip = addr;  	} -	/* These new locations need to be initialized */ -	ftrace_new_pgs = pg; +	/* We should have used all pages */ +	WARN_ON(pg->next); + +	/* Assign the last page to ftrace_pages */ +	ftrace_pages = pg; -	/* Make each individual set of pages sorted by ips */ -	for (; pg; pg = pg->next) -		sort(pg->records, pg->index, sizeof(struct dyn_ftrace), -		     ftrace_cmp_recs, ftrace_swap_recs); +	/* These new locations need to be initialized */ +	ftrace_new_pgs = start_pg;  	/*  	 * We only need to disable interrupts on start up diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cf8d11e91ef..6420cda6233 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -23,6 +23,8 @@  #include <asm/local.h>  #include "trace.h" +static void update_pages_handler(struct work_struct *work); +  /*   * The ring buffer header is special. We must manually up keep it.   
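/*
 * Editor's note: an illustrative sketch, not part of the patch above.
 * The __seq_open_private() conversion in ftrace_avail_open() and
 * ftrace_enabled_open() pairs with seq_release_private() in the
 * file_operations, roughly as below.  my_iter, my_seq_ops, my_open()
 * and my_fops are invented names; the seq_operations handlers are
 * elided for brevity.
 */
#include <linux/seq_file.h>
#include <linux/fs.h>

struct my_iter {
	int pos;
};

static const struct seq_operations my_seq_ops;	/* .start/.next/.stop/.show elided */

static int my_open(struct inode *inode, struct file *file)
{
	struct my_iter *iter;

	/* Allocates a zeroed iterator and stores it in seq_file->private. */
	iter = __seq_open_private(file, &my_seq_ops, sizeof(*iter));
	return iter ? 0 : -ENOMEM;
}

static const struct file_operations my_fops = {
	.open		= my_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,	/* frees the iterator again */
};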
*/ @@ -449,6 +451,7 @@ struct ring_buffer_per_cpu {  	raw_spinlock_t			reader_lock;	/* serialize readers */  	arch_spinlock_t			lock;  	struct lock_class_key		lock_key; +	unsigned int			nr_pages;  	struct list_head		*pages;  	struct buffer_page		*head_page;	/* read from head */  	struct buffer_page		*tail_page;	/* write to tail */ @@ -466,13 +469,18 @@ struct ring_buffer_per_cpu {  	unsigned long			read_bytes;  	u64				write_stamp;  	u64				read_stamp; +	/* ring buffer pages to update, > 0 to add, < 0 to remove */ +	int				nr_pages_to_update; +	struct list_head		new_pages; /* new pages to add */ +	struct work_struct		update_pages_work; +	struct completion		update_done;  };  struct ring_buffer { -	unsigned			pages;  	unsigned			flags;  	int				cpus;  	atomic_t			record_disabled; +	atomic_t			resize_disabled;  	cpumask_var_t			cpumask;  	struct lock_class_key		*reader_lock_key; @@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)  	struct list_head *head = cpu_buffer->pages;  	struct buffer_page *bpage, *tmp; +	/* Reset the head page if it exists */ +	if (cpu_buffer->head_page) +		rb_set_head_page(cpu_buffer); +  	rb_head_page_deactivate(cpu_buffer);  	if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) @@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)  	return 0;  } -static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, -			     unsigned nr_pages) +static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)  { +	int i;  	struct buffer_page *bpage, *tmp; -	LIST_HEAD(pages); -	unsigned i; - -	WARN_ON(!nr_pages);  	for (i = 0; i < nr_pages; i++) {  		struct page *page; @@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,  		 */  		bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),  				    GFP_KERNEL | __GFP_NORETRY, -				    cpu_to_node(cpu_buffer->cpu)); +				    cpu_to_node(cpu));  		if (!bpage)  			goto free_pages; -		rb_check_bpage(cpu_buffer, bpage); +		list_add(&bpage->list, pages); -		list_add(&bpage->list, &pages); - -		page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), +		page = alloc_pages_node(cpu_to_node(cpu),  					GFP_KERNEL | __GFP_NORETRY, 0);  		if (!page)  			goto free_pages; @@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,  		rb_init_page(bpage->page);  	} +	return 0; + +free_pages: +	list_for_each_entry_safe(bpage, tmp, pages, list) { +		list_del_init(&bpage->list); +		free_buffer_page(bpage); +	} + +	return -ENOMEM; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, +			     unsigned nr_pages) +{ +	LIST_HEAD(pages); + +	WARN_ON(!nr_pages); + +	if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) +		return -ENOMEM; +  	/*  	 * The ring buffer page list is a circular list that does not  	 * start and end with a list head. 
All page list items point to @@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,  	cpu_buffer->pages = pages.next;  	list_del(&pages); +	cpu_buffer->nr_pages = nr_pages; +  	rb_check_pages(cpu_buffer);  	return 0; - - free_pages: -	list_for_each_entry_safe(bpage, tmp, &pages, list) { -		list_del_init(&bpage->list); -		free_buffer_page(bpage); -	} -	return -ENOMEM;  }  static struct ring_buffer_per_cpu * -rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)  {  	struct ring_buffer_per_cpu *cpu_buffer;  	struct buffer_page *bpage; @@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)  	raw_spin_lock_init(&cpu_buffer->reader_lock);  	lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);  	cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +	INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); +	init_completion(&cpu_buffer->update_done);  	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),  			    GFP_KERNEL, cpu_to_node(cpu)); @@ -1052,7 +1076,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)  	INIT_LIST_HEAD(&cpu_buffer->reader_page->list); -	ret = rb_allocate_pages(cpu_buffer, buffer->pages); +	ret = rb_allocate_pages(cpu_buffer, nr_pages);  	if (ret < 0)  		goto fail_free_reader; @@ -1113,7 +1137,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,  {  	struct ring_buffer *buffer;  	int bsize; -	int cpu; +	int cpu, nr_pages;  	/* keep it in its own cache line */  	buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1124,14 +1148,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,  	if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))  		goto fail_free_buffer; -	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); +	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);  	buffer->flags = flags;  	buffer->clock = trace_clock_local;  	buffer->reader_lock_key = key;  	/* need at least two pages */ -	if (buffer->pages < 2) -		buffer->pages = 2; +	if (nr_pages < 2) +		nr_pages = 2;  	/*  	 * In case of non-hotplug cpu, if the ring-buffer is allocated @@ -1154,7 +1178,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,  	for_each_buffer_cpu(buffer, cpu) {  		buffer->buffers[cpu] = -			rb_allocate_cpu_buffer(buffer, cpu); +			rb_allocate_cpu_buffer(buffer, nr_pages, cpu);  		if (!buffer->buffers[cpu])  			goto fail_free_buffers;  	} @@ -1222,58 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,  static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); -static void -rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) +static inline unsigned long rb_page_entries(struct buffer_page *bpage)  { -	struct buffer_page *bpage; -	struct list_head *p; -	unsigned i; +	return local_read(&bpage->entries) & RB_WRITE_MASK; +} + +static inline unsigned long rb_page_write(struct buffer_page *bpage) +{ +	return local_read(&bpage->write) & RB_WRITE_MASK; +} + +static int +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) +{ +	struct list_head *tail_page, *to_remove, *next_page; +	struct buffer_page *to_remove_page, *tmp_iter_page; +	struct buffer_page *last_page, *first_page; +	unsigned int nr_removed; +	unsigned long head_bit; +	int page_entries; + +	head_bit = 0;  	raw_spin_lock_irq(&cpu_buffer->reader_lock); -	
rb_head_page_deactivate(cpu_buffer); +	atomic_inc(&cpu_buffer->record_disabled); +	/* +	 * We don't race with the readers since we have acquired the reader +	 * lock. We also don't race with writers after disabling recording. +	 * This makes it easy to figure out the first and the last page to be +	 * removed from the list. We unlink all the pages in between including +	 * the first and last pages. This is done in a busy loop so that we +	 * lose the least number of traces. +	 * The pages are freed after we restart recording and unlock readers. +	 */ +	tail_page = &cpu_buffer->tail_page->list; -	for (i = 0; i < nr_pages; i++) { -		if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) -			goto out; -		p = cpu_buffer->pages->next; -		bpage = list_entry(p, struct buffer_page, list); -		list_del_init(&bpage->list); -		free_buffer_page(bpage); +	/* +	 * tail page might be on reader page, we remove the next page +	 * from the ring buffer +	 */ +	if (cpu_buffer->tail_page == cpu_buffer->reader_page) +		tail_page = rb_list_head(tail_page->next); +	to_remove = tail_page; + +	/* start of pages to remove */ +	first_page = list_entry(rb_list_head(to_remove->next), +				struct buffer_page, list); + +	for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { +		to_remove = rb_list_head(to_remove)->next; +		head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;  	} -	if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) -		goto out; -	rb_reset_cpu(cpu_buffer); -	rb_check_pages(cpu_buffer); +	next_page = rb_list_head(to_remove)->next; -out: +	/* +	 * Now we remove all pages between tail_page and next_page. +	 * Make sure that we have head_bit value preserved for the +	 * next page +	 */ +	tail_page->next = (struct list_head *)((unsigned long)next_page | +						head_bit); +	next_page = rb_list_head(next_page); +	next_page->prev = tail_page; + +	/* make sure pages points to a valid page in the ring buffer */ +	cpu_buffer->pages = next_page; + +	/* update head page */ +	if (head_bit) +		cpu_buffer->head_page = list_entry(next_page, +						struct buffer_page, list); + +	/* +	 * change read pointer to make sure any read iterators reset +	 * themselves +	 */ +	cpu_buffer->read = 0; + +	/* pages are removed, resume tracing and then free the pages */ +	atomic_dec(&cpu_buffer->record_disabled);  	raw_spin_unlock_irq(&cpu_buffer->reader_lock); + +	RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); + +	/* last buffer page to remove */ +	last_page = list_entry(rb_list_head(to_remove), struct buffer_page, +				list); +	tmp_iter_page = first_page; + +	do { +		to_remove_page = tmp_iter_page; +		rb_inc_page(cpu_buffer, &tmp_iter_page); + +		/* update the counters */ +		page_entries = rb_page_entries(to_remove_page); +		if (page_entries) { +			/* +			 * If something was added to this page, it was full +			 * since it is not the tail page. So we deduct the +			 * bytes consumed in ring buffer from here. +			 * No need to update overruns, since this page is +			 * deleted from ring buffer and its entries are +			 * already accounted for. 
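/*
 * Editor's note: an illustrative model, not part of the patch above and
 * not the ring buffer's actual macros.  The head_bit handling in
 * rb_remove_pages() relies on the ring buffer's trick of tagging the
 * HEAD page in the low bits of a ->next pointer (the real masks are
 * RB_PAGE_HEAD and friends, and rb_list_head() strips them).  A generic
 * sketch of that flag-in-pointer idea, with invented my_* names:
 */
#include <linux/list.h>

#define MY_PTR_FLAG	1UL	/* low bits are free: list_heads are pointer-aligned */

static inline struct list_head *my_strip_flag(struct list_head *p)
{
	return (struct list_head *)((unsigned long)p & ~MY_PTR_FLAG);
}

static inline struct list_head *my_add_flag(struct list_head *p)
{
	return (struct list_head *)((unsigned long)p | MY_PTR_FLAG);
}

static inline unsigned long my_has_flag(struct list_head *p)
{
	return (unsigned long)p & MY_PTR_FLAG;
}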
+			 */ +			local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); +		} + +		/* +		 * We have already removed references to this list item, just +		 * free up the buffer_page and its page +		 */ +		free_buffer_page(to_remove_page); +		nr_removed--; + +	} while (to_remove_page != last_page); + +	RB_WARN_ON(cpu_buffer, nr_removed); + +	return nr_removed == 0;  } -static void -rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, -		struct list_head *pages, unsigned nr_pages) +static int +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)  { -	struct buffer_page *bpage; -	struct list_head *p; -	unsigned i; +	struct list_head *pages = &cpu_buffer->new_pages; +	int retries, success;  	raw_spin_lock_irq(&cpu_buffer->reader_lock); -	rb_head_page_deactivate(cpu_buffer); +	/* +	 * We are holding the reader lock, so the reader page won't be swapped +	 * in the ring buffer. Now we are racing with the writer trying to +	 * move head page and the tail page. +	 * We are going to adapt the reader page update process where: +	 * 1. We first splice the start and end of list of new pages between +	 *    the head page and its previous page. +	 * 2. We cmpxchg the prev_page->next to point from head page to the +	 *    start of new pages list. +	 * 3. Finally, we update the head->prev to the end of new list. +	 * +	 * We will try this process 10 times, to make sure that we don't keep +	 * spinning. +	 */ +	retries = 10; +	success = 0; +	while (retries--) { +		struct list_head *head_page, *prev_page, *r; +		struct list_head *last_page, *first_page; +		struct list_head *head_page_with_bit; -	for (i = 0; i < nr_pages; i++) { -		if (RB_WARN_ON(cpu_buffer, list_empty(pages))) -			goto out; -		p = pages->next; -		bpage = list_entry(p, struct buffer_page, list); -		list_del_init(&bpage->list); -		list_add_tail(&bpage->list, cpu_buffer->pages); +		head_page = &rb_set_head_page(cpu_buffer)->list; +		prev_page = head_page->prev; + +		first_page = pages->next; +		last_page  = pages->prev; + +		head_page_with_bit = (struct list_head *) +				     ((unsigned long)head_page | RB_PAGE_HEAD); + +		last_page->next = head_page_with_bit; +		first_page->prev = prev_page; + +		r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); + +		if (r == head_page_with_bit) { +			/* +			 * yay, we replaced the page pointer to our new list, +			 * now, we just have to update to head page's prev +			 * pointer to point to end of list +			 */ +			head_page->prev = last_page; +			success = 1; +			break; +		}  	} -	rb_reset_cpu(cpu_buffer); -	rb_check_pages(cpu_buffer); -out: +	if (success) +		INIT_LIST_HEAD(pages); +	/* +	 * If we weren't successful in adding in new pages, warn and stop +	 * tracing +	 */ +	RB_WARN_ON(cpu_buffer, !success);  	raw_spin_unlock_irq(&cpu_buffer->reader_lock); + +	/* free pages if they weren't inserted */ +	if (!success) { +		struct buffer_page *bpage, *tmp; +		list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, +					 list) { +			list_del_init(&bpage->list); +			free_buffer_page(bpage); +		} +	} +	return success; +} + +static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ +	int success; + +	if (cpu_buffer->nr_pages_to_update > 0) +		success = rb_insert_pages(cpu_buffer); +	else +		success = rb_remove_pages(cpu_buffer, +					-cpu_buffer->nr_pages_to_update); + +	if (success) +		cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; +} + +static void update_pages_handler(struct work_struct *work) +{ +	struct ring_buffer_per_cpu *cpu_buffer = container_of(work, +			struct 
ring_buffer_per_cpu, update_pages_work); +	rb_update_pages(cpu_buffer); +	complete(&cpu_buffer->update_done);  }  /** @@ -1283,16 +1471,14 @@ out:   *   * Minimum size is 2 * BUF_PAGE_SIZE.   * - * Returns -1 on failure. + * Returns 0 on success and < 0 on failure.   */ -int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, +			int cpu_id)  {  	struct ring_buffer_per_cpu *cpu_buffer; -	unsigned nr_pages, rm_pages, new_pages; -	struct buffer_page *bpage, *tmp; -	unsigned long buffer_size; -	LIST_HEAD(pages); -	int i, cpu; +	unsigned nr_pages; +	int cpu, err = 0;  	/*  	 * Always succeed at resizing a non-existent buffer: @@ -1302,113 +1488,154 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)  	size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);  	size *= BUF_PAGE_SIZE; -	buffer_size = buffer->pages * BUF_PAGE_SIZE;  	/* we need a minimum of two pages */  	if (size < BUF_PAGE_SIZE * 2)  		size = BUF_PAGE_SIZE * 2; -	if (size == buffer_size) -		return size; - -	atomic_inc(&buffer->record_disabled); +	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); -	/* Make sure all writers are done with this buffer. */ -	synchronize_sched(); +	/* +	 * Don't succeed if resizing is disabled, as a reader might be +	 * manipulating the ring buffer and is expecting a sane state while +	 * this is true. +	 */ +	if (atomic_read(&buffer->resize_disabled)) +		return -EBUSY; +	/* prevent another thread from changing buffer sizes */  	mutex_lock(&buffer->mutex); -	get_online_cpus(); -	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); +	if (cpu_id == RING_BUFFER_ALL_CPUS) { +		/* calculate the pages to update */ +		for_each_buffer_cpu(buffer, cpu) { +			cpu_buffer = buffer->buffers[cpu]; -	if (size < buffer_size) { +			cpu_buffer->nr_pages_to_update = nr_pages - +							cpu_buffer->nr_pages; +			/* +			 * nothing more to do for removing pages or no update +			 */ +			if (cpu_buffer->nr_pages_to_update <= 0) +				continue; +			/* +			 * to add pages, make sure all new pages can be +			 * allocated without receiving ENOMEM +			 */ +			INIT_LIST_HEAD(&cpu_buffer->new_pages); +			if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, +						&cpu_buffer->new_pages, cpu)) { +				/* not enough memory for new pages */ +				err = -ENOMEM; +				goto out_err; +			} +		} -		/* easy case, just free pages */ -		if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) -			goto out_fail; +		get_online_cpus(); +		/* +		 * Fire off all the required work handlers +		 * We can't schedule on offline CPUs, but it's not necessary +		 * since we can change their buffer sizes without any race. +		 */ +		for_each_buffer_cpu(buffer, cpu) { +			cpu_buffer = buffer->buffers[cpu]; +			if (!cpu_buffer->nr_pages_to_update) +				continue; -		rm_pages = buffer->pages - nr_pages; +			if (cpu_online(cpu)) +				schedule_work_on(cpu, +						&cpu_buffer->update_pages_work); +			else +				rb_update_pages(cpu_buffer); +		} +		/* wait for all the updates to complete */  		for_each_buffer_cpu(buffer, cpu) {  			cpu_buffer = buffer->buffers[cpu]; -			rb_remove_pages(cpu_buffer, rm_pages); +			if (!cpu_buffer->nr_pages_to_update) +				continue; + +			if (cpu_online(cpu)) +				wait_for_completion(&cpu_buffer->update_done); +			cpu_buffer->nr_pages_to_update = 0;  		} -		goto out; -	} -	/* -	 * This is a bit more difficult. We only want to add pages -	 * when we can allocate enough for all CPUs. 
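/*
 * Editor's note: an illustrative sketch, not part of the patch above.
 * The resize path above fans out update_pages_work with
 * schedule_work_on() and waits on a per-CPU completion; offline CPUs
 * are updated directly since nothing can race there.  A stripped-down
 * version of that pattern, with invented my_* names:
 */
#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/cpu.h>

struct my_percpu_job {
	struct work_struct	work;
	struct completion	done;
};

static void my_job_fn(struct work_struct *work)
{
	struct my_percpu_job *job =
		container_of(work, struct my_percpu_job, work);

	/* ... operate on this CPU's private data ... */
	complete(&job->done);
}

static void my_run_on_cpu(int cpu, struct my_percpu_job *job)
{
	INIT_WORK(&job->work, my_job_fn);
	init_completion(&job->done);

	if (cpu_online(cpu)) {
		schedule_work_on(cpu, &job->work);	/* runs on @cpu */
		wait_for_completion(&job->done);
	} else {
		my_job_fn(&job->work);	/* offline CPU: update it directly */
	}
}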
We do this -	 * by allocating all the pages and storing them on a local -	 * link list. If we succeed in our allocation, then we -	 * add these pages to the cpu_buffers. Otherwise we just free -	 * them all and return -ENOMEM; -	 */ -	if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) -		goto out_fail; +		put_online_cpus(); +	} else { +		cpu_buffer = buffer->buffers[cpu_id]; -	new_pages = nr_pages - buffer->pages; +		if (nr_pages == cpu_buffer->nr_pages) +			goto out; -	for_each_buffer_cpu(buffer, cpu) { -		for (i = 0; i < new_pages; i++) { -			struct page *page; -			/* -			 * __GFP_NORETRY flag makes sure that the allocation -			 * fails gracefully without invoking oom-killer and -			 * the system is not destabilized. -			 */ -			bpage = kzalloc_node(ALIGN(sizeof(*bpage), -						  cache_line_size()), -					    GFP_KERNEL | __GFP_NORETRY, -					    cpu_to_node(cpu)); -			if (!bpage) -				goto free_pages; -			list_add(&bpage->list, &pages); -			page = alloc_pages_node(cpu_to_node(cpu), -						GFP_KERNEL | __GFP_NORETRY, 0); -			if (!page) -				goto free_pages; -			bpage->page = page_address(page); -			rb_init_page(bpage->page); +		cpu_buffer->nr_pages_to_update = nr_pages - +						cpu_buffer->nr_pages; + +		INIT_LIST_HEAD(&cpu_buffer->new_pages); +		if (cpu_buffer->nr_pages_to_update > 0 && +			__rb_allocate_pages(cpu_buffer->nr_pages_to_update, +					    &cpu_buffer->new_pages, cpu_id)) { +			err = -ENOMEM; +			goto out_err;  		} -	} -	for_each_buffer_cpu(buffer, cpu) { -		cpu_buffer = buffer->buffers[cpu]; -		rb_insert_pages(cpu_buffer, &pages, new_pages); -	} +		get_online_cpus(); -	if (RB_WARN_ON(buffer, !list_empty(&pages))) -		goto out_fail; +		if (cpu_online(cpu_id)) { +			schedule_work_on(cpu_id, +					 &cpu_buffer->update_pages_work); +			wait_for_completion(&cpu_buffer->update_done); +		} else +			rb_update_pages(cpu_buffer); + +		cpu_buffer->nr_pages_to_update = 0; +		put_online_cpus(); +	}   out: -	buffer->pages = nr_pages; -	put_online_cpus(); +	/* +	 * The ring buffer resize can happen with the ring buffer +	 * enabled, so that the update disturbs the tracing as little +	 * as possible. But if the buffer is disabled, we do not need +	 * to worry about that, and we can take the time to verify +	 * that the buffer is not corrupt. +	 */ +	if (atomic_read(&buffer->record_disabled)) { +		atomic_inc(&buffer->record_disabled); +		/* +		 * Even though the buffer was disabled, we must make sure +		 * that it is truly disabled before calling rb_check_pages. +		 * There could have been a race between checking +		 * record_disable and incrementing it. +		 */ +		synchronize_sched(); +		for_each_buffer_cpu(buffer, cpu) { +			cpu_buffer = buffer->buffers[cpu]; +			rb_check_pages(cpu_buffer); +		} +		atomic_dec(&buffer->record_disabled); +	} +  	mutex_unlock(&buffer->mutex); +	return size; -	atomic_dec(&buffer->record_disabled); + out_err: +	for_each_buffer_cpu(buffer, cpu) { +		struct buffer_page *bpage, *tmp; -	return size; +		cpu_buffer = buffer->buffers[cpu]; +		cpu_buffer->nr_pages_to_update = 0; - free_pages: -	list_for_each_entry_safe(bpage, tmp, &pages, list) { -		list_del_init(&bpage->list); -		free_buffer_page(bpage); -	} -	put_online_cpus(); -	mutex_unlock(&buffer->mutex); -	atomic_dec(&buffer->record_disabled); -	return -ENOMEM; +		if (list_empty(&cpu_buffer->new_pages)) +			continue; -	/* -	 * Something went totally wrong, and we are too paranoid -	 * to even clean up the mess. 
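/*
 * Editor's note: an illustrative sketch, not part of the patch above.
 * With the new cpu_id argument, callers of ring_buffer_resize() choose
 * between resizing every per-CPU buffer and a single one.
 * RING_BUFFER_ALL_CPUS comes from the ring_buffer.h update elsewhere in
 * this series; my_resize_example() and my_buffer are invented names.
 */
#include <linux/ring_buffer.h>

static int my_resize_example(struct ring_buffer *my_buffer)
{
	int ret;

	/* Give every CPU roughly 1 MB of buffer space. */
	ret = ring_buffer_resize(my_buffer, 1024 * 1024, RING_BUFFER_ALL_CPUS);
	if (ret < 0)
		return ret;	/* e.g. -ENOMEM, or -EBUSY while a reader holds it */

	/* Grow only CPU 0; the other CPUs keep their current size. */
	ret = ring_buffer_resize(my_buffer, 2 * 1024 * 1024, 0);
	return ret < 0 ? ret : 0;	/* treat any non-negative return as success */
}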
-	 */ - out_fail: -	put_online_cpus(); +		list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, +					list) { +			list_del_init(&bpage->list); +			free_buffer_page(bpage); +		} +	}  	mutex_unlock(&buffer->mutex); -	atomic_dec(&buffer->record_disabled); -	return -1; +	return err;  }  EXPORT_SYMBOL_GPL(ring_buffer_resize); @@ -1447,21 +1674,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)  	return __rb_page_index(iter->head_page, iter->head);  } -static inline unsigned long rb_page_write(struct buffer_page *bpage) -{ -	return local_read(&bpage->write) & RB_WRITE_MASK; -} -  static inline unsigned rb_page_commit(struct buffer_page *bpage)  {  	return local_read(&bpage->page->commit);  } -static inline unsigned long rb_page_entries(struct buffer_page *bpage) -{ -	return local_read(&bpage->entries) & RB_WRITE_MASK; -} -  /* Size is determined by what has been committed */  static inline unsigned rb_page_size(struct buffer_page *bpage)  { @@ -1510,7 +1727,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)  	 * assign the commit to the tail.  	 */   again: -	max_count = cpu_buffer->buffer->pages * 100; +	max_count = cpu_buffer->nr_pages * 100;  	while (cpu_buffer->commit_page != cpu_buffer->tail_page) {  		if (RB_WARN_ON(cpu_buffer, !(--max_count))) @@ -3486,6 +3703,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)  	iter->cpu_buffer = cpu_buffer; +	atomic_inc(&buffer->resize_disabled);  	atomic_inc(&cpu_buffer->record_disabled);  	return iter; @@ -3548,7 +3766,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)  {  	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; +	/* +	 * Ring buffer is disabled from recording, here's a good place +	 * to check the integrity of the ring buffer.  +	 */ +	rb_check_pages(cpu_buffer); +  	atomic_dec(&cpu_buffer->record_disabled); +	atomic_dec(&cpu_buffer->buffer->resize_disabled);  	kfree(iter);  }  EXPORT_SYMBOL_GPL(ring_buffer_read_finish); @@ -3588,9 +3813,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);   * ring_buffer_size - return the size of the ring buffer (in bytes)   * @buffer: The ring buffer.   */ -unsigned long ring_buffer_size(struct ring_buffer *buffer) +unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)  { -	return BUF_PAGE_SIZE * buffer->pages; +	/* +	 * Earlier, this method returned +	 *	BUF_PAGE_SIZE * buffer->nr_pages +	 * Since the nr_pages field is now removed, we have converted this to +	 * return the per cpu buffer value. 
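/*
 * Editor's note: an illustrative sketch, not part of the patch above.
 * Since ring_buffer_size() now reports a single CPU's buffer, a caller
 * that wants the old whole-buffer figure can sum over the online CPUs.
 * my_total_ring_buffer_size() is an invented name.
 */
#include <linux/ring_buffer.h>
#include <linux/cpumask.h>

static unsigned long my_total_ring_buffer_size(struct ring_buffer *buffer)
{
	unsigned long total = 0;
	int cpu;

	for_each_online_cpu(cpu)
		total += ring_buffer_size(buffer, cpu);	/* 0 if cpu not in buffer */

	return total;
}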
+	 */ +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return 0; + +	return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;  }  EXPORT_SYMBOL_GPL(ring_buffer_size); @@ -3611,6 +3845,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)  	cpu_buffer->commit_page = cpu_buffer->head_page;  	INIT_LIST_HEAD(&cpu_buffer->reader_page->list); +	INIT_LIST_HEAD(&cpu_buffer->new_pages);  	local_set(&cpu_buffer->reader_page->write, 0);  	local_set(&cpu_buffer->reader_page->entries, 0);  	local_set(&cpu_buffer->reader_page->page->commit, 0); @@ -3647,8 +3882,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)  	if (!cpumask_test_cpu(cpu, buffer->cpumask))  		return; +	atomic_inc(&buffer->resize_disabled);  	atomic_inc(&cpu_buffer->record_disabled); +	/* Make sure all commits have finished */ +	synchronize_sched(); +  	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);  	if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) @@ -3664,6 +3903,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)  	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);  	atomic_dec(&cpu_buffer->record_disabled); +	atomic_dec(&buffer->resize_disabled);  }  EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -3765,8 +4005,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,  	    !cpumask_test_cpu(cpu, buffer_b->cpumask))  		goto out; +	cpu_buffer_a = buffer_a->buffers[cpu]; +	cpu_buffer_b = buffer_b->buffers[cpu]; +  	/* At least make sure the two buffers are somewhat the same */ -	if (buffer_a->pages != buffer_b->pages) +	if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)  		goto out;  	ret = -EAGAIN; @@ -3780,9 +4023,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,  	if (atomic_read(&buffer_b->record_disabled))  		goto out; -	cpu_buffer_a = buffer_a->buffers[cpu]; -	cpu_buffer_b = buffer_b->buffers[cpu]; -  	if (atomic_read(&cpu_buffer_a->record_disabled))  		goto out; @@ -4071,6 +4311,8 @@ static int rb_cpu_notify(struct notifier_block *self,  	struct ring_buffer *buffer =  		container_of(self, struct ring_buffer, cpu_notify);  	long cpu = (long)hcpu; +	int cpu_i, nr_pages_same; +	unsigned int nr_pages;  	switch (action) {  	case CPU_UP_PREPARE: @@ -4078,8 +4320,23 @@ static int rb_cpu_notify(struct notifier_block *self,  		if (cpumask_test_cpu(cpu, buffer->cpumask))  			return NOTIFY_OK; +		nr_pages = 0; +		nr_pages_same = 1; +		/* check if all cpu sizes are same */ +		for_each_buffer_cpu(buffer, cpu_i) { +			/* fill in the size from first enabled cpu */ +			if (nr_pages == 0) +				nr_pages = buffer->buffers[cpu_i]->nr_pages; +			if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { +				nr_pages_same = 0; +				break; +			} +		} +		/* allocate minimum pages, user can later expand it */ +		if (!nr_pages_same) +			nr_pages = 2;  		buffer->buffers[cpu] = -			rb_allocate_cpu_buffer(buffer, cpu); +			rb_allocate_cpu_buffer(buffer, nr_pages, cpu);  		if (!buffer->buffers[cpu]) {  			WARN(1, "failed to allocate ring buffer on CPU %ld\n",  			     cpu); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a22255c101..68032c6177d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -87,18 +87,6 @@ static int tracing_disabled = 1;  DEFINE_PER_CPU(int, ftrace_cpu_disabled); -static inline void ftrace_disable_cpu(void) -{ -	preempt_disable(); -	__this_cpu_inc(ftrace_cpu_disabled); -} - -static inline void ftrace_enable_cpu(void) -{ -	__this_cpu_dec(ftrace_cpu_disabled); -	preempt_enable(); -} -  cpumask_var_t __read_mostly	
tracing_buffer_mask;  /* @@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)  static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)  {  	int len; -	void *ret;  	if (s->len <= s->readpos)  		return -EBUSY; @@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)  	len = s->len - s->readpos;  	if (cnt > len)  		cnt = len; -	ret = memcpy(buf, s->buffer + s->readpos, cnt); -	if (!ret) -		return -EFAULT; +	memcpy(buf, s->buffer + s->readpos, cnt);  	s->readpos += cnt;  	return cnt; @@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  	arch_spin_lock(&ftrace_max_lock); -	ftrace_disable_cpu(); -  	ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);  	if (ret == -EBUSY) { @@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  			"Failed to swap buffers due to commit in progress\n");  	} -	ftrace_enable_cpu(); -  	WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);  	__update_max_tr(tr, tsk, cpu); @@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)   * Register a new plugin tracer.   */  int register_tracer(struct tracer *type) -__releases(kernel_lock) -__acquires(kernel_lock)  {  	struct tracer *t;  	int ret = 0; @@ -841,7 +820,8 @@ __acquires(kernel_lock)  		/* If we expanded the buffers, make sure the max is expanded too */  		if (ring_buffer_expanded && type->use_max_tr) -			ring_buffer_resize(max_tr.buffer, trace_buf_size); +			ring_buffer_resize(max_tr.buffer, trace_buf_size, +						RING_BUFFER_ALL_CPUS);  		/* the test is responsible for initializing and enabling */  		pr_info("Testing tracer %s: ", type->name); @@ -857,7 +837,8 @@ __acquires(kernel_lock)  		/* Shrink the max buffer again */  		if (ring_buffer_expanded && type->use_max_tr) -			ring_buffer_resize(max_tr.buffer, 1); +			ring_buffer_resize(max_tr.buffer, 1, +						RING_BUFFER_ALL_CPUS);  		printk(KERN_CONT "PASSED\n");  	} @@ -917,13 +898,6 @@ out:  	mutex_unlock(&trace_types_lock);  } -static void __tracing_reset(struct ring_buffer *buffer, int cpu) -{ -	ftrace_disable_cpu(); -	ring_buffer_reset_cpu(buffer, cpu); -	ftrace_enable_cpu(); -} -  void tracing_reset(struct trace_array *tr, int cpu)  {  	struct ring_buffer *buffer = tr->buffer; @@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu)  	/* Make sure all commits have finished */  	synchronize_sched(); -	__tracing_reset(buffer, cpu); +	ring_buffer_reset_cpu(buffer, cpu);  	ring_buffer_record_enable(buffer);  } @@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)  	tr->time_start = ftrace_now(tr->cpu);  	for_each_online_cpu(cpu) -		__tracing_reset(buffer, cpu); +		ring_buffer_reset_cpu(buffer, cpu);  	ring_buffer_record_enable(buffer);  } @@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)  #endif /* CONFIG_STACKTRACE */ +/* created for use with alloc_percpu */ +struct trace_buffer_struct { +	char buffer[TRACE_BUF_SIZE]; +}; + +static struct trace_buffer_struct *trace_percpu_buffer; +static struct trace_buffer_struct *trace_percpu_sirq_buffer; +static struct trace_buffer_struct *trace_percpu_irq_buffer; +static struct trace_buffer_struct *trace_percpu_nmi_buffer; + +/* + * The buffer used is dependent on the context. There is a per cpu + * buffer for normal context, softirq contex, hard irq context and + * for NMI context. 
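/*
 * Editor's note: an illustrative sketch, not part of the patch above.
 * The four trace_percpu_*_buffer pools declared above use the generic
 * per-CPU allocator; its basic lifecycle looks like this.  my_scratch,
 * my_pool and the my_pool_*() functions are invented names.
 */
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/errno.h>

struct my_scratch {
	char buf[128];
};

static struct my_scratch __percpu *my_pool;

static int my_pool_init(void)
{
	my_pool = alloc_percpu(struct my_scratch);	/* one instance per CPU */
	return my_pool ? 0 : -ENOMEM;
}

static char *my_pool_get(void)
{
	/* Caller must keep preemption disabled while using the buffer. */
	return per_cpu_ptr(my_pool, smp_processor_id())->buf;
}

static void my_pool_exit(void)
{
	free_percpu(my_pool);
}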
Thise allows for lockless recording. + * + * Note, if the buffers failed to be allocated, then this returns NULL + */ +static char *get_trace_buf(void) +{ +	struct trace_buffer_struct *percpu_buffer; +	struct trace_buffer_struct *buffer; + +	/* +	 * If we have allocated per cpu buffers, then we do not +	 * need to do any locking. +	 */ +	if (in_nmi()) +		percpu_buffer = trace_percpu_nmi_buffer; +	else if (in_irq()) +		percpu_buffer = trace_percpu_irq_buffer; +	else if (in_softirq()) +		percpu_buffer = trace_percpu_sirq_buffer; +	else +		percpu_buffer = trace_percpu_buffer; + +	if (!percpu_buffer) +		return NULL; + +	buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); + +	return buffer->buffer; +} + +static int alloc_percpu_trace_buffer(void) +{ +	struct trace_buffer_struct *buffers; +	struct trace_buffer_struct *sirq_buffers; +	struct trace_buffer_struct *irq_buffers; +	struct trace_buffer_struct *nmi_buffers; + +	buffers = alloc_percpu(struct trace_buffer_struct); +	if (!buffers) +		goto err_warn; + +	sirq_buffers = alloc_percpu(struct trace_buffer_struct); +	if (!sirq_buffers) +		goto err_sirq; + +	irq_buffers = alloc_percpu(struct trace_buffer_struct); +	if (!irq_buffers) +		goto err_irq; + +	nmi_buffers = alloc_percpu(struct trace_buffer_struct); +	if (!nmi_buffers) +		goto err_nmi; + +	trace_percpu_buffer = buffers; +	trace_percpu_sirq_buffer = sirq_buffers; +	trace_percpu_irq_buffer = irq_buffers; +	trace_percpu_nmi_buffer = nmi_buffers; + +	return 0; + + err_nmi: +	free_percpu(irq_buffers); + err_irq: +	free_percpu(sirq_buffers); + err_sirq: +	free_percpu(buffers); + err_warn: +	WARN(1, "Could not allocate percpu trace_printk buffer"); +	return -ENOMEM; +} + +void trace_printk_init_buffers(void) +{ +	static int buffers_allocated; + +	if (buffers_allocated) +		return; + +	if (alloc_percpu_trace_buffer()) +		return; + +	pr_info("ftrace: Allocated trace_printk buffers\n"); + +	buffers_allocated = 1; +} +  /**   * trace_vbprintk - write binary msg to tracing buffer   *   */  int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  { -	static arch_spinlock_t trace_buf_lock = -		(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -	static u32 trace_buf[TRACE_BUF_SIZE]; -  	struct ftrace_event_call *call = &event_bprint;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer;  	struct trace_array *tr = &global_trace; -	struct trace_array_cpu *data;  	struct bprint_entry *entry;  	unsigned long flags; -	int disable; -	int cpu, len = 0, size, pc; +	char *tbuffer; +	int len = 0, size, pc;  	if (unlikely(tracing_selftest_running || tracing_disabled))  		return 0; @@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  	pc = preempt_count();  	preempt_disable_notrace(); -	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; -	disable = atomic_inc_return(&data->disabled); -	if (unlikely(disable != 1)) +	tbuffer = get_trace_buf(); +	if (!tbuffer) { +		len = 0;  		goto out; +	} -	/* Lockdep uses trace_printk for lock tracing */ -	local_irq_save(flags); -	arch_spin_lock(&trace_buf_lock); -	len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); +	len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); -	if (len > TRACE_BUF_SIZE || len < 0) -		goto out_unlock; +	if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) +		goto out; +	local_save_flags(flags);  	size = sizeof(*entry) + sizeof(u32) * len;  	buffer = tr->buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,  					  flags, pc);  	if (!event) -		goto 
out_unlock; +		goto out;  	entry = ring_buffer_event_data(event);  	entry->ip			= ip;  	entry->fmt			= fmt; -	memcpy(entry->buf, trace_buf, sizeof(u32) * len); +	memcpy(entry->buf, tbuffer, sizeof(u32) * len);  	if (!filter_check_discard(call, entry, buffer, event)) {  		ring_buffer_unlock_commit(buffer, event);  		ftrace_trace_stack(buffer, flags, 6, pc);  	} -out_unlock: -	arch_spin_unlock(&trace_buf_lock); -	local_irq_restore(flags); -  out: -	atomic_dec_return(&data->disabled);  	preempt_enable_notrace();  	unpause_graph_tracing(); @@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr,  int trace_array_vprintk(struct trace_array *tr,  			unsigned long ip, const char *fmt, va_list args)  { -	static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; -	static char trace_buf[TRACE_BUF_SIZE]; -  	struct ftrace_event_call *call = &event_print;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer; -	struct trace_array_cpu *data; -	int cpu, len = 0, size, pc; +	int len = 0, size, pc;  	struct print_entry *entry; -	unsigned long irq_flags; -	int disable; +	unsigned long flags; +	char *tbuffer;  	if (tracing_disabled || tracing_selftest_running)  		return 0; +	/* Don't pollute graph traces with trace_vprintk internals */ +	pause_graph_tracing(); +  	pc = preempt_count();  	preempt_disable_notrace(); -	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; -	disable = atomic_inc_return(&data->disabled); -	if (unlikely(disable != 1)) + +	tbuffer = get_trace_buf(); +	if (!tbuffer) { +		len = 0;  		goto out; +	} -	pause_graph_tracing(); -	raw_local_irq_save(irq_flags); -	arch_spin_lock(&trace_buf_lock); -	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); +	len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); +	if (len > TRACE_BUF_SIZE) +		goto out; +	local_save_flags(flags);  	size = sizeof(*entry) + len + 1;  	buffer = tr->buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, -					  irq_flags, pc); +					  flags, pc);  	if (!event) -		goto out_unlock; +		goto out;  	entry = ring_buffer_event_data(event);  	entry->ip = ip; -	memcpy(&entry->buf, trace_buf, len); +	memcpy(&entry->buf, tbuffer, len);  	entry->buf[len] = '\0';  	if (!filter_check_discard(call, entry, buffer, event)) {  		ring_buffer_unlock_commit(buffer, event); -		ftrace_trace_stack(buffer, irq_flags, 6, pc); +		ftrace_trace_stack(buffer, flags, 6, pc);  	} - - out_unlock: -	arch_spin_unlock(&trace_buf_lock); -	raw_local_irq_restore(irq_flags); -	unpause_graph_tracing();   out: -	atomic_dec_return(&data->disabled);  	preempt_enable_notrace(); +	unpause_graph_tracing();  	return len;  } @@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk);  static void trace_iterator_increment(struct trace_iterator *iter)  { -	/* Don't allow ftrace to trace into the ring buffers */ -	ftrace_disable_cpu(); -  	iter->idx++;  	if (iter->buffer_iter[iter->cpu])  		ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - -	ftrace_enable_cpu();  }  static struct trace_entry * @@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,  	struct ring_buffer_event *event;  	struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; -	/* Don't allow ftrace to trace into the ring buffers */ -	ftrace_disable_cpu(); -  	if (buf_iter)  		event = ring_buffer_iter_peek(buf_iter, ts);  	else  		event = ring_buffer_peek(iter->tr->buffer, cpu, ts,  					 lost_events); -	ftrace_enable_cpu(); -  	if (event) {  		iter->ent_size = ring_buffer_event_length(event);  		return 
ring_buffer_event_data(event); @@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)  static void trace_consume(struct trace_iterator *iter)  { -	/* Don't allow ftrace to trace into the ring buffers */ -	ftrace_disable_cpu();  	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,  			    &iter->lost_events); -	ftrace_enable_cpu();  }  static void *s_next(struct seq_file *m, void *v, loff_t *pos) @@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)  		iter->cpu = 0;  		iter->idx = -1; -		ftrace_disable_cpu(); -  		if (cpu_file == TRACE_PIPE_ALL_CPU) {  			for_each_tracing_cpu(cpu)  				tracing_iter_reset(iter, cpu);  		} else  			tracing_iter_reset(iter, cpu_file); -		ftrace_enable_cpu(); -  		iter->leftover = 0;  		for (p = iter; p && l < *pos; p = s_next(m, p, &l))  			; @@ -2332,15 +2371,13 @@ static struct trace_iterator *  __tracing_open(struct inode *inode, struct file *file)  {  	long cpu_file = (long) inode->i_private; -	void *fail_ret = ERR_PTR(-ENOMEM);  	struct trace_iterator *iter; -	struct seq_file *m; -	int cpu, ret; +	int cpu;  	if (tracing_disabled)  		return ERR_PTR(-ENODEV); -	iter = kzalloc(sizeof(*iter), GFP_KERNEL); +	iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter));  	if (!iter)  		return ERR_PTR(-ENOMEM); @@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file)  		tracing_iter_reset(iter, cpu);  	} -	ret = seq_open(file, &tracer_seq_ops); -	if (ret < 0) { -		fail_ret = ERR_PTR(ret); -		goto fail_buffer; -	} - -	m = file->private_data; -	m->private = iter; -  	mutex_unlock(&trace_types_lock);  	return iter; - fail_buffer: -	for_each_tracing_cpu(cpu) { -		if (iter->buffer_iter[cpu]) -			ring_buffer_read_finish(iter->buffer_iter[cpu]); -	} -	free_cpumask_var(iter->started); -	tracing_start();   fail:  	mutex_unlock(&trace_types_lock);  	kfree(iter->trace); -	kfree(iter); - -	return fail_ret; +	seq_release_private(inode, file); +	return ERR_PTR(-ENOMEM);  }  int tracing_open_generic(struct inode *inode, struct file *filp) @@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file)  	tracing_start();  	mutex_unlock(&trace_types_lock); -	seq_release(inode, file);  	mutex_destroy(&iter->mutex);  	free_cpumask_var(iter->started);  	kfree(iter->trace); -	kfree(iter); +	seq_release_private(inode, file);  	return 0;  } @@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,  		if (cpumask_test_cpu(cpu, tracing_cpumask) &&  				!cpumask_test_cpu(cpu, tracing_cpumask_new)) {  			atomic_inc(&global_trace.data[cpu]->disabled); +			ring_buffer_record_disable_cpu(global_trace.buffer, cpu);  		}  		if (!cpumask_test_cpu(cpu, tracing_cpumask) &&  				cpumask_test_cpu(cpu, tracing_cpumask_new)) {  			atomic_dec(&global_trace.data[cpu]->disabled); +			ring_buffer_record_enable_cpu(global_trace.buffer, cpu);  		}  	}  	arch_spin_unlock(&ftrace_max_lock); @@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr)  	return t->init(tr);  } -static int __tracing_resize_ring_buffer(unsigned long size) +static void set_buffer_entries(struct trace_array *tr, unsigned long val) +{ +	int cpu; +	for_each_tracing_cpu(cpu) +		tr->data[cpu]->entries = val; +} + +static int __tracing_resize_ring_buffer(unsigned long size, int cpu)  {  	int ret; @@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size)  	 */  	ring_buffer_expanded = 1; -	ret = ring_buffer_resize(global_trace.buffer, size); +	ret 
= ring_buffer_resize(global_trace.buffer, size, cpu);  	if (ret < 0)  		return ret;  	if (!current_trace->use_max_tr)  		goto out; -	ret = ring_buffer_resize(max_tr.buffer, size); +	ret = ring_buffer_resize(max_tr.buffer, size, cpu);  	if (ret < 0) { -		int r; +		int r = 0; + +		if (cpu == RING_BUFFER_ALL_CPUS) { +			int i; +			for_each_tracing_cpu(i) { +				r = ring_buffer_resize(global_trace.buffer, +						global_trace.data[i]->entries, +						i); +				if (r < 0) +					break; +			} +		} else { +			r = ring_buffer_resize(global_trace.buffer, +						global_trace.data[cpu]->entries, +						cpu); +		} -		r = ring_buffer_resize(global_trace.buffer, -				       global_trace.entries);  		if (r < 0) {  			/*  			 * AARGH! We are left with different @@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size)  		return ret;  	} -	max_tr.entries = size; +	if (cpu == RING_BUFFER_ALL_CPUS) +		set_buffer_entries(&max_tr, size); +	else +		max_tr.data[cpu]->entries = size; +   out: -	global_trace.entries = size; +	if (cpu == RING_BUFFER_ALL_CPUS) +		set_buffer_entries(&global_trace, size); +	else +		global_trace.data[cpu]->entries = size;  	return ret;  } -static ssize_t tracing_resize_ring_buffer(unsigned long size) +static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)  { -	int cpu, ret = size; +	int ret = size;  	mutex_lock(&trace_types_lock); -	tracing_stop(); - -	/* disable all cpu buffers */ -	for_each_tracing_cpu(cpu) { -		if (global_trace.data[cpu]) -			atomic_inc(&global_trace.data[cpu]->disabled); -		if (max_tr.data[cpu]) -			atomic_inc(&max_tr.data[cpu]->disabled); +	if (cpu_id != RING_BUFFER_ALL_CPUS) { +		/* make sure, this cpu is enabled in the mask */ +		if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { +			ret = -EINVAL; +			goto out; +		}  	} -	if (size != global_trace.entries) -		ret = __tracing_resize_ring_buffer(size); - +	ret = __tracing_resize_ring_buffer(size, cpu_id);  	if (ret < 0)  		ret = -ENOMEM; -	for_each_tracing_cpu(cpu) { -		if (global_trace.data[cpu]) -			atomic_dec(&global_trace.data[cpu]->disabled); -		if (max_tr.data[cpu]) -			atomic_dec(&max_tr.data[cpu]->disabled); -	} - -	tracing_start(); +out:  	mutex_unlock(&trace_types_lock);  	return ret; @@ -3078,7 +3115,8 @@ int tracing_update_buffers(void)  	mutex_lock(&trace_types_lock);  	if (!ring_buffer_expanded) -		ret = __tracing_resize_ring_buffer(trace_buf_size); +		ret = __tracing_resize_ring_buffer(trace_buf_size, +						RING_BUFFER_ALL_CPUS);  	mutex_unlock(&trace_types_lock);  	return ret; @@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf)  	mutex_lock(&trace_types_lock);  	if (!ring_buffer_expanded) { -		ret = __tracing_resize_ring_buffer(trace_buf_size); +		ret = __tracing_resize_ring_buffer(trace_buf_size, +						RING_BUFFER_ALL_CPUS);  		if (ret < 0)  			goto out;  		ret = 0; @@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf)  		 * The max_tr ring buffer has some state (e.g. ring->clock) and  		 * we want preserve it.  		 
*/ -		ring_buffer_resize(max_tr.buffer, 1); -		max_tr.entries = 1; +		ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); +		set_buffer_entries(&max_tr, 1);  	}  	destroy_trace_option_files(topts); @@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf)  	topts = create_trace_option_files(current_trace);  	if (current_trace->use_max_tr) { -		ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); -		if (ret < 0) -			goto out; -		max_tr.entries = global_trace.entries; +		int cpu; +		/* we need to make per cpu buffer sizes equivalent */ +		for_each_tracing_cpu(cpu) { +			ret = ring_buffer_resize(max_tr.buffer, +						global_trace.data[cpu]->entries, +						cpu); +			if (ret < 0) +				goto out; +			max_tr.data[cpu]->entries = +					global_trace.data[cpu]->entries; +		}  	}  	if (t->init) { @@ -3642,30 +3688,82 @@ out_err:  	goto out;  } +struct ftrace_entries_info { +	struct trace_array	*tr; +	int			cpu; +}; + +static int tracing_entries_open(struct inode *inode, struct file *filp) +{ +	struct ftrace_entries_info *info; + +	if (tracing_disabled) +		return -ENODEV; + +	info = kzalloc(sizeof(*info), GFP_KERNEL); +	if (!info) +		return -ENOMEM; + +	info->tr = &global_trace; +	info->cpu = (unsigned long)inode->i_private; + +	filp->private_data = info; + +	return 0; +} +  static ssize_t  tracing_entries_read(struct file *filp, char __user *ubuf,  		     size_t cnt, loff_t *ppos)  { -	struct trace_array *tr = filp->private_data; -	char buf[96]; -	int r; +	struct ftrace_entries_info *info = filp->private_data; +	struct trace_array *tr = info->tr; +	char buf[64]; +	int r = 0; +	ssize_t ret;  	mutex_lock(&trace_types_lock); -	if (!ring_buffer_expanded) -		r = sprintf(buf, "%lu (expanded: %lu)\n", -			    tr->entries >> 10, -			    trace_buf_size >> 10); -	else -		r = sprintf(buf, "%lu\n", tr->entries >> 10); + +	if (info->cpu == RING_BUFFER_ALL_CPUS) { +		int cpu, buf_size_same; +		unsigned long size; + +		size = 0; +		buf_size_same = 1; +		/* check if all cpu sizes are same */ +		for_each_tracing_cpu(cpu) { +			/* fill in the size from first enabled cpu */ +			if (size == 0) +				size = tr->data[cpu]->entries; +			if (size != tr->data[cpu]->entries) { +				buf_size_same = 0; +				break; +			} +		} + +		if (buf_size_same) { +			if (!ring_buffer_expanded) +				r = sprintf(buf, "%lu (expanded: %lu)\n", +					    size >> 10, +					    trace_buf_size >> 10); +			else +				r = sprintf(buf, "%lu\n", size >> 10); +		} else +			r = sprintf(buf, "X\n"); +	} else +		r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); +  	mutex_unlock(&trace_types_lock); -	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +	ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +	return ret;  }  static ssize_t  tracing_entries_write(struct file *filp, const char __user *ubuf,  		      size_t cnt, loff_t *ppos)  { +	struct ftrace_entries_info *info = filp->private_data;  	unsigned long val;  	int ret; @@ -3680,7 +3778,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  	/* value is in KB */  	val <<= 10; -	ret = tracing_resize_ring_buffer(val); +	ret = tracing_resize_ring_buffer(val, info->cpu);  	if (ret < 0)  		return ret; @@ -3689,6 +3787,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  	return cnt;  } +static int +tracing_entries_release(struct inode *inode, struct file *filp) +{ +	struct ftrace_entries_info *info = filp->private_data; + +	kfree(info); + +	return 0; +} +  static ssize_t  tracing_total_entries_read(struct file *filp, 
char __user *ubuf,  				size_t cnt, loff_t *ppos) @@ -3700,7 +3808,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,  	mutex_lock(&trace_types_lock);  	for_each_tracing_cpu(cpu) { -		size += tr->entries >> 10; +		size += tr->data[cpu]->entries >> 10;  		if (!ring_buffer_expanded)  			expanded_size += trace_buf_size >> 10;  	} @@ -3734,7 +3842,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)  	if (trace_flags & TRACE_ITER_STOP_ON_FREE)  		tracing_off();  	/* resize the ring buffer to 0 */ -	tracing_resize_ring_buffer(0); +	tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS);  	return 0;  } @@ -3749,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  	struct print_entry *entry;  	unsigned long irq_flags;  	struct page *pages[2]; +	void *map_page[2];  	int nr_pages = 1;  	ssize_t written; -	void *page1; -	void *page2;  	int offset;  	int size;  	int len;  	int ret; +	int i;  	if (tracing_disabled)  		return -EINVAL; @@ -3795,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  		goto out;  	} -	page1 = kmap_atomic(pages[0]); -	if (nr_pages == 2) -		page2 = kmap_atomic(pages[1]); +	for (i = 0; i < nr_pages; i++) +		map_page[i] = kmap_atomic(pages[i]);  	local_save_flags(irq_flags);  	size = sizeof(*entry) + cnt + 2; /* possible \n added */ @@ -3815,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  	if (nr_pages == 2) {  		len = PAGE_SIZE - offset; -		memcpy(&entry->buf, page1 + offset, len); -		memcpy(&entry->buf[len], page2, cnt - len); +		memcpy(&entry->buf, map_page[0] + offset, len); +		memcpy(&entry->buf[len], map_page[1], cnt - len);  	} else -		memcpy(&entry->buf, page1 + offset, cnt); +		memcpy(&entry->buf, map_page[0] + offset, cnt);  	if (entry->buf[cnt - 1] != '\n') {  		entry->buf[cnt] = '\n'; @@ -3833,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  	*fpos += written;   out_unlock: -	if (nr_pages == 2) -		kunmap_atomic(page2); -	kunmap_atomic(page1); -	while (nr_pages > 0) -		put_page(pages[--nr_pages]); +	for (i = 0; i < nr_pages; i++){ +		kunmap_atomic(map_page[i]); +		put_page(pages[i]); +	}   out:  	return written;  } @@ -3933,9 +4039,10 @@ static const struct file_operations tracing_pipe_fops = {  };  static const struct file_operations tracing_entries_fops = { -	.open		= tracing_open_generic, +	.open		= tracing_entries_open,  	.read		= tracing_entries_read,  	.write		= tracing_entries_write, +	.release	= tracing_entries_release,  	.llseek		= generic_file_llseek,  }; @@ -4367,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu)  	struct dentry *d_cpu;  	char cpu_dir[30]; /* 30 characters should be more than enough */ +	if (!d_percpu) +		return; +  	snprintf(cpu_dir, 30, "cpu%ld", cpu);  	d_cpu = debugfs_create_dir(cpu_dir, d_percpu);  	if (!d_cpu) { @@ -4387,6 +4497,9 @@ static void tracing_init_debugfs_percpu(long cpu)  	trace_create_file("stats", 0444, d_cpu,  			(void *) cpu, &tracing_stats_fops); + +	trace_create_file("buffer_size_kb", 0444, d_cpu, +			(void *) cpu, &tracing_entries_fops);  }  #ifdef CONFIG_FTRACE_SELFTEST @@ -4718,7 +4831,7 @@ static __init int tracer_init_debugfs(void)  			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);  	trace_create_file("buffer_size_kb", 0644, d_tracer, -			&global_trace, &tracing_entries_fops); +			(void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);  	trace_create_file("buffer_total_size_kb", 0444, d_tracer,  			&global_trace, &tracing_total_entries_fops); @@ -4957,6 
+5070,10 @@ __init static int tracer_alloc_buffers(void)  	if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))  		goto out_free_buffer_mask; +	/* Only allocate trace_printk buffers if a trace_printk exists */ +	if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) +		trace_printk_init_buffers(); +  	/* To save memory, keep the ring buffer size to its minimum */  	if (ring_buffer_expanded)  		ring_buf_size = trace_buf_size; @@ -4975,7 +5092,6 @@ __init static int tracer_alloc_buffers(void)  		WARN_ON(1);  		goto out_free_cpumask;  	} -	global_trace.entries = ring_buffer_size(global_trace.buffer);  	if (global_trace.buffer_disabled)  		tracing_off(); @@ -4988,7 +5104,6 @@ __init static int tracer_alloc_buffers(void)  		ring_buffer_free(global_trace.buffer);  		goto out_free_cpumask;  	} -	max_tr.entries = 1;  #endif  	/* Allocate the first page for all buffers */ @@ -4997,6 +5112,12 @@ __init static int tracer_alloc_buffers(void)  		max_tr.data[i] = &per_cpu(max_tr_data, i);  	} +	set_buffer_entries(&global_trace, +			   ring_buffer_size(global_trace.buffer, 0)); +#ifdef CONFIG_TRACER_MAX_TRACE +	set_buffer_entries(&max_tr, 1); +#endif +  	trace_init_cmdlines();  	register_tracer(&nop_trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f95d65da6db..6c6f7933eed 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -131,6 +131,7 @@ struct trace_array_cpu {  	atomic_t		disabled;  	void			*buffer_page;	/* ring buffer spare */ +	unsigned long		entries;  	unsigned long		saved_latency;  	unsigned long		critical_start;  	unsigned long		critical_end; @@ -152,7 +153,6 @@ struct trace_array_cpu {   */  struct trace_array {  	struct ring_buffer	*buffer; -	unsigned long		entries;  	int			cpu;  	int			buffer_disabled;  	cycle_t			time_start; @@ -826,6 +826,8 @@ extern struct list_head ftrace_events;  extern const char *__start___trace_bprintk_fmt[];  extern const char *__stop___trace_bprintk_fmt[]; +void trace_printk_init_buffers(void); +  #undef FTRACE_ENTRY  #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter)	\  	extern struct ftrace_event_call					\ diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 6fd4ffd042f..a9077c1b4ad 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)  	const char **iter;  	char *fmt; +	/* allocate the trace_printk per cpu buffers */ +	if (start != end) +		trace_printk_init_buffers(); +  	mutex_lock(&btrace_mutex);  	for (iter = start; iter < end; iter++) {  		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c deleted file mode 100644 index 209b379a472..00000000000 --- a/kernel/trace/trace_workqueue.c +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Workqueue statistical tracer. 
- * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - - -#include <trace/events/workqueue.h> -#include <linux/list.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/kref.h> -#include "trace_stat.h" -#include "trace.h" - - -/* A cpu workqueue thread */ -struct cpu_workqueue_stats { -	struct list_head            list; -	struct kref                 kref; -	int		            cpu; -	pid_t			    pid; -/* Can be inserted from interrupt or user context, need to be atomic */ -	atomic_t	            inserted; -/* - *  Don't need to be atomic, works are serialized in a single workqueue thread - *  on a single CPU. - */ -	unsigned int		    executed; -}; - -/* List of workqueue threads on one cpu */ -struct workqueue_global_stats { -	struct list_head	list; -	spinlock_t		lock; -}; - -/* Don't need a global lock because allocated before the workqueues, and - * never freed. - */ -static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); -#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) - -static void cpu_workqueue_stat_free(struct kref *kref) -{ -	kfree(container_of(kref, struct cpu_workqueue_stats, kref)); -} - -/* Insertion of a work */ -static void -probe_workqueue_insertion(void *ignore, -			  struct task_struct *wq_thread, -			  struct work_struct *work) -{ -	int cpu = cpumask_first(&wq_thread->cpus_allowed); -	struct cpu_workqueue_stats *node; -	unsigned long flags; - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { -		if (node->pid == wq_thread->pid) { -			atomic_inc(&node->inserted); -			goto found; -		} -	} -	pr_debug("trace_workqueue: entry not found\n"); -found: -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Execution of a work */ -static void -probe_workqueue_execution(void *ignore, -			  struct task_struct *wq_thread, -			  struct work_struct *work) -{ -	int cpu = cpumask_first(&wq_thread->cpus_allowed); -	struct cpu_workqueue_stats *node; -	unsigned long flags; - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { -		if (node->pid == wq_thread->pid) { -			node->executed++; -			goto found; -		} -	} -	pr_debug("trace_workqueue: entry not found\n"); -found: -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Creation of a cpu workqueue thread */ -static void probe_workqueue_creation(void *ignore, -				     struct task_struct *wq_thread, int cpu) -{ -	struct cpu_workqueue_stats *cws; -	unsigned long flags; - -	WARN_ON(cpu < 0); - -	/* Workqueues are sometimes created in atomic context */ -	cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); -	if (!cws) { -		pr_warning("trace_workqueue: not enough memory\n"); -		return; -	} -	INIT_LIST_HEAD(&cws->list); -	kref_init(&cws->kref); -	cws->cpu = cpu; -	cws->pid = wq_thread->pid; - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Destruction of a cpu workqueue thread */ -static void -probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) -{ -	/* Workqueue only execute on one cpu */ -	int cpu = cpumask_first(&wq_thread->cpus_allowed); -	struct cpu_workqueue_stats *node, *next; -	unsigned long flags; - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	list_for_each_entry_safe(node, next, 
&workqueue_cpu_stat(cpu)->list, -							list) { -		if (node->pid == wq_thread->pid) { -			list_del(&node->list); -			kref_put(&node->kref, cpu_workqueue_stat_free); -			goto found; -		} -	} - -	pr_debug("trace_workqueue: don't find workqueue to destroy\n"); -found: -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - -} - -static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) -{ -	unsigned long flags; -	struct cpu_workqueue_stats *ret = NULL; - - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - -	if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { -		ret = list_entry(workqueue_cpu_stat(cpu)->list.next, -				 struct cpu_workqueue_stats, list); -		kref_get(&ret->kref); -	} - -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - -	return ret; -} - -static void *workqueue_stat_start(struct tracer_stat *trace) -{ -	int cpu; -	void *ret = NULL; - -	for_each_possible_cpu(cpu) { -		ret = workqueue_stat_start_cpu(cpu); -		if (ret) -			return ret; -	} -	return NULL; -} - -static void *workqueue_stat_next(void *prev, int idx) -{ -	struct cpu_workqueue_stats *prev_cws = prev; -	struct cpu_workqueue_stats *ret; -	int cpu = prev_cws->cpu; -	unsigned long flags; - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { -		spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -		do { -			cpu = cpumask_next(cpu, cpu_possible_mask); -			if (cpu >= nr_cpu_ids) -				return NULL; -		} while (!(ret = workqueue_stat_start_cpu(cpu))); -		return ret; -	} else { -		ret = list_entry(prev_cws->list.next, -				 struct cpu_workqueue_stats, list); -		kref_get(&ret->kref); -	} -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - -	return ret; -} - -static int workqueue_stat_show(struct seq_file *s, void *p) -{ -	struct cpu_workqueue_stats *cws = p; -	struct pid *pid; -	struct task_struct *tsk; - -	pid = find_get_pid(cws->pid); -	if (pid) { -		tsk = get_pid_task(pid, PIDTYPE_PID); -		if (tsk) { -			seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu, -				   atomic_read(&cws->inserted), cws->executed, -				   tsk->comm); -			put_task_struct(tsk); -		} -		put_pid(pid); -	} - -	return 0; -} - -static void workqueue_stat_release(void *stat) -{ -	struct cpu_workqueue_stats *node = stat; - -	kref_put(&node->kref, cpu_workqueue_stat_free); -} - -static int workqueue_stat_headers(struct seq_file *s) -{ -	seq_printf(s, "# CPU  INSERTED  EXECUTED   NAME\n"); -	seq_printf(s, "# |      |         |          |\n"); -	return 0; -} - -struct tracer_stat workqueue_stats __read_mostly = { -	.name = "workqueues", -	.stat_start = workqueue_stat_start, -	.stat_next = workqueue_stat_next, -	.stat_show = workqueue_stat_show, -	.stat_release = workqueue_stat_release, -	.stat_headers = workqueue_stat_headers -}; - - -int __init stat_workqueue_init(void) -{ -	if (register_stat_tracer(&workqueue_stats)) { -		pr_warning("Unable to register workqueue stat tracer\n"); -		return 1; -	} - -	return 0; -} -fs_initcall(stat_workqueue_init); - -/* - * Workqueues are created very early, just after pre-smp initcalls. - * So we must register our tracepoints at this stage. 
- */ -int __init trace_workqueue_early_init(void) -{ -	int ret, cpu; - -	for_each_possible_cpu(cpu) { -		spin_lock_init(&workqueue_cpu_stat(cpu)->lock); -		INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); -	} - -	ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); -	if (ret) -		goto out; - -	ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); -	if (ret) -		goto no_insertion; - -	ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); -	if (ret) -		goto no_execution; - -	ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); -	if (ret) -		goto no_creation; - -	return 0; - -no_creation: -	unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); -no_execution: -	unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); -no_insertion: -	unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); -out: -	pr_warning("trace_workqueue: unable to trace workqueues\n"); - -	return 1; -} -early_initcall(trace_workqueue_early_init); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5abf42f63c0..9a3128dc67d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,  	cwq = get_cwq(gcwq->cpu, wq);  	trace_workqueue_queue_work(cpu, cwq, work); -	BUG_ON(!list_empty(&work->entry)); +	if (WARN_ON(!list_empty(&work->entry))) { +		spin_unlock_irqrestore(&gcwq->lock, flags); +		return; +	}  	cwq->nr_in_flight[cwq->work_color]++;  	work_flags = work_color_to_flags(cwq->work_color); @@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker)  	} else  		wake_up_all(&gcwq->trustee_wait); -	/* sanity check nr_running */ -	WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && +	/* +	 * Sanity check nr_running.  Because trustee releases gcwq->lock +	 * between setting %WORKER_ROGUE and zapping nr_running, the +	 * warning may trigger spuriously.  Check iff trustee is idle. +	 */ +	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && +		     gcwq->nr_workers == gcwq->nr_idle &&  		     atomic_read(get_gcwq_nr_running(gcwq->cpu)));  } @@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock)  	 * lock freed" warnings as well as problems when looking into  	 * work->lockdep_map, make a copy and use that here.  	 */ -	struct lockdep_map lockdep_map = work->lockdep_map; +	struct lockdep_map lockdep_map; + +	lockdep_copy_map(&lockdep_map, &work->lockdep_map);  #endif  	/*  	 * A single work shouldn't be executed concurrently by @@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work)  {  	struct wq_barrier barr; +	lock_map_acquire(&work->lockdep_map); +	lock_map_release(&work->lockdep_map); +  	if (start_flush_work(work, &barr, true)) {  		wait_for_completion(&barr.done);  		destroy_work_on_stack(&barr.work);  |
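
The trace.c hunks above replace the single trace_array->entries counter with a per-CPU entries field, teach ring_buffer_resize()/tracing_resize_ring_buffer() to take a cpu id (or RING_BUFFER_ALL_CPUS), and expose a buffer_size_kb file per CPU under per_cpu/cpuN/ in addition to the existing top-level one. As a rough userspace illustration of the resulting interface -- a hypothetical sketch, not part of the patch, assuming debugfs is mounted at /sys/kernel/debug and noting that the per-CPU files are created read-only (0444) in this revision -- the following reads each CPU's size and resizes all buffers through the top-level file, which per the new tracing_entries_read() prints "X" when the per-CPU sizes have diverged:

	/* Sketch: exercise the per-CPU buffer_size_kb files added by this series.
	 * Paths assume debugfs is mounted at /sys/kernel/debug; adjust as needed.
	 */
	#include <stdio.h>
	#include <unistd.h>

	#define TRACE_DIR "/sys/kernel/debug/tracing"

	static void show_cpu_sizes(int nr_cpus)
	{
		char path[256], line[64];
		int cpu;

		for (cpu = 0; cpu < nr_cpus; cpu++) {
			FILE *f;

			snprintf(path, sizeof(path),
				 TRACE_DIR "/per_cpu/cpu%d/buffer_size_kb", cpu);
			f = fopen(path, "r");
			if (!f)
				continue;	/* CPU absent or tracing not available */
			if (fgets(line, sizeof(line), f))
				printf("cpu%d: %s", cpu, line);
			fclose(f);
		}
	}

	int main(void)
	{
		int nr_cpus = (int)sysconf(_SC_NPROCESSORS_CONF);
		FILE *f;

		show_cpu_sizes(nr_cpus);

		/* Writing the top-level file resizes every CPU's buffer
		 * (RING_BUFFER_ALL_CPUS); reading it back prints "X" if the
		 * per-CPU sizes differ.  The value written is in KB.
		 */
		f = fopen(TRACE_DIR "/buffer_size_kb", "w");
		if (f) {
			fputs("2048\n", f);
			fclose(f);
		}

		show_cpu_sizes(nr_cpus);
		return 0;
	}
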