Diffstat (limited to 'kernel')
75 files changed, 8204 insertions, 3718 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index ab4f1090f43..b3097bde4e9 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -4,3 +4,4 @@  config_data.h  config_data.gz  timeconst.h +hz.bc diff --git a/kernel/async.c b/kernel/async.c index 8ddee2c3e5b..61f023ce022 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -73,7 +73,7 @@ struct async_entry {  	struct list_head	global_list;  	struct work_struct	work;  	async_cookie_t		cookie; -	async_func_ptr		*func; +	async_func_t		func;  	void			*data;  	struct async_domain	*domain;  }; @@ -84,24 +84,20 @@ static atomic_t entry_count;  static async_cookie_t lowest_in_progress(struct async_domain *domain)  { -	struct async_entry *first = NULL; +	struct list_head *pending;  	async_cookie_t ret = ASYNC_COOKIE_MAX;  	unsigned long flags;  	spin_lock_irqsave(&async_lock, flags); -	if (domain) { -		if (!list_empty(&domain->pending)) -			first = list_first_entry(&domain->pending, -					struct async_entry, domain_list); -	} else { -		if (!list_empty(&async_global_pending)) -			first = list_first_entry(&async_global_pending, -					struct async_entry, global_list); -	} +	if (domain) +		pending = &domain->pending; +	else +		pending = &async_global_pending; -	if (first) -		ret = first->cookie; +	if (!list_empty(pending)) +		ret = list_first_entry(pending, struct async_entry, +				       domain_list)->cookie;  	spin_unlock_irqrestore(&async_lock, flags);  	return ret; @@ -149,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work)  	wake_up(&async_done);  } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain) +static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)  {  	struct async_entry *entry;  	unsigned long flags; @@ -169,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a  		spin_unlock_irqrestore(&async_lock, flags);  		/* low on memory.. run synchronously */ -		ptr(data, newcookie); +		func(data, newcookie);  		return newcookie;  	}  	INIT_LIST_HEAD(&entry->domain_list);  	INIT_LIST_HEAD(&entry->global_list);  	INIT_WORK(&entry->work, async_run_entry_fn); -	entry->func = ptr; +	entry->func = func;  	entry->data = data;  	entry->domain = domain; @@ -202,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a  /**   * async_schedule - schedule a function for asynchronous execution - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously   * @data: data pointer to pass to the function   *   * Returns an async_cookie_t that may be used for checkpointing later.   * Note: This function may be called from atomic or non-atomic contexts.   */ -async_cookie_t async_schedule(async_func_ptr *ptr, void *data) +async_cookie_t async_schedule(async_func_t func, void *data)  { -	return __async_schedule(ptr, data, &async_dfl_domain); +	return __async_schedule(func, data, &async_dfl_domain);  }  EXPORT_SYMBOL_GPL(async_schedule);  /**   * async_schedule_domain - schedule a function for asynchronous execution within a certain domain - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously   * @data: data pointer to pass to the function   * @domain: the domain   * @@ -226,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule);   * synchronization domain is specified via @domain.  Note: This function   * may be called from atomic or non-atomic contexts.   
*/ -async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, +async_cookie_t async_schedule_domain(async_func_t func, void *data,  				     struct async_domain *domain)  { -	return __async_schedule(ptr, data, domain); +	return __async_schedule(func, data, domain);  }  EXPORT_SYMBOL_GPL(async_schedule_domain); diff --git a/kernel/audit.c b/kernel/audit.c index d596e5355f1..9816a1b96cf 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	/* As soon as there's any sign of userspace auditd,  	 * start kauditd to talk to it */ -	if (!kauditd_task) +	if (!kauditd_task) {  		kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); -	if (IS_ERR(kauditd_task)) { -		err = PTR_ERR(kauditd_task); -		kauditd_task = NULL; -		return err; +		if (IS_ERR(kauditd_task)) { +			err = PTR_ERR(kauditd_task); +			kauditd_task = NULL; +			return err; +		}  	} -  	loginuid = audit_get_loginuid(current);  	sessionid = audit_get_sessionid(current);  	security_task_getsecid(current, &sid); diff --git a/kernel/audit.h b/kernel/audit.h index d51cba868e1..11468d99dad 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -59,10 +59,7 @@ struct audit_entry {  	struct audit_krule	rule;  }; -#ifdef CONFIG_AUDIT -extern int audit_enabled;  extern int audit_ever_enabled; -#endif  extern int audit_pid; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 642a89c4f3d..a291aa23fb3 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -617,9 +617,9 @@ void audit_trim_trees(void)  		}  		spin_unlock(&hash_lock);  		trim_marked(tree); -		put_tree(tree);  		drop_collected_mounts(root_mnt);  skip_it: +		put_tree(tree);  		mutex_lock(&audit_filter_mutex);  	}  	list_del(&cursor); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f9fc54bbe06..267436826c3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -594,6 +594,10 @@ exit_nofree:  	return entry;  exit_free: +	if (entry->rule.watch) +		audit_put_watch(entry->rule.watch); /* matches initial get */ +	if (entry->rule.tree) +		audit_put_tree(entry->rule.tree); /* that's the temporary one */  	audit_free_rule(entry);  	return ERR_PTR(err);  } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a371f857a0a..c68229411a7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context)  	}  } -static inline void audit_zero_context(struct audit_context *context, -				      enum audit_state state) -{ -	memset(context, 0, sizeof(*context)); -	context->state      = state; -	context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; -} -  static inline struct audit_context *audit_alloc_context(enum audit_state state)  {  	struct audit_context *context; -	if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) +	context = kzalloc(sizeof(*context), GFP_KERNEL); +	if (!context)  		return NULL; -	audit_zero_context(context, state); +	context->state = state; +	context->prio = state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0;  	INIT_LIST_HEAD(&context->killed_trees);  	INIT_LIST_HEAD(&context->names_list);  	return context; diff --git a/kernel/capability.c b/kernel/capability.c index 493d9725948..f6c2ce5701e 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap)  EXPORT_SYMBOL(ns_capable);  /** + * file_ns_capable - Determine if the file's opener had a capability in effect + * @file:  The file we want to check + * @ns:  The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if task that opened the file had a capability in effect + * when the file was opened. + * + * This does not set PF_SUPERPRIV because the caller may not + * actually be privileged. + */ +bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +{ +	if (WARN_ON_ONCE(!cap_valid(cap))) +		return false; + +	if (security_capable(file->f_cred, ns, cap) == 0) +		return true; + +	return false; +} +EXPORT_SYMBOL(file_ns_capable); + +/**   * capable - Determine if the current task has a superior capability in effect   * @cap: The capability to be tested for   * diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5c462813722..d3abce2d645 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -30,7 +30,6 @@  #include <linux/cred.h>  #include <linux/ctype.h>  #include <linux/errno.h> -#include <linux/fs.h>  #include <linux/init_task.h>  #include <linux/kernel.h>  #include <linux/list.h> @@ -59,7 +58,7 @@  #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */  #include <linux/eventfd.h>  #include <linux/poll.h> -#include <linux/flex_array.h> /* used in cgroup_attach_proc */ +#include <linux/flex_array.h> /* used in cgroup_attach_task */  #include <linux/kthread.h>  #include <linux/atomic.h> @@ -83,7 +82,13 @@   * B happens only through cgroup_show_options() and using cgroup_root_mutex   * breaks it.   */ +#ifdef CONFIG_PROVE_RCU +DEFINE_MUTEX(cgroup_mutex); +EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for task_subsys_state_check() */ +#else  static DEFINE_MUTEX(cgroup_mutex); +#endif +  static DEFINE_MUTEX(cgroup_root_mutex);  /* @@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {  #include <linux/cgroup_subsys.h>  }; -#define MAX_CGROUP_ROOT_NAMELEN 64 - -/* - * A cgroupfs_root represents the root of a cgroup hierarchy, - * and may be associated with a superblock to form an active - * hierarchy - */ -struct cgroupfs_root { -	struct super_block *sb; - -	/* -	 * The bitmask of subsystems intended to be attached to this -	 * hierarchy -	 */ -	unsigned long subsys_mask; - -	/* Unique id for this hierarchy. */ -	int hierarchy_id; - -	/* The bitmask of subsystems currently attached to this hierarchy */ -	unsigned long actual_subsys_mask; - -	/* A list running through the attached subsystems */ -	struct list_head subsys_list; - -	/* The root cgroup for this hierarchy */ -	struct cgroup top_cgroup; - -	/* Tracks how many cgroups are currently defined in hierarchy.*/ -	int number_of_cgroups; - -	/* A list running through the active hierarchies */ -	struct list_head root_list; - -	/* All cgroups on this root, cgroup_mutex protected */ -	struct list_head allcg_list; - -	/* Hierarchy-specific flags */ -	unsigned long flags; - -	/* IDs for cgroups in this hierarchy */ -	struct ida cgroup_ida; - -	/* The path to use for release notifications. 
*/ -	char release_agent_path[PATH_MAX]; - -	/* The name for this hierarchy - may be empty */ -	char name[MAX_CGROUP_ROOT_NAMELEN]; -}; -  /*   * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the   * subsystems that are otherwise unattached - it never has more than a @@ -162,6 +117,9 @@ struct cfent {  	struct list_head		node;  	struct dentry			*dentry;  	struct cftype			*type; + +	/* file xattrs */ +	struct simple_xattrs		xattrs;  };  /* @@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);  /* dummytop is a shorthand for the dummy hierarchy's top cgroup */  #define dummytop (&rootnode.top_cgroup) +static struct cgroup_name root_cgroup_name = { .name = "/" }; +  /* This flag indicates whether tasks in the fork and exit paths should   * check for fork/exit handlers to call. This avoids us having to do   * extra work in the fork/exit path if none of the subsystems need to @@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);  static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,  			      struct cftype cfts[], bool is_add); -#ifdef CONFIG_PROVE_LOCKING -int cgroup_lock_is_held(void) -{ -	return lockdep_is_held(&cgroup_mutex); -} -#else /* #ifdef CONFIG_PROVE_LOCKING */ -int cgroup_lock_is_held(void) -{ -	return mutex_is_locked(&cgroup_mutex); -} -#endif /* #else #ifdef CONFIG_PROVE_LOCKING */ - -EXPORT_SYMBOL_GPL(cgroup_lock_is_held); -  static int css_unbias_refcnt(int refcnt)  {  	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; @@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)  	return test_bit(CGRP_REMOVED, &cgrp->flags);  } -/* bits in struct cgroupfs_root flags field */ -enum { -	ROOT_NOPREFIX,	/* mounted subsystems have no named prefix */ -	ROOT_XATTR,	/* supports extended attributes */ -}; +/** + * cgroup_is_descendant - test ancestry + * @cgrp: the cgroup to be tested + * @ancestor: possible ancestor of @cgrp + * + * Test whether @cgrp is a descendant of @ancestor.  It also returns %true + * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp + * and @ancestor are accessible. + */ +bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) +{ +	while (cgrp) { +		if (cgrp == ancestor) +			return true; +		cgrp = cgrp->parent; +	} +	return false; +} +EXPORT_SYMBOL_GPL(cgroup_is_descendant);  static int cgroup_is_releasable(const struct cgroup *cgrp)  { @@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry)  	return __d_cfe(dentry)->type;  } +/** + * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. + * @cgrp: the cgroup to be checked for liveness + * + * On success, returns true; the mutex should be later unlocked.  On + * failure returns false with no lock held. + */ +static bool cgroup_lock_live_group(struct cgroup *cgrp) +{ +	mutex_lock(&cgroup_mutex); +	if (cgroup_is_removed(cgrp)) { +		mutex_unlock(&cgroup_mutex); +		return false; +	} +	return true; +} +  /* the list of cgroups eligible for automatic release. 
Protected by   * release_list_lock */  static LIST_HEAD(release_list); @@ -800,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,   * update of a tasks cgroup pointer by cgroup_attach_task()   */ -/** - * cgroup_lock - lock out any changes to cgroup structures - * - */ -void cgroup_lock(void) -{ -	mutex_lock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_lock); - -/** - * cgroup_unlock - release lock on cgroup changes - * - * Undo the lock taken in a previous cgroup_lock() call. - */ -void cgroup_unlock(void) -{ -	mutex_unlock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_unlock); -  /*   * A couple of forward declarations required, due to cyclic reference loop:   * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> @@ -859,6 +815,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)  	return inode;  } +static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) +{ +	struct cgroup_name *name; + +	name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); +	if (!name) +		return NULL; +	strcpy(name->name, dentry->d_name.name); +	return name; +} +  static void cgroup_free_fn(struct work_struct *work)  {  	struct cgroup *cgrp = container_of(work, struct cgroup, free_work); @@ -875,8 +842,18 @@ static void cgroup_free_fn(struct work_struct *work)  	mutex_unlock(&cgroup_mutex);  	/* +	 * We get a ref to the parent's dentry, and put the ref when +	 * this cgroup is being freed, so it's guaranteed that the +	 * parent won't be destroyed before its children. +	 */ +	dput(cgrp->parent->dentry); + +	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + +	/*  	 * Drop the active superblock reference that we took when we -	 * created the cgroup +	 * created the cgroup. This will free cgrp->root, if we are +	 * holding the last reference to @sb.  	 
*/  	deactivate_super(cgrp->root->sb); @@ -888,7 +865,7 @@ static void cgroup_free_fn(struct work_struct *work)  	simple_xattrs_free(&cgrp->xattrs); -	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); +	kfree(rcu_dereference_raw(cgrp->name));  	kfree(cgrp);  } @@ -910,13 +887,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)  	} else {  		struct cfent *cfe = __d_cfe(dentry);  		struct cgroup *cgrp = dentry->d_parent->d_fsdata; -		struct cftype *cft = cfe->type;  		WARN_ONCE(!list_empty(&cfe->node) &&  			  cgrp != &cgrp->root->top_cgroup,  			  "cfe still linked for %s\n", cfe->type->name); +		simple_xattrs_free(&cfe->xattrs);  		kfree(cfe); -		simple_xattrs_free(&cft->xattrs);  	}  	iput(inode);  } @@ -1108,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)  	mutex_lock(&cgroup_root_mutex);  	for_each_subsys(root, ss)  		seq_printf(seq, ",%s", ss->name); -	if (test_bit(ROOT_NOPREFIX, &root->flags)) +	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) +		seq_puts(seq, ",sane_behavior"); +	if (root->flags & CGRP_ROOT_NOPREFIX)  		seq_puts(seq, ",noprefix"); -	if (test_bit(ROOT_XATTR, &root->flags)) +	if (root->flags & CGRP_ROOT_XATTR)  		seq_puts(seq, ",xattr");  	if (strlen(root->release_agent_path))  		seq_printf(seq, ",release_agent=%s", root->release_agent_path); @@ -1172,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  			all_ss = true;  			continue;  		} +		if (!strcmp(token, "__DEVEL__sane_behavior")) { +			opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; +			continue; +		}  		if (!strcmp(token, "noprefix")) { -			set_bit(ROOT_NOPREFIX, &opts->flags); +			opts->flags |= CGRP_ROOT_NOPREFIX;  			continue;  		}  		if (!strcmp(token, "clone_children")) { @@ -1181,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  			continue;  		}  		if (!strcmp(token, "xattr")) { -			set_bit(ROOT_XATTR, &opts->flags); +			opts->flags |= CGRP_ROOT_XATTR;  			continue;  		}  		if (!strncmp(token, "release_agent=", 14)) { @@ -1259,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	/* Consistency checks */ +	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { +		pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + +		if (opts->flags & CGRP_ROOT_NOPREFIX) { +			pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); +			return -EINVAL; +		} + +		if (opts->cpuset_clone_children) { +			pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); +			return -EINVAL; +		} +	} +  	/*  	 * Option noprefix was introduced just for backward compatibility  	 * with the old cpuset, so we allow noprefix only if mounting just  	 * the cpuset subsystem.  	 
*/ -	if (test_bit(ROOT_NOPREFIX, &opts->flags) && -	    (opts->subsys_mask & mask)) +	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))  		return -EINVAL; @@ -1336,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  	struct cgroup_sb_opts opts;  	unsigned long added_mask, removed_mask; +	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { +		pr_err("cgroup: sane_behavior: remount is not allowed\n"); +		return -EINVAL; +	} +  	mutex_lock(&cgrp->dentry->d_inode->i_mutex);  	mutex_lock(&cgroup_mutex);  	mutex_lock(&cgroup_root_mutex); @@ -1421,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)  	INIT_LIST_HEAD(&root->allcg_list);  	root->number_of_cgroups = 1;  	cgrp->root = root; -	cgrp->top_cgroup = cgrp; +	cgrp->name = &root_cgroup_name;  	init_cgroup_housekeeping(cgrp);  	list_add_tail(&cgrp->allcg_node, &root->allcg_list);  } @@ -1685,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		 * any) is not needed  		 */  		cgroup_drop_root(opts.new_root); + +		if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && +		    root->flags != opts.flags) { +			pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); +			ret = -EINVAL; +			goto drop_new_super; +		} +  		/* no subsys rebinding, so refcounts don't change */  		drop_parsed_module_refcounts(opts.subsys_mask);  	} @@ -1769,49 +1777,48 @@ static struct kobject *cgroup_kobj;   * @buf: the buffer to write the path into   * @buflen: the length of the buffer   * - * Called with cgroup_mutex held or else with an RCU-protected cgroup - * reference.  Writes path of cgroup into buf.  Returns 0 on success, - * -errno on error. + * Writes path of cgroup into buf.  Returns 0 on success, -errno on error. + * + * We can't generate cgroup path using dentry->d_name, as accessing + * dentry->name must be protected by irq-unsafe dentry->d_lock or parent + * inode's i_mutex, while on the other hand cgroup_path() can be called + * with some irq-safe spinlocks held.   */  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)  { -	struct dentry *dentry = cgrp->dentry; +	int ret = -ENAMETOOLONG;  	char *start; -	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), -			   "cgroup_path() called without proper locking"); - -	if (cgrp == dummytop) { -		/* -		 * Inactive subsystems have no dentry for their root -		 * cgroup -		 */ -		strcpy(buf, "/"); +	if (!cgrp->parent) { +		if (strlcpy(buf, "/", buflen) >= buflen) +			return -ENAMETOOLONG;  		return 0;  	}  	start = buf + buflen - 1; -  	*start = '\0'; -	for (;;) { -		int len = dentry->d_name.len; +	rcu_read_lock(); +	do { +		const char *name = cgroup_name(cgrp); +		int len; + +		len = strlen(name);  		if ((start -= len) < buf) -			return -ENAMETOOLONG; -		memcpy(start, dentry->d_name.name, len); -		cgrp = cgrp->parent; -		if (!cgrp) -			break; +			goto out; +		memcpy(start, name, len); -		dentry = cgrp->dentry; -		if (!cgrp->parent) -			continue;  		if (--start < buf) -			return -ENAMETOOLONG; +			goto out;  		*start = '/'; -	} + +		cgrp = cgrp->parent; +	} while (cgrp->parent); +	ret = 0;  	memmove(buf, start, buf + buflen - start); -	return 0; +out: +	rcu_read_unlock(); +	return ret;  }  EXPORT_SYMBOL_GPL(cgroup_path); @@ -1900,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);   *   * Must be called with cgroup_mutex and threadgroup locked.   
*/ -static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, +static void cgroup_task_migrate(struct cgroup *oldcgrp,  				struct task_struct *tsk, struct css_set *newcg)  {  	struct css_set *oldcg; @@ -1933,121 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,  }  /** - * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' - * @cgrp: the cgroup the task is attaching to - * @tsk: the task to be attached - * - * Call with cgroup_mutex and threadgroup locked. May take task_lock of - * @tsk during call. - */ -int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ -	int retval = 0; -	struct cgroup_subsys *ss, *failed_ss = NULL; -	struct cgroup *oldcgrp; -	struct cgroupfs_root *root = cgrp->root; -	struct cgroup_taskset tset = { }; -	struct css_set *newcg; - -	/* @tsk either already exited or can't exit until the end */ -	if (tsk->flags & PF_EXITING) -		return -ESRCH; - -	/* Nothing to do if the task is already in that cgroup */ -	oldcgrp = task_cgroup_from_root(tsk, root); -	if (cgrp == oldcgrp) -		return 0; - -	tset.single.task = tsk; -	tset.single.cgrp = oldcgrp; - -	for_each_subsys(root, ss) { -		if (ss->can_attach) { -			retval = ss->can_attach(cgrp, &tset); -			if (retval) { -				/* -				 * Remember on which subsystem the can_attach() -				 * failed, so that we only call cancel_attach() -				 * against the subsystems whose can_attach() -				 * succeeded. (See below) -				 */ -				failed_ss = ss; -				goto out; -			} -		} -	} - -	newcg = find_css_set(tsk->cgroups, cgrp); -	if (!newcg) { -		retval = -ENOMEM; -		goto out; -	} - -	cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); - -	for_each_subsys(root, ss) { -		if (ss->attach) -			ss->attach(cgrp, &tset); -	} - -out: -	if (retval) { -		for_each_subsys(root, ss) { -			if (ss == failed_ss) -				/* -				 * This subsystem was the one that failed the -				 * can_attach() check earlier, so we don't need -				 * to call cancel_attach() against it or any -				 * remaining subsystems. -				 */ -				break; -			if (ss->cancel_attach) -				ss->cancel_attach(cgrp, &tset); -		} -	} -	return retval; -} - -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ -	struct cgroupfs_root *root; -	int retval = 0; - -	cgroup_lock(); -	for_each_active_root(root) { -		struct cgroup *from_cg = task_cgroup_from_root(from, root); - -		retval = cgroup_attach_task(from_cg, tsk); -		if (retval) -			break; -	} -	cgroup_unlock(); - -	return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -/** - * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup   * @cgrp: the cgroup to attach to - * @leader: the threadgroup leader task_struct of the group to be attached + * @tsk: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup?   *   * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of each thread in leader's threadgroup individually in turn. + * task_lock of @tsk or each thread in the threadgroup individually in turn.   
*/ -static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, +			      bool threadgroup)  {  	int retval, i, group_size;  	struct cgroup_subsys *ss, *failed_ss = NULL; -	/* guaranteed to be initialized later, but the compiler needs this */  	struct cgroupfs_root *root = cgrp->root;  	/* threadgroup list cursor and array */ -	struct task_struct *tsk; +	struct task_struct *leader = tsk;  	struct task_and_cgroup *tc;  	struct flex_array *group;  	struct cgroup_taskset tset = { }; @@ -2059,17 +1967,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	 * group - group_rwsem prevents new threads from appearing, and if  	 * threads exit, this will just be an over-estimate.  	 */ -	group_size = get_nr_threads(leader); +	if (threadgroup) +		group_size = get_nr_threads(tsk); +	else +		group_size = 1;  	/* flex_array supports very large thread-groups better than kmalloc. */  	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);  	if (!group)  		return -ENOMEM;  	/* pre-allocate to guarantee space while iterating in rcu read-side. */ -	retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); +	retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);  	if (retval)  		goto out_free_group_list; -	tsk = leader;  	i = 0;  	/*  	 * Prevent freeing of tasks while we take a snapshot. Tasks that are @@ -2098,6 +2008,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);  		BUG_ON(retval != 0);  		i++; + +		if (!threadgroup) +			break;  	} while_each_thread(leader, tsk);  	rcu_read_unlock();  	/* remember the number of threads in the array for later. */ @@ -2143,7 +2056,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	 */  	for (i = 0; i < group_size; i++) {  		tc = flex_array_get(group, i); -		cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); +		cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);  	}  	/* nothing is sensitive to fork() after this point. */ @@ -2224,11 +2137,11 @@ retry_find_task:  		tsk = tsk->group_leader;  	/* -	 * Workqueue threads may acquire PF_THREAD_BOUND and become +	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become  	 * trapped in a cpuset, or RT worker may be born in a cgroup  	 * with no rt_runtime allocated.  Just say no.  	 
*/ -	if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { +	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {  		ret = -EINVAL;  		rcu_read_unlock();  		goto out_unlock_cgroup; @@ -2251,17 +2164,42 @@ retry_find_task:  			put_task_struct(tsk);  			goto retry_find_task;  		} -		ret = cgroup_attach_proc(cgrp, tsk); -	} else -		ret = cgroup_attach_task(cgrp, tsk); +	} + +	ret = cgroup_attach_task(cgrp, tsk, threadgroup); +  	threadgroup_unlock(tsk);  	put_task_struct(tsk);  out_unlock_cgroup: -	cgroup_unlock(); +	mutex_unlock(&cgroup_mutex);  	return ret;  } +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ +	struct cgroupfs_root *root; +	int retval = 0; + +	mutex_lock(&cgroup_mutex); +	for_each_active_root(root) { +		struct cgroup *from_cg = task_cgroup_from_root(from, root); + +		retval = cgroup_attach_task(from_cg, tsk, false); +		if (retval) +			break; +	} +	mutex_unlock(&cgroup_mutex); + +	return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); +  static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)  {  	return attach_task_by_pid(cgrp, pid, false); @@ -2272,24 +2210,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)  	return attach_task_by_pid(cgrp, tgid, true);  } -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the lock should be later released with - * cgroup_unlock(). On failure returns false with no lock held. - */ -bool cgroup_lock_live_group(struct cgroup *cgrp) -{ -	mutex_lock(&cgroup_mutex); -	if (cgroup_is_removed(cgrp)) { -		mutex_unlock(&cgroup_mutex); -		return false; -	} -	return true; -} -EXPORT_SYMBOL_GPL(cgroup_lock_live_group); -  static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,  				      const char *buffer)  { @@ -2301,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,  	mutex_lock(&cgroup_root_mutex);  	strcpy(cgrp->root->release_agent_path, buffer);  	mutex_unlock(&cgroup_root_mutex); -	cgroup_unlock(); +	mutex_unlock(&cgroup_mutex);  	return 0;  } @@ -2312,7 +2232,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,  		return -ENODEV;  	seq_puts(seq, cgrp->root->release_agent_path);  	seq_putc(seq, '\n'); -	cgroup_unlock(); +	mutex_unlock(&cgroup_mutex); +	return 0; +} + +static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, +				     struct seq_file *seq) +{ +	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));  	return 0;  } @@ -2537,13 +2464,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)  static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,  			    struct inode *new_dir, struct dentry *new_dentry)  { +	int ret; +	struct cgroup_name *name, *old_name; +	struct cgroup *cgrp; + +	/* +	 * It's convinient to use parent dir's i_mutex to protected +	 * cgrp->name. 
+	 */ +	lockdep_assert_held(&old_dir->i_mutex); +  	if (!S_ISDIR(old_dentry->d_inode->i_mode))  		return -ENOTDIR;  	if (new_dentry->d_inode)  		return -EEXIST;  	if (old_dir != new_dir)  		return -EIO; -	return simple_rename(old_dir, old_dentry, new_dir, new_dentry); + +	cgrp = __d_cgrp(old_dentry); + +	name = cgroup_alloc_name(new_dentry); +	if (!name) +		return -ENOMEM; + +	ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); +	if (ret) { +		kfree(name); +		return ret; +	} + +	old_name = cgrp->name; +	rcu_assign_pointer(cgrp->name, name); + +	kfree_rcu(old_name, rcu_head); +	return 0;  }  static struct simple_xattrs *__d_xattrs(struct dentry *dentry) @@ -2551,13 +2505,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry)  	if (S_ISDIR(dentry->d_inode->i_mode))  		return &__d_cgrp(dentry)->xattrs;  	else -		return &__d_cft(dentry)->xattrs; +		return &__d_cfe(dentry)->xattrs;  }  static inline int xattr_enabled(struct dentry *dentry)  {  	struct cgroupfs_root *root = dentry->d_sb->s_fs_info; -	return test_bit(ROOT_XATTR, &root->flags); +	return root->flags & CGRP_ROOT_XATTR;  }  static bool is_valid_xattr(const char *name) @@ -2727,9 +2681,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,  	umode_t mode;  	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; -	simple_xattrs_init(&cft->xattrs); - -	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { +	if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {  		strcpy(name, subsys->name);  		strcat(name, ".");  	} @@ -2753,6 +2705,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,  		cfe->type = (void *)cft;  		cfe->dentry = dentry;  		dentry->d_fsdata = cfe; +		simple_xattrs_init(&cfe->xattrs);  		list_add_tail(&cfe->node, &parent->files);  		cfe = NULL;  	} @@ -2770,6 +2723,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,  	for (cft = cfts; cft->name[0] != '\0'; cft++) {  		/* does cft->flags tell us to skip this file on @cgrp? */ +		if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) +			continue;  		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)  			continue;  		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) @@ -3300,6 +3255,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)  	return 0;  } +static void cgroup_transfer_one_task(struct task_struct *task, +				     struct cgroup_scanner *scan) +{ +	struct cgroup *new_cgroup = scan->data; + +	mutex_lock(&cgroup_mutex); +	cgroup_attach_task(new_cgroup, task, false); +	mutex_unlock(&cgroup_mutex); +} + +/** + * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside + */ +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +{ +	struct cgroup_scanner scan; + +	scan.cg = from; +	scan.test_task = NULL; /* select all tasks in cgroup */ +	scan.process_task = cgroup_transfer_one_task; +	scan.heap = NULL; +	scan.data = to; + +	return cgroup_scan_tasks(&scan); +} +  /*   * Stuff for reading the 'tasks'/'procs' files.   
* @@ -3362,35 +3345,14 @@ static void pidlist_free(void *p)  	else  		kfree(p);  } -static void *pidlist_resize(void *p, int newcount) -{ -	void *newlist; -	/* note: if new alloc fails, old p will still be valid either way */ -	if (is_vmalloc_addr(p)) { -		newlist = vmalloc(newcount * sizeof(pid_t)); -		if (!newlist) -			return NULL; -		memcpy(newlist, p, newcount * sizeof(pid_t)); -		vfree(p); -	} else { -		newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); -	} -	return newlist; -}  /*   * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * If the new stripped list is sufficiently smaller and there's enough memory - * to allocate a new buffer, will let go of the unneeded memory. Returns the - * number of unique elements. + * Returns the number of unique elements.   */ -/* is the size difference enough that we should re-allocate the array? */ -#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) -static int pidlist_uniq(pid_t **p, int length) +static int pidlist_uniq(pid_t *list, int length)  {  	int src, dest = 1; -	pid_t *list = *p; -	pid_t *newlist;  	/*  	 * we presume the 0th element is unique, so i starts at 1. trivial @@ -3411,16 +3373,6 @@ static int pidlist_uniq(pid_t **p, int length)  		dest++;  	}  after: -	/* -	 * if the length difference is large enough, we want to allocate a -	 * smaller buffer to save memory. if this fails due to out of memory, -	 * we'll just stay with what we've got. -	 */ -	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { -		newlist = pidlist_resize(list, dest); -		if (newlist) -			*p = newlist; -	}  	return dest;  } @@ -3516,7 +3468,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,  	/* now sort & (if procs) strip out duplicates */  	sort(array, length, sizeof(pid_t), cmppid, NULL);  	if (type == CGROUP_FILE_PROCS) -		length = pidlist_uniq(&array, length); +		length = pidlist_uniq(array, length);  	l = cgroup_pidlist_find(cgrp, type);  	if (!l) {  		pidlist_free(array); @@ -3930,11 +3882,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,  	if (ret)  		goto fail; -	if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { -		event->cft->unregister_event(cgrp, event->cft, event->eventfd); -		ret = 0; -		goto fail; -	} +	efile->f_op->poll(efile, &event->pt);  	/*  	 * Events should be removed after rmdir of cgroup directory, but before @@ -4016,10 +3964,16 @@ static struct cftype files[] = {  	},  	{  		.name = "cgroup.clone_children", +		.flags = CFTYPE_INSANE,  		.read_u64 = cgroup_clone_children_read,  		.write_u64 = cgroup_clone_children_write,  	},  	{ +		.name = "cgroup.sane_behavior", +		.flags = CFTYPE_ONLY_ON_ROOT, +		.read_seq_string = cgroup_sane_behavior_show, +	}, +	{  		.name = "release_agent",  		.flags = CFTYPE_ONLY_ON_ROOT,  		.read_seq_string = cgroup_release_agent_show, @@ -4131,17 +4085,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)  	if (!(css->flags & CSS_ONLINE))  		return; -	/* -	 * css_offline() should be called with cgroup_mutex unlocked.  See -	 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for -	 * details.  This temporary unlocking should go away once -	 * cgroup_mutex is unexported from controllers. 
-	 */ -	if (ss->css_offline) { -		mutex_unlock(&cgroup_mutex); +	if (ss->css_offline)  		ss->css_offline(cgrp); -		mutex_lock(&cgroup_mutex); -	}  	cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;  } @@ -4158,6 +4103,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  			     umode_t mode)  {  	struct cgroup *cgrp; +	struct cgroup_name *name;  	struct cgroupfs_root *root = parent->root;  	int err = 0;  	struct cgroup_subsys *ss; @@ -4168,9 +4114,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	if (!cgrp)  		return -ENOMEM; +	name = cgroup_alloc_name(dentry); +	if (!name) +		goto err_free_cgrp; +	rcu_assign_pointer(cgrp->name, name); +  	cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);  	if (cgrp->id < 0) -		goto err_free_cgrp; +		goto err_free_name;  	/*  	 * Only live parents can have children.  Note that the liveliness @@ -4198,7 +4149,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	cgrp->parent = parent;  	cgrp->root = parent->root; -	cgrp->top_cgroup = parent->top_cgroup;  	if (notify_on_release(parent))  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4241,6 +4191,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	for_each_subsys(root, ss)  		dget(dentry); +	/* hold a ref to the parent's dentry */ +	dget(parent->dentry); +  	/* creation succeeded, notify subsystems */  	for_each_subsys(root, ss) {  		err = online_css(ss, cgrp); @@ -4276,6 +4229,8 @@ err_free_all:  	deactivate_super(sb);  err_free_id:  	ida_simple_remove(&root->cgroup_ida, cgrp->id); +err_free_name: +	kfree(rcu_dereference_raw(cgrp->name));  err_free_cgrp:  	kfree(cgrp);  	return err; @@ -4295,56 +4250,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  	return cgroup_create(c_parent, dentry, mode | S_IFDIR);  } -/* - * Check the reference count on each subsystem. Since we already - * established that there are no tasks in the cgroup, if the css refcount - * is also 1, then there should be no outstanding references, so the - * subsystem is safe to destroy. We scan across all subsystems rather than - * using the per-hierarchy linked list of mounted subsystems since we can - * be called via check_for_release() with no synchronization other than - * RCU, and the subsystem linked list isn't RCU-safe. - */ -static int cgroup_has_css_refs(struct cgroup *cgrp) -{ -	int i; - -	/* -	 * We won't need to lock the subsys array, because the subsystems -	 * we're concerned about aren't going anywhere since our cgroup root -	 * has a reference on them. -	 */ -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -		struct cgroup_subsys *ss = subsys[i]; -		struct cgroup_subsys_state *css; - -		/* Skip subsystems not present or not in this hierarchy */ -		if (ss == NULL || ss->root != cgrp->root) -			continue; - -		css = cgrp->subsys[ss->subsys_id]; -		/* -		 * When called from check_for_release() it's possible -		 * that by this point the cgroup has been removed -		 * and the css deleted. But a false-positive doesn't -		 * matter, since it can only happen if the cgroup -		 * has been deleted and hence no longer needs the -		 * release agent to be called anyway. 
-		 */ -		if (css && css_refcnt(css) > 1) -			return 1; -	} -	return 0; -} -  static int cgroup_destroy_locked(struct cgroup *cgrp)  	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)  {  	struct dentry *d = cgrp->dentry;  	struct cgroup *parent = cgrp->parent; -	DEFINE_WAIT(wait);  	struct cgroup_event *event, *tmp;  	struct cgroup_subsys *ss; -	LIST_HEAD(tmp_list);  	lockdep_assert_held(&d->d_inode->i_mutex);  	lockdep_assert_held(&cgroup_mutex); @@ -4932,17 +4844,17 @@ void cgroup_post_fork(struct task_struct *child)  	 * and addition to css_set.  	 */  	if (need_forkexit_callback) { -		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { +		/* +		 * fork/exit callbacks are supported only for builtin +		 * subsystems, and the builtin section of the subsys +		 * array is immutable, so we don't need to lock the +		 * subsys array here. On the other hand, modular section +		 * of the array can be freed at module unload, so we +		 * can't touch that. +		 */ +		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {  			struct cgroup_subsys *ss = subsys[i]; -			/* -			 * fork/exit callbacks are supported only for -			 * builtin subsystems and we don't need further -			 * synchronization as they never go away. -			 */ -			if (!ss || ss->module) -				continue; -  			if (ss->fork)  				ss->fork(child);  		} @@ -5007,13 +4919,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)  	tsk->cgroups = &init_css_set;  	if (run_callbacks && need_forkexit_callback) { -		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { +		/* +		 * fork/exit callbacks are supported only for builtin +		 * subsystems, see cgroup_post_fork() for details. +		 */ +		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {  			struct cgroup_subsys *ss = subsys[i]; -			/* modular subsystems can't use callbacks */ -			if (!ss || ss->module) -				continue; -  			if (ss->exit) {  				struct cgroup *old_cgrp =  					rcu_dereference_raw(cg->subsys[i])->cgroup; @@ -5027,44 +4939,19 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)  	put_css_set_taskexit(cg);  } -/** - * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp - * @cgrp: the cgroup in question - * @task: the task in question - * - * See if @cgrp is a descendant of @task's cgroup in the appropriate - * hierarchy. - * - * If we are sending in dummytop, then presumably we are creating - * the top cgroup in the subsystem. - * - * Called only by the ns (nsproxy) cgroup. - */ -int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) -{ -	int ret; -	struct cgroup *target; - -	if (cgrp == dummytop) -		return 1; - -	target = task_cgroup_from_root(task, cgrp->root); -	while (cgrp != target && cgrp!= cgrp->top_cgroup) -		cgrp = cgrp->parent; -	ret = (cgrp == target); -	return ret; -} -  static void check_for_release(struct cgroup *cgrp)  {  	/* All of these checks rely on RCU to keep the cgroup  	 * structure alive */ -	if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) -	    && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { -		/* Control Group is currently removeable. If it's not +	if (cgroup_is_releasable(cgrp) && +	    !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { +		/* +		 * Control Group is currently removeable. 
If it's not  		 * already queued for a userspace notification, queue -		 * it now */ +		 * it now +		 */  		int need_schedule_work = 0; +  		raw_spin_lock(&release_list_lock);  		if (!cgroup_is_removed(cgrp) &&  		    list_empty(&cgrp->release_list)) { @@ -5097,24 +4984,11 @@ EXPORT_SYMBOL_GPL(__css_tryget);  /* Caller must verify that the css is not for root cgroup */  void __css_put(struct cgroup_subsys_state *css)  { -	struct cgroup *cgrp = css->cgroup;  	int v; -	rcu_read_lock();  	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); - -	switch (v) { -	case 1: -		if (notify_on_release(cgrp)) { -			set_bit(CGRP_RELEASABLE, &cgrp->flags); -			check_for_release(cgrp); -		} -		break; -	case 0: +	if (v == 0)  		schedule_work(&css->dput_work); -		break; -	} -	rcu_read_unlock();  }  EXPORT_SYMBOL_GPL(__css_put); @@ -5413,55 +5287,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)  }  EXPORT_SYMBOL_GPL(css_lookup); -/** - * css_get_next - lookup next cgroup under specified hierarchy. - * @ss: pointer to subsystem - * @id: current position of iteration. - * @root: pointer to css. search tree under this. - * @foundid: position of found object. - * - * Search next css under the specified hierarchy of rootid. Calling under - * rcu_read_lock() is necessary. Returns NULL if it reaches the end. - */ -struct cgroup_subsys_state * -css_get_next(struct cgroup_subsys *ss, int id, -	     struct cgroup_subsys_state *root, int *foundid) -{ -	struct cgroup_subsys_state *ret = NULL; -	struct css_id *tmp; -	int tmpid; -	int rootid = css_id(root); -	int depth = css_depth(root); - -	if (!rootid) -		return NULL; - -	BUG_ON(!ss->use_id); -	WARN_ON_ONCE(!rcu_read_lock_held()); - -	/* fill start point for scan */ -	tmpid = id; -	while (1) { -		/* -		 * scan next entry from bitmap(tree), tmpid is updated after -		 * idr_get_next(). -		 */ -		tmp = idr_get_next(&ss->idr, &tmpid); -		if (!tmp) -			break; -		if (tmp->depth >= depth && tmp->stack[depth] == rootid) { -			ret = rcu_dereference(tmp->css); -			if (ret) { -				*foundid = tmpid; -				break; -			} -		} -		/* continue to scan from next id */ -		tmpid = tmpid + 1; -	} -	return ret; -} -  /*   * get corresponding css from file open on cgroupfs directory   */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecb..12331120767 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -265,17 +265,6 @@ static DEFINE_MUTEX(cpuset_mutex);  static DEFINE_MUTEX(callback_mutex);  /* - * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist - * buffers.  They are statically allocated to prevent using excess stack - * when calling cpuset_print_task_mems_allowed(). - */ -#define CPUSET_NAME_LEN		(128) -#define	CPUSET_NODELIST_LEN	(256) -static char cpuset_name[CPUSET_NAME_LEN]; -static char cpuset_nodelist[CPUSET_NODELIST_LEN]; -static DEFINE_SPINLOCK(cpuset_buffer_lock); - -/*   * CPU / memory hotplug is handled asynchronously.   */  static struct workqueue_struct *cpuset_propagate_hotplug_wq; @@ -780,25 +769,26 @@ static void rebuild_sched_domains_locked(void)  	lockdep_assert_held(&cpuset_mutex);  	get_online_cpus(); +	/* +	 * We have raced with CPU hotplug. Don't do anything to avoid +	 * passing doms with offlined cpu to partition_sched_domains(). +	 * Anyways, hotplug work item will rebuild sched domains. 
+	 */ +	if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) +		goto out; +  	/* Generate domain masks and attrs */  	ndoms = generate_sched_domains(&doms, &attr);  	/* Have scheduler rebuild the domains */  	partition_sched_domains(ndoms, doms, attr); - +out:  	put_online_cpus();  }  #else /* !CONFIG_SMP */  static void rebuild_sched_domains_locked(void)  {  } - -static int generate_sched_domains(cpumask_var_t **domains, -			struct sched_domain_attr **attributes) -{ -	*domains = NULL; -	return 1; -}  #endif /* CONFIG_SMP */  void rebuild_sched_domains(void) @@ -1388,16 +1378,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  	cgroup_taskset_for_each(task, cgrp, tset) {  		/* -		 * Kthreads bound to specific cpus cannot be moved to a new -		 * cpuset; we cannot change their cpu affinity and -		 * isolating such threads by their set of allowed nodes is -		 * unnecessary.  Thus, cpusets are not applicable for such -		 * threads.  This prevents checking for success of -		 * set_cpus_allowed_ptr() on all attached tasks before -		 * cpus_allowed may be changed. +		 * Kthreads which disallow setaffinity shouldn't be moved +		 * to a new cpuset; we don't want to change their cpu +		 * affinity and isolating such threads by their set of +		 * allowed nodes is unnecessary.  Thus, cpusets are not +		 * applicable for such threads.  This prevents checking for +		 * success of set_cpus_allowed_ptr() on all attached tasks +		 * before cpus_allowed may be changed.  		 */  		ret = -EINVAL; -		if (task->flags & PF_THREAD_BOUND) +		if (task->flags & PF_NO_SETAFFINITY)  			goto out_unlock;  		ret = security_task_setscheduler(task);  		if (ret) @@ -2005,50 +1995,6 @@ int __init cpuset_init(void)  	return 0;  } -/** - * cpuset_do_move_task - move a given task to another cpuset - * @tsk: pointer to task_struct the task to move - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner - * - * Called by cgroup_scan_tasks() for each task in a cgroup. - * Return nonzero to stop the walk through the tasks. - */ -static void cpuset_do_move_task(struct task_struct *tsk, -				struct cgroup_scanner *scan) -{ -	struct cgroup *new_cgroup = scan->data; - -	cgroup_lock(); -	cgroup_attach_task(new_cgroup, tsk); -	cgroup_unlock(); -} - -/** - * move_member_tasks_to_cpuset - move tasks from one cpuset to another - * @from: cpuset in which the tasks currently reside - * @to: cpuset to which the tasks will be moved - * - * Called with cpuset_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. - * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. 
- */ -static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) -{ -	struct cgroup_scanner scan; - -	scan.cg = from->css.cgroup; -	scan.test_task = NULL; /* select all tasks in cgroup */ -	scan.process_task = cpuset_do_move_task; -	scan.heap = NULL; -	scan.data = to->css.cgroup; - -	if (cgroup_scan_tasks(&scan)) -		printk(KERN_ERR "move_member_tasks_to_cpuset: " -				"cgroup_scan_tasks failed\n"); -} -  /*   * If CPU and/or memory hotplug handlers, below, unplug any CPUs   * or memory nodes, we need to walk over the cpuset hierarchy, @@ -2069,7 +2015,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)  			nodes_empty(parent->mems_allowed))  		parent = parent_cs(parent); -	move_member_tasks_to_cpuset(cs, parent); +	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { +		rcu_read_lock(); +		printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", +		       cgroup_name(cs->css.cgroup)); +		rcu_read_unlock(); +	}  }  /** @@ -2222,17 +2173,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  	flush_workqueue(cpuset_propagate_hotplug_wq);  	/* rebuild sched domains if cpus_allowed has changed */ -	if (cpus_updated) { -		struct sched_domain_attr *attr; -		cpumask_var_t *doms; -		int ndoms; - -		mutex_lock(&cpuset_mutex); -		ndoms = generate_sched_domains(&doms, &attr); -		mutex_unlock(&cpuset_mutex); - -		partition_sched_domains(ndoms, doms, attr); -	} +	if (cpus_updated) +		rebuild_sched_domains();  }  void cpuset_update_active_cpus(bool cpu_online) @@ -2251,7 +2193,6 @@ void cpuset_update_active_cpus(bool cpu_online)  	schedule_work(&cpuset_hotplug_work);  } -#ifdef CONFIG_MEMORY_HOTPLUG  /*   * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].   * Call this routine anytime after node_states[N_MEMORY] changes. @@ -2263,20 +2204,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self,  	schedule_work(&cpuset_hotplug_work);  	return NOTIFY_OK;  } -#endif + +static struct notifier_block cpuset_track_online_nodes_nb = { +	.notifier_call = cpuset_track_online_nodes, +	.priority = 10,		/* ??! */ +};  /**   * cpuset_init_smp - initialize cpus_allowed   *   * Description: Finish top cpuset after cpu, node maps are initialized - **/ - + */  void __init cpuset_init_smp(void)  {  	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);  	top_cpuset.mems_allowed = node_states[N_MEMORY]; -	hotplug_memory_notifier(cpuset_track_online_nodes, 10); +	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);  	cpuset_propagate_hotplug_wq =  		alloc_ordered_workqueue("cpuset_hotplug", 0); @@ -2592,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,  	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);  } +#define CPUSET_NODELIST_LEN	(256) +  /**   * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed   * @task: pointer to task_struct of some task. @@ -2602,25 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,   */  void cpuset_print_task_mems_allowed(struct task_struct *tsk)  { -	struct dentry *dentry; +	 /* Statically allocated to prevent using excess stack. 
*/ +	static char cpuset_nodelist[CPUSET_NODELIST_LEN]; +	static DEFINE_SPINLOCK(cpuset_buffer_lock); -	dentry = task_cs(tsk)->css.cgroup->dentry; -	spin_lock(&cpuset_buffer_lock); +	struct cgroup *cgrp = task_cs(tsk)->css.cgroup; -	if (!dentry) { -		strcpy(cpuset_name, "/"); -	} else { -		spin_lock(&dentry->d_lock); -		strlcpy(cpuset_name, (const char *)dentry->d_name.name, -			CPUSET_NAME_LEN); -		spin_unlock(&dentry->d_lock); -	} +	rcu_read_lock(); +	spin_lock(&cpuset_buffer_lock);  	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,  			   tsk->mems_allowed);  	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", -	       tsk->comm, cpuset_name, cpuset_nodelist); +	       tsk->comm, cgroup_name(cgrp), cpuset_nodelist); +  	spin_unlock(&cpuset_buffer_lock); +	rcu_read_unlock();  }  /* diff --git a/kernel/events/core.c b/kernel/events/core.c index b0cd86501c3..3820e3cefba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -37,6 +37,7 @@  #include <linux/ftrace_event.h>  #include <linux/hw_breakpoint.h>  #include <linux/mm_types.h> +#include <linux/cgroup.h>  #include "internal.h" @@ -234,6 +235,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,  #ifdef CONFIG_CGROUP_PERF  /* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { +	u64				time; +	u64				timestamp; +}; + +struct perf_cgroup { +	struct cgroup_subsys_state	css; +	struct perf_cgroup_info	__percpu *info; +}; + +/*   * Must ensure cgroup is pinned (css_get) before calling   * this function. In other words, we cannot call this function   * if there is no cgroup event for the current CPU context. @@ -251,7 +266,22 @@ perf_cgroup_match(struct perf_event *event)  	struct perf_event_context *ctx = event->ctx;  	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); -	return !event->cgrp || event->cgrp == cpuctx->cgrp; +	/* @event doesn't care about cgroup */ +	if (!event->cgrp) +		return true; + +	/* wants specific cgroup scope but @cpuctx isn't associated with any */ +	if (!cpuctx->cgrp) +		return false; + +	/* +	 * Cgroup scoping is recursive.  An event enabled for a cgroup is +	 * also enabled for all its descendant cgroups.  If @cpuctx's +	 * cgroup is a descendant of @event's (the test covers identity +	 * case), it's a match. 
+	 */ +	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, +				    event->cgrp->css.cgroup);  }  static inline bool perf_tryget_cgroup(struct perf_event *event) @@ -961,9 +991,15 @@ static void perf_event__header_size(struct perf_event *event)  	if (sample_type & PERF_SAMPLE_PERIOD)  		size += sizeof(data->period); +	if (sample_type & PERF_SAMPLE_WEIGHT) +		size += sizeof(data->weight); +  	if (sample_type & PERF_SAMPLE_READ)  		size += event->read_size; +	if (sample_type & PERF_SAMPLE_DATA_SRC) +		size += sizeof(data->data_src.val); +  	event->header_size = size;  } @@ -4178,6 +4214,12 @@ void perf_output_sample(struct perf_output_handle *handle,  		perf_output_sample_ustack(handle,  					  data->stack_user_size,  					  data->regs_user.regs); + +	if (sample_type & PERF_SAMPLE_WEIGHT) +		perf_output_put(handle, data->weight); + +	if (sample_type & PERF_SAMPLE_DATA_SRC) +		perf_output_put(handle, data->data_src.val);  }  void perf_prepare_sample(struct perf_event_header *header, @@ -4434,12 +4476,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)  			if (ctxn < 0)  				goto next;  			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); +			if (ctx) +				perf_event_task_ctx(ctx, task_event);  		} -		if (ctx) -			perf_event_task_ctx(ctx, task_event);  next:  		put_cpu_ptr(pmu->pmu_cpu_context);  	} +	if (task_event->task_ctx) +		perf_event_task_ctx(task_event->task_ctx, task_event); +  	rcu_read_unlock();  } @@ -4593,6 +4638,7 @@ void perf_event_comm(struct task_struct *task)  	struct perf_event_context *ctx;  	int ctxn; +	rcu_read_lock();  	for_each_task_context_nr(ctxn) {  		ctx = task->perf_event_ctxp[ctxn];  		if (!ctx) @@ -4600,6 +4646,7 @@ void perf_event_comm(struct task_struct *task)  		perf_event_enable_on_exec(ctx);  	} +	rcu_read_unlock();  	if (!atomic_read(&nr_comm_events))  		return; @@ -4734,7 +4781,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)  	} else {  		if (arch_vma_name(mmap_event->vma)) {  			name = strncpy(tmp, arch_vma_name(mmap_event->vma), -				       sizeof(tmp)); +				       sizeof(tmp) - 1); +			tmp[sizeof(tmp) - 1] = '\0';  			goto got_name;  		} @@ -4761,6 +4809,9 @@ got_name:  	mmap_event->file_name = name;  	mmap_event->file_size = size; +	if (!(vma->vm_flags & VM_EXEC)) +		mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; +  	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;  	rcu_read_lock(); @@ -5327,7 +5378,7 @@ static void sw_perf_event_destroy(struct perf_event *event)  static int perf_swevent_init(struct perf_event *event)  { -	int event_id = event->attr.config; +	u64 event_id = event->attr.config;  	if (event->attr.type != PERF_TYPE_SOFTWARE)  		return -ENOENT; @@ -5647,6 +5698,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)  		event->attr.sample_period = NSEC_PER_SEC / freq;  		hwc->sample_period = event->attr.sample_period;  		local64_set(&hwc->period_left, hwc->sample_period); +		hwc->last_period = hwc->sample_period;  		event->attr.freq = 0;  	}  } @@ -5982,6 +6034,7 @@ skip_type:  	if (pmu->pmu_cpu_context)  		goto got_cpu_context; +	ret = -ENOMEM;  	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);  	if (!pmu->pmu_cpu_context)  		goto free_dev; @@ -7509,12 +7562,5 @@ struct cgroup_subsys perf_subsys = {  	.css_free	= perf_cgroup_css_free,  	.exit		= perf_cgroup_exit,  	.attach		= perf_cgroup_attach, - -	/* -	 * perf_event cgroup doesn't handle nesting correctly. 
-	 * ctx->nr_cgroups adjustments should be propagated through the -	 * cgroup hierarchy.  Fix it and remove the following. -	 */ -	.broken_hierarchy = true,  };  #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d56a64c99a8..eb675c4d59d 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -16,7 +16,7 @@ struct ring_buffer {  	int				page_order;	/* allocation order  */  #endif  	int				nr_pages;	/* nr of data pages  */ -	int				writable;	/* are we writable   */ +	int				overwrite;	/* can overwrite itself */  	atomic_t			poll;		/* POLL_ for wakeups */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 23cb34ff397..97fddb09762 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -18,12 +18,24 @@  static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,  			      unsigned long offset, unsigned long head)  { -	unsigned long mask; +	unsigned long sz = perf_data_size(rb); +	unsigned long mask = sz - 1; -	if (!rb->writable) +	/* +	 * check if user-writable +	 * overwrite : over-write its own tail +	 * !overwrite: buffer possibly drops events. +	 */ +	if (rb->overwrite)  		return true; -	mask = perf_data_size(rb) - 1; +	/* +	 * verify that payload is not bigger than buffer +	 * otherwise masking logic may fail to detect +	 * the "not enough space" condition +	 */ +	if ((head - offset) > sz) +		return false;  	offset = (offset - tail) & mask;  	head   = (head   - tail) & mask; @@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)  		rb->watermark = max_size / 2;  	if (flags & RING_BUFFER_WRITABLE) -		rb->writable = 1; +		rb->overwrite = 0; +	else +		rb->overwrite = 1;  	atomic_set(&rb->refcount, 1); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a567c8c7ef3..f3569747d62 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -75,6 +75,15 @@ struct uprobe {  	struct arch_uprobe	arch;  }; +struct return_instance { +	struct uprobe		*uprobe; +	unsigned long		func; +	unsigned long		orig_ret_vaddr; /* original return address */ +	bool			chained;	/* true, if instance is nested */ + +	struct return_instance	*next;		/* keep as stack */ +}; +  /*   * valid_vma: Verify if the specified vma is an executable vma   * Relax restrictions while unregistering: vm_flags might have @@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)  	return *insn == UPROBE_SWBP_INSN;  } -static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) +/** + * is_trap_insn - check if instruction is breakpoint instruction. + * @insn: instruction to be checked. + * Default implementation of is_trap_insn + * Returns true if @insn is a breakpoint instruction. + * + * This function is needed for the case where an architecture has multiple + * trap instructions (like powerpc). 
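
The new "(head - offset) > sz" guard in perf_output_space() exists because the rest of the arithmetic is carried out modulo the (power-of-two) buffer size. A standalone demonstration of how the masked distance under-reports a payload larger than the buffer (the numbers are arbitrary):

	#include <stdio.h>

	int main(void)
	{
		unsigned long sz = 4096, mask = sz - 1;
		unsigned long offset = 100;
		unsigned long head = offset + sz + 50;	/* payload larger than the buffer */

		/* Without the explicit size check, only the masked distance is
		 * seen, and it wraps to something that looks harmless. */
		printf("real length %lu, masked length %lu\n",
		       head - offset, (head - offset) & mask);
		return 0;
	}
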
+ */ +bool __weak is_trap_insn(uprobe_opcode_t *insn) +{ +	return is_swbp_insn(insn); +} + +static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)  {  	void *kaddr = kmap_atomic(page); -	memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); +	memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); +	kunmap_atomic(kaddr); +} + +static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) +{ +	void *kaddr = kmap_atomic(page); +	memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);  	kunmap_atomic(kaddr);  } @@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t  	uprobe_opcode_t old_opcode;  	bool is_swbp; -	copy_opcode(page, vaddr, &old_opcode); +	/* +	 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. +	 * We do not check if it is any other 'trap variant' which could +	 * be conditional trap instruction such as the one powerpc supports. +	 * +	 * The logic is that we do not care if the underlying instruction +	 * is a trap variant; uprobes always wins over any other (gdb) +	 * breakpoint. +	 */ +	copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);  	is_swbp = is_swbp_insn(&old_opcode);  	if (is_swbp_insn(new_opcode)) { @@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t   * Expect the breakpoint instruction to be the smallest size instruction for   * the architecture. If an arch has variable length instruction and the   * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify is_swbp_at_addr and + * supported by that architecture then we need to modify is_trap_at_addr and   * write_opcode accordingly. This would never be a problem for archs that   * have fixed length instructions.   */ @@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr,  			uprobe_opcode_t opcode)  {  	struct page *old_page, *new_page; -	void *vaddr_old, *vaddr_new;  	struct vm_area_struct *vma;  	int ret; @@ -246,15 +284,8 @@ retry:  	__SetPageUptodate(new_page); -	/* copy the page now that we've got it stable */ -	vaddr_old = kmap_atomic(old_page); -	vaddr_new = kmap_atomic(new_page); - -	memcpy(vaddr_new, vaddr_old, PAGE_SIZE); -	memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); - -	kunmap_atomic(vaddr_new); -	kunmap_atomic(vaddr_old); +	copy_highpage(new_page, old_page); +	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);  	ret = anon_vma_prepare(vma);  	if (ret) @@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,  			unsigned long nbytes, loff_t offset)  {  	struct page *page; -	void *vaddr; -	unsigned long off; -	pgoff_t idx; - -	if (!filp) -		return -EINVAL;  	if (!mapping->a_ops->readpage)  		return -EIO; - -	idx = offset >> PAGE_CACHE_SHIFT; -	off = offset & ~PAGE_MASK; -  	/*  	 * Ensure that the page that has the original instruction is  	 * populated and in page-cache.  	 
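
The copy_from_page()/copy_to_page() helpers introduced above reduce to "memcpy() at the in-page offset of vaddr", bracketed by kmap_atomic()/kunmap_atomic(). A userspace analogue of just the offset arithmetic (PAGE_SIZE, the backing buffer and the helper names here are purely illustrative):

	#include <stdio.h>
	#include <string.h>

	#define PAGE_SIZE	4096UL
	#define PAGE_MASK	(~(PAGE_SIZE - 1))

	static void copy_to_page_buf(char *page, unsigned long vaddr,
				     const void *src, int len)
	{
		memcpy(page + (vaddr & ~PAGE_MASK), src, len);
	}

	static void copy_from_page_buf(const char *page, unsigned long vaddr,
				       void *dst, int len)
	{
		memcpy(dst, page + (vaddr & ~PAGE_MASK), len);
	}

	int main(void)
	{
		static char page[PAGE_SIZE];
		unsigned long vaddr = 0x400123;	/* arbitrary address inside the page */
		unsigned char byte = 0;

		copy_to_page_buf(page, vaddr, "\xcc", 1);	/* e.g. a breakpoint opcode */
		copy_from_page_buf(page, vaddr, &byte, 1);
		printf("in-page offset %#lx, byte %#x\n", vaddr & ~PAGE_MASK, byte);
		return 0;
	}
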
*/ -	page = read_mapping_page(mapping, idx, filp); +	page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);  	if (IS_ERR(page))  		return PTR_ERR(page); -	vaddr = kmap_atomic(page); -	memcpy(insn, vaddr + off, nbytes); -	kunmap_atomic(vaddr); +	copy_from_page(page, offset, insn, nbytes);  	page_cache_release(page);  	return 0; @@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,  		goto out;  	ret = -ENOTSUPP; -	if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) +	if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))  		goto out;  	ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); @@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)  		down_write(&mm->mmap_sem);  		vma = find_vma(mm, info->vaddr);  		if (!vma || !valid_vma(vma, is_register) || -		    vma->vm_file->f_mapping->host != uprobe->inode) +		    file_inode(vma->vm_file) != uprobe->inode)  			goto unlock;  		if (vma->vm_start > info->vaddr || @@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *  	struct uprobe *uprobe;  	int ret; +	/* Uprobe must have at least one set consumer */ +	if (!uc->handler && !uc->ret_handler) +		return -EINVAL; +  	/* Racy, just to catch the obvious mistakes */  	if (offset > i_size_read(inode))  		return -EINVAL; @@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)  		loff_t offset;  		if (!valid_vma(vma, false) || -		    vma->vm_file->f_mapping->host != uprobe->inode) +		    file_inode(vma->vm_file) != uprobe->inode)  			continue;  		offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; @@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma)  	if (no_uprobe_events() || !valid_vma(vma, true))  		return 0; -	inode = vma->vm_file->f_mapping->host; +	inode = file_inode(vma->vm_file);  	if (!inode)  		return 0; @@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e  	struct inode *inode;  	struct rb_node *n; -	inode = vma->vm_file->f_mapping->host; +	inode = file_inode(vma->vm_file);  	min = vaddr_to_offset(vma, start);  	max = min + (end - start) - 1; @@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void)  {  	struct mm_struct *mm = current->mm;  	struct xol_area *area; +	uprobe_opcode_t insn = UPROBE_SWBP_INSN;  	area = mm->uprobes_state.xol_area;  	if (area) @@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void)  	if (!area->page)  		goto free_bitmap; +	/* allocate first slot of task's xol_area for the return probes */ +	set_bit(0, area->bitmap); +	copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); +	atomic_set(&area->slot_count, 1);  	init_waitqueue_head(&area->wq); +  	if (!xol_add_vma(area))  		return area; @@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)  static unsigned long xol_get_insn_slot(struct uprobe *uprobe)  {  	struct xol_area *area; -	unsigned long offset;  	unsigned long xol_vaddr; -	void *vaddr;  	area = get_xol_area();  	if (!area) @@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)  		return 0;  	/* Initialize the slot */ -	offset = xol_vaddr & ~PAGE_MASK; -	vaddr = kmap_atomic(area->page); -	memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); -	kunmap_atomic(vaddr); +	copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);  	/*  	 * We probably need flush_icache_user_range() but it needs vma.  	 
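
Several hunks above replace open-coded vma->vm_file->f_mapping->host chains with file_inode(). The change is simply a move to an accessor helper; a stubbed-out sketch of the idea (the struct layouts below are placeholders, not the kernel's):

	#include <stdio.h>

	struct inode { long i_ino; };
	struct file  { struct inode *f_inode; };

	/* One inline accessor instead of callers dereferencing the chain. */
	static inline struct inode *file_inode(const struct file *f)
	{
		return f->f_inode;
	}

	int main(void)
	{
		struct inode ino = { .i_ino = 42 };
		struct file filp = { .f_inode = &ino };

		printf("inode %ld\n", file_inode(&filp)->i_ino);
		return 0;
	}
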
* This should work on supported architectures too. @@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)  void uprobe_free_utask(struct task_struct *t)  {  	struct uprobe_task *utask = t->utask; +	struct return_instance *ri, *tmp;  	if (!utask)  		return; @@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t)  	if (utask->active_uprobe)  		put_uprobe(utask->active_uprobe); +	ri = utask->return_instances; +	while (ri) { +		tmp = ri; +		ri = ri->next; + +		put_uprobe(tmp->uprobe); +		kfree(tmp); +	} +  	xol_free_insn_slot(t);  	kfree(utask);  	t->utask = NULL; @@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void)  	return current->utask;  } +/* + * Current area->vaddr notion assume the trampoline address is always + * equal area->vaddr. + * + * Returns -1 in case the xol_area is not allocated. + */ +static unsigned long get_trampoline_vaddr(void) +{ +	struct xol_area *area; +	unsigned long trampoline_vaddr = -1; + +	area = current->mm->uprobes_state.xol_area; +	smp_read_barrier_depends(); +	if (area) +		trampoline_vaddr = area->vaddr; + +	return trampoline_vaddr; +} + +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +{ +	struct return_instance *ri; +	struct uprobe_task *utask; +	unsigned long orig_ret_vaddr, trampoline_vaddr; +	bool chained = false; + +	if (!get_xol_area()) +		return; + +	utask = get_utask(); +	if (!utask) +		return; + +	if (utask->depth >= MAX_URETPROBE_DEPTH) { +		printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" +				" nestedness limit pid/tgid=%d/%d\n", +				current->pid, current->tgid); +		return; +	} + +	ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); +	if (!ri) +		goto fail; + +	trampoline_vaddr = get_trampoline_vaddr(); +	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); +	if (orig_ret_vaddr == -1) +		goto fail; + +	/* +	 * We don't want to keep trampoline address in stack, rather keep the +	 * original return address of first caller thru all the consequent +	 * instances. This also makes breakpoint unwrapping easier. +	 */ +	if (orig_ret_vaddr == trampoline_vaddr) { +		if (!utask->return_instances) { +			/* +			 * This situation is not possible. Likely we have an +			 * attack from user-space. +			 */ +			pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", +						current->pid, current->tgid); +			goto fail; +		} + +		chained = true; +		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; +	} + +	atomic_inc(&uprobe->ref); +	ri->uprobe = uprobe; +	ri->func = instruction_pointer(regs); +	ri->orig_ret_vaddr = orig_ret_vaddr; +	ri->chained = chained; + +	utask->depth++; + +	/* add instance to the stack */ +	ri->next = utask->return_instances; +	utask->return_instances = ri; + +	return; + + fail: +	kfree(ri); +} +  /* Prepare to single-step probed instruction out of line. 
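
prepare_uretprobe() above pushes a return_instance onto a per-task stack and, for a chained hit (the hijacked return address is already the trampoline), reuses the original return address of the entry below it. A userspace toy model of that bookkeeping (names and the malloc()-based storage are illustrative only):

	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct ret_inst {
		unsigned long	func;
		unsigned long	orig_ret;
		bool		chained;
		struct ret_inst	*next;
	};

	static struct ret_inst *stack;	/* top of the per-task return stack */

	static void push_ret(unsigned long func, unsigned long orig_ret,
			     unsigned long trampoline)
	{
		bool chained = (orig_ret == trampoline);
		struct ret_inst *ri;

		if (chained) {
			if (!stack)		/* mirrors the sanity check above */
				return;
			orig_ret = stack->orig_ret;
		}

		ri = malloc(sizeof(*ri));
		if (!ri)
			return;
		ri->func = func;
		ri->orig_ret = orig_ret;
		ri->chained = chained;
		ri->next = stack;
		stack = ri;
	}

	int main(void)
	{
		unsigned long trampoline = 0xf000;

		push_ret(0x1000, 0x4000, trampoline);		/* outermost probed call */
		push_ret(0x2000, trampoline, trampoline);	/* nested probed call */

		for (struct ret_inst *ri = stack; ri; ri = ri->next)
			printf("func=%#lx ret=%#lx chained=%d\n",
			       ri->func, ri->orig_ret, ri->chained);
		return 0;
	}
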
*/  static int  pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) @@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)  	clear_bit(MMF_HAS_UPROBES, &mm->flags);  } -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)  {  	struct page *page;  	uprobe_opcode_t opcode; @@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)  	if (result < 0)  		return result; -	copy_opcode(page, vaddr, &opcode); +	copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);  	put_page(page);   out: -	return is_swbp_insn(&opcode); +	/* This needs to return true for any variant of the trap insn */ +	return is_trap_insn(&opcode);  }  static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) @@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)  	vma = find_vma(mm, bp_vaddr);  	if (vma && vma->vm_start <= bp_vaddr) {  		if (valid_vma(vma, false)) { -			struct inode *inode = vma->vm_file->f_mapping->host; +			struct inode *inode = file_inode(vma->vm_file);  			loff_t offset = vaddr_to_offset(vma, bp_vaddr);  			uprobe = find_uprobe(inode, offset);  		}  		if (!uprobe) -			*is_swbp = is_swbp_at_addr(mm, bp_vaddr); +			*is_swbp = is_trap_at_addr(mm, bp_vaddr);  	} else {  		*is_swbp = -EFAULT;  	} @@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)  {  	struct uprobe_consumer *uc;  	int remove = UPROBE_HANDLER_REMOVE; +	bool need_prep = false; /* prepare return uprobe, when needed */  	down_read(&uprobe->register_rwsem);  	for (uc = uprobe->consumers; uc; uc = uc->next) { -		int rc = uc->handler(uc, regs); +		int rc = 0; + +		if (uc->handler) { +			rc = uc->handler(uc, regs); +			WARN(rc & ~UPROBE_HANDLER_MASK, +				"bad rc=0x%x from %pf()\n", rc, uc->handler); +		} + +		if (uc->ret_handler) +			need_prep = true; -		WARN(rc & ~UPROBE_HANDLER_MASK, -			"bad rc=0x%x from %pf()\n", rc, uc->handler);  		remove &= rc;  	} +	if (need_prep && !remove) +		prepare_uretprobe(uprobe, regs); /* put bp at return */ +  	if (remove && uprobe->consumers) {  		WARN_ON(!uprobe_is_active(uprobe));  		unapply_uprobe(uprobe, current->mm); @@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)  	up_read(&uprobe->register_rwsem);  } +static void +handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) +{ +	struct uprobe *uprobe = ri->uprobe; +	struct uprobe_consumer *uc; + +	down_read(&uprobe->register_rwsem); +	for (uc = uprobe->consumers; uc; uc = uc->next) { +		if (uc->ret_handler) +			uc->ret_handler(uc, ri->func, regs); +	} +	up_read(&uprobe->register_rwsem); +} + +static bool handle_trampoline(struct pt_regs *regs) +{ +	struct uprobe_task *utask; +	struct return_instance *ri, *tmp; +	bool chained; + +	utask = current->utask; +	if (!utask) +		return false; + +	ri = utask->return_instances; +	if (!ri) +		return false; + +	/* +	 * TODO: we should throw out return_instance's invalidated by +	 * longjmp(), currently we assume that the probed function always +	 * returns. 
+	 */ +	instruction_pointer_set(regs, ri->orig_ret_vaddr); + +	for (;;) { +		handle_uretprobe_chain(ri, regs); + +		chained = ri->chained; +		put_uprobe(ri->uprobe); + +		tmp = ri; +		ri = ri->next; +		kfree(tmp); + +		if (!chained) +			break; + +		utask->depth--; + +		BUG_ON(!ri); +	} + +	utask->return_instances = ri; + +	return true; +} +  /*   * Run handler and ask thread to singlestep.   * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs)  	int uninitialized_var(is_swbp);  	bp_vaddr = uprobe_get_swbp_addr(regs); -	uprobe = find_active_uprobe(bp_vaddr, &is_swbp); +	if (bp_vaddr == get_trampoline_vaddr()) { +		if (handle_trampoline(regs)) +			return; + +		pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", +						current->pid, current->tgid); +	} +	uprobe = find_active_uprobe(bp_vaddr, &is_swbp);  	if (!uprobe) {  		if (is_swbp > 0) {  			/* No matching uprobe; signal SIGTRAP. */ @@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs)   */  int uprobe_pre_sstep_notifier(struct pt_regs *regs)  { -	if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) +	if (!current->mm) +		return 0; + +	if (!test_bit(MMF_HAS_UPROBES, ¤t->mm->flags) && +	    (!current->utask || !current->utask->return_instances))  		return 0;  	set_thread_flag(TIF_UPROBE); diff --git a/kernel/exit.c b/kernel/exit.c index 51e485ca993..60bc027c61c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code)  	/*  	 * Make sure we are holding no locks:  	 */ -	debug_check_no_locks_held(); +	debug_check_no_locks_held(tsk);  	/*  	 * We can do this unlocked here. The futex code uses this flag  	 * just to verify whether the pi state cleanup has been done diff --git a/kernel/fork.c b/kernel/fork.c index f3146ed4907..339f60dfd62 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))  		return ERR_PTR(-EINVAL); +	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) +		return ERR_PTR(-EINVAL); +  	/*  	 * Thread groups must share signals as well, and detached threads  	 * can only be started up within the thread group. @@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)  	 * If unsharing a user namespace must also unshare the thread.  	 */  	if (unshare_flags & CLONE_NEWUSER) -		unshare_flags |= CLONE_THREAD; +		unshare_flags |= CLONE_THREAD | CLONE_FS;  	/*  	 * If unsharing a pid namespace must also unshare the thread.  	 */ diff --git a/kernel/futex.c b/kernel/futex.c index f0090a993da..b26dcfc02c9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -223,7 +223,8 @@ static void drop_futex_key_refs(union futex_key *key)   * @rw:		mapping needs to be read/write (values: VERIFY_READ,   *              VERIFY_WRITE)   * - * Returns a negative error code or 0 + * Return: a negative error code or 0 + *   * The key words are stored in *key on success.   *   * For shared mappings, it's (page->index, file_inode(vma->vm_file), @@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,   *			be "current" except in the case of requeue pi.   
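
The copy_process() hunk above adds CLONE_NEWUSER|CLONE_FS to the combinations that are rejected outright, using the usual "mask, then compare against the same mask" idiom so the error only fires when both flags are present. A standalone illustration (the flag values are the uapi ones of this era):

	#include <stdio.h>

	#define CLONE_FS	0x00000200
	#define CLONE_NEWNS	0x00020000
	#define CLONE_NEWUSER	0x10000000

	static int check_clone_flags(unsigned long flags)
	{
		if ((flags & (CLONE_NEWNS | CLONE_FS)) == (CLONE_NEWNS | CLONE_FS))
			return -1;
		if ((flags & (CLONE_NEWUSER | CLONE_FS)) == (CLONE_NEWUSER | CLONE_FS))
			return -1;
		return 0;
	}

	int main(void)
	{
		printf("NEWUSER|FS: %d\n", check_clone_flags(CLONE_NEWUSER | CLONE_FS));
		printf("NEWUSER:    %d\n", check_clone_flags(CLONE_NEWUSER));
		return 0;
	}
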
* @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)   * - * Returns: - *  0 - ready to wait - *  1 - acquired the lock + * Return: + *  0 - ready to wait; + *  1 - acquired the lock;   * <0 - error   *   * The hb->lock and futex_key refs shall be held by the caller. @@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,   * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.   * hb1 and hb2 must be held by the caller.   * - * Returns: - *  0 - failed to acquire the lock atomicly - *  1 - acquired the lock + * Return: + *  0 - failed to acquire the lock atomically; + *  1 - acquired the lock;   * <0 - error   */  static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,   * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire   * uaddr2 atomically on behalf of the top waiter.   * - * Returns: - * >=0 - on success, the number of tasks requeued or woken + * Return: + * >=0 - on success, the number of tasks requeued or woken;   *  <0 - on error   */  static int futex_requeue(u32 __user *uaddr1, unsigned int flags, @@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)   * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must   * be paired with exactly one earlier call to queue_me().   * - * Returns: - *   1 - if the futex_q was still queued (and we removed unqueued it) + * Return: + *   1 - if the futex_q was still queued (and we removed unqueued it);   *   0 - if the futex_q was already removed by the waking thread   */  static int unqueue_me(struct futex_q *q) @@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);   * the pi_state owner as well as handle race conditions that may allow us to   * acquire the lock. Must be called with the hb lock held.   * - * Returns: - *  1 - success, lock taken - *  0 - success, lock not taken + * Return: + *  1 - success, lock taken; + *  0 - success, lock not taken;   * <0 - on error (-EFAULT)   */  static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) @@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,   * Return with the hb lock held and a q.key reference on success, and unlocked   * with no q.key reference on failure.   * - * Returns: - *  0 - uaddr contains val and hb has been locked + * Return: + *  0 - uaddr contains val and hb has been locked;   * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked   */  static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, @@ -2203,9 +2204,9 @@ pi_faulted:   * the wakeup and return the appropriate error code to the caller.  Must be   * called with the hb lock held.   
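
The futex comment churn above converts free-form "Returns:" notes into the "Return:" section that the kernel-doc tooling recognizes, with semicolons separating the cases. For reference, a hypothetical function documented in that style (the function itself is a throwaway example):

	/**
	 * sample_trylock - try to take the sample lock once
	 * @lock: pointer to the lock word
	 *
	 * Return:
	 *  1 - acquired the lock;
	 *  0 - the lock was already held
	 */
	static int sample_trylock(int *lock)
	{
		if (*lock)
			return 0;
		*lock = 1;
		return 1;
	}

	int main(void)
	{
		int lock = 0;

		return sample_trylock(&lock) ? 0 : 1;
	}
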
* - * Returns - *  0 - no early wakeup detected - * <0 - -ETIMEDOUT or -ERESTARTNOINTR + * Return: + *  0 = no early wakeup detected; + * <0 = -ETIMEDOUT or -ERESTARTNOINTR   */  static inline  int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,   * @val:	the expected value of uaddr   * @abs_time:	absolute timeout   * @bitset:	32 bit wakeup bitset set by userspace, defaults to all - * @clockrt:	whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)   * @uaddr2:	the pi futex we will take prior to returning to user-space   *   * The caller will wait on uaddr and will be requeued by futex_requeue() to @@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,   * there was a need to.   *   * We call schedule in futex_wait_queue_me() when we enqueue and return there - * via the following: + * via the following--   * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()   * 2) wakeup on uaddr2 after a requeue   * 3) signal @@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,   *   * If 4 or 7, we cleanup and return with -ETIMEDOUT.   * - * Returns: - *  0 - On success + * Return: + *  0 - On success;   * <0 - On error   */  static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3fe..14be27feda4 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -63,6 +63,7 @@  DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =  { +	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),  	.clock_base =  	{  		{ @@ -1642,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)  	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);  	int i; -	raw_spin_lock_init(&cpu_base->lock); -  	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {  		cpu_base->clock_base[i].cpu_base = cpu_base;  		timerqueue_init_head(&cpu_base->clock_base[i].active); diff --git a/kernel/kexec.c b/kernel/kexec.c index bddd3d7a74b..b574920cbd4 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -55,7 +55,7 @@ struct resource crashk_res = {  	.flags = IORESOURCE_BUSY | IORESOURCE_MEM  };  struct resource crashk_low_res = { -	.name  = "Crash kernel low", +	.name  = "Crash kernel",  	.start = 0,  	.end   = 0,  	.flags = IORESOURCE_BUSY | IORESOURCE_MEM @@ -1118,12 +1118,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,  {  	unsigned long addr; -	for (addr = begin; addr < end; addr += PAGE_SIZE) { -		ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); -		init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); -		free_page((unsigned long)__va(addr)); -		totalram_pages++; -	} +	for (addr = begin; addr < end; addr += PAGE_SIZE) +		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));  }  int crash_shrink_memory(unsigned long new_size) @@ -1368,35 +1364,114 @@ static int __init parse_crashkernel_simple(char 		*cmdline,  	return 0;  } +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW  1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { +	[SUFFIX_HIGH] = ",high", +	[SUFFIX_LOW]  = ",low", +	[SUFFIX_NULL] = NULL, +}; +  /* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. + * That function parses "suffix"  crashkernel command lines like + * + *	crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure.   
*/ +static int __init parse_crashkernel_suffix(char *cmdline, +					   unsigned long long	*crash_size, +					   unsigned long long	*crash_base, +					   const char *suffix) +{ +	char *cur = cmdline; + +	*crash_size = memparse(cmdline, &cur); +	if (cmdline == cur) { +		pr_warn("crashkernel: memory value expected\n"); +		return -EINVAL; +	} + +	/* check with suffix */ +	if (strncmp(cur, suffix, strlen(suffix))) { +		pr_warn("crashkernel: unrecognized char\n"); +		return -EINVAL; +	} +	cur += strlen(suffix); +	if (*cur != ' ' && *cur != '\0') { +		pr_warn("crashkernel: unrecognized char\n"); +		return -EINVAL; +	} + +	return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, +			     const char *name, +			     const char *suffix) +{ +	char *p = cmdline, *ck_cmdline = NULL; + +	/* find crashkernel and use the last one if there are more */ +	p = strstr(p, name); +	while (p) { +		char *end_p = strchr(p, ' '); +		char *q; + +		if (!end_p) +			end_p = p + strlen(p); + +		if (!suffix) { +			int i; + +			/* skip the one with any known suffix */ +			for (i = 0; suffix_tbl[i]; i++) { +				q = end_p - strlen(suffix_tbl[i]); +				if (!strncmp(q, suffix_tbl[i], +					     strlen(suffix_tbl[i]))) +					goto next; +			} +			ck_cmdline = p; +		} else { +			q = end_p - strlen(suffix); +			if (!strncmp(q, suffix, strlen(suffix))) +				ck_cmdline = p; +		} +next: +		p = strstr(p+1, name); +	} + +	if (!ck_cmdline) +		return NULL; + +	return ck_cmdline; +} +  static int __init __parse_crashkernel(char *cmdline,  			     unsigned long long system_ram,  			     unsigned long long *crash_size,  			     unsigned long long *crash_base, -				const char *name) +			     const char *name, +			     const char *suffix)  { -	char 	*p = cmdline, *ck_cmdline = NULL;  	char	*first_colon, *first_space; +	char	*ck_cmdline;  	BUG_ON(!crash_size || !crash_base);  	*crash_size = 0;  	*crash_base = 0; -	/* find crashkernel and use the last one if there are more */ -	p = strstr(p, name); -	while (p) { -		ck_cmdline = p; -		p = strstr(p+1, name); -	} +	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);  	if (!ck_cmdline)  		return -EINVAL;  	ck_cmdline += strlen(name); +	if (suffix) +		return parse_crashkernel_suffix(ck_cmdline, crash_size, +				crash_base, suffix);  	/*  	 * if the commandline contains a ':', then that's the extended  	 * syntax -- if not, it must be the classic syntax @@ -1413,13 +1488,26 @@ static int __init __parse_crashkernel(char *cmdline,  	return 0;  } +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. 
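
parse_crashkernel_suffix() above accepts "crashkernel=<size>,high" / ",low": parse the size, then require the exact suffix followed by a space or the end of the string. A userspace approximation (strtoull() stands in for memparse(), so size letters such as "M" are not handled in this sketch):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static int parse_suffix(const char *arg, const char *suffix,
				unsigned long long *size)
	{
		char *cur;

		*size = strtoull(arg, &cur, 0);
		if (cur == arg)
			return -1;		/* no number at all */
		if (strncmp(cur, suffix, strlen(suffix)))
			return -1;		/* wrong or missing suffix */
		cur += strlen(suffix);
		if (*cur != ' ' && *cur != '\0')
			return -1;		/* trailing garbage */
		return 0;
	}

	int main(void)
	{
		unsigned long long size;

		if (parse_suffix("268435456,high", ",high", &size) == 0)
			printf("crashkernel high area: %llu bytes\n", size);
		return 0;
	}
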
+ */  int __init parse_crashkernel(char *cmdline,  			     unsigned long long system_ram,  			     unsigned long long *crash_size,  			     unsigned long long *crash_base)  {  	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, -					"crashkernel="); +					"crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, +			     unsigned long long system_ram, +			     unsigned long long *crash_size, +			     unsigned long long *crash_base) +{ +	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, +				"crashkernel=", suffix_tbl[SUFFIX_HIGH]);  }  int __init parse_crashkernel_low(char *cmdline, @@ -1428,7 +1516,7 @@ int __init parse_crashkernel_low(char *cmdline,  			     unsigned long long *crash_base)  {  	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, -					"crashkernel_low="); +				"crashkernel=", suffix_tbl[SUFFIX_LOW]);  }  static void update_vmcoreinfo_note(void) @@ -1489,7 +1577,7 @@ static int __init crash_save_vmcoreinfo_init(void)  	VMCOREINFO_SYMBOL(swapper_pg_dir);  #endif  	VMCOREINFO_SYMBOL(_stext); -	VMCOREINFO_SYMBOL(vmlist); +	VMCOREINFO_SYMBOL(vmap_area_list);  #ifndef CONFIG_NEED_MULTIPLE_NODES  	VMCOREINFO_SYMBOL(mem_map); @@ -1527,7 +1615,8 @@ static int __init crash_save_vmcoreinfo_init(void)  	VMCOREINFO_OFFSET(free_area, free_list);  	VMCOREINFO_OFFSET(list_head, next);  	VMCOREINFO_OFFSET(list_head, prev); -	VMCOREINFO_OFFSET(vm_struct, addr); +	VMCOREINFO_OFFSET(vmap_area, va_start); +	VMCOREINFO_OFFSET(vmap_area, list);  	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);  	log_buf_kexec_setup();  	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e35be53f661..3fed7f0cbcd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -794,16 +794,16 @@ out:  }  #ifdef CONFIG_SYSCTL -/* This should be called with kprobe_mutex locked */  static void __kprobes optimize_all_kprobes(void)  {  	struct hlist_head *head;  	struct kprobe *p;  	unsigned int i; +	mutex_lock(&kprobe_mutex);  	/* If optimization is already allowed, just return */  	if (kprobes_allow_optimization) -		return; +		goto out;  	kprobes_allow_optimization = true;  	for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void)  				optimize_kprobe(p);  	}  	printk(KERN_INFO "Kprobes globally optimized\n"); +out: +	mutex_unlock(&kprobe_mutex);  } -/* This should be called with kprobe_mutex locked */  static void __kprobes unoptimize_all_kprobes(void)  {  	struct hlist_head *head;  	struct kprobe *p;  	unsigned int i; +	mutex_lock(&kprobe_mutex);  	/* If optimization is already prohibited, just return */ -	if (!kprobes_allow_optimization) +	if (!kprobes_allow_optimization) { +		mutex_unlock(&kprobe_mutex);  		return; +	}  	kprobes_allow_optimization = false;  	for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void)  				unoptimize_kprobe(p, false);  		}  	} +	mutex_unlock(&kprobe_mutex); +  	/* Wait for unoptimizing completion */  	wait_for_kprobe_optimizer();  	printk(KERN_INFO "Kprobes globally unoptimized\n");  } +static DEFINE_MUTEX(kprobe_sysctl_mutex);  int sysctl_kprobes_optimization;  int proc_kprobes_optimization_handler(struct ctl_table *table, int write,  				      void __user *buffer, size_t *length, @@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,  {  	int ret; -	mutex_lock(&kprobe_mutex); +	
mutex_lock(&kprobe_sysctl_mutex);  	sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,  		optimize_all_kprobes();  	else  		unoptimize_all_kprobes(); -	mutex_unlock(&kprobe_mutex); +	mutex_unlock(&kprobe_sysctl_mutex);  	return ret;  } diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9ba..16d8ddd268b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,8 +52,21 @@ enum KTHREAD_BITS {  	KTHREAD_IS_PARKED,  }; -#define to_kthread(tsk)	\ -	container_of((tsk)->vfork_done, struct kthread, exited) +#define __to_kthread(vfork)	\ +	container_of(vfork, struct kthread, exited) + +static inline struct kthread *to_kthread(struct task_struct *k) +{ +	return __to_kthread(k->vfork_done); +} + +static struct kthread *to_live_kthread(struct task_struct *k) +{ +	struct completion *vfork = ACCESS_ONCE(k->vfork_done); +	if (likely(vfork)) +		return __to_kthread(vfork); +	return NULL; +}  /**   * kthread_should_stop - should this kthread return now? @@ -124,12 +137,12 @@ void *kthread_data(struct task_struct *task)  static void __kthread_parkme(struct kthread *self)  { -	__set_current_state(TASK_INTERRUPTIBLE); +	__set_current_state(TASK_PARKED);  	while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {  		if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))  			complete(&self->parked);  		schedule(); -		__set_current_state(TASK_INTERRUPTIBLE); +		__set_current_state(TASK_PARKED);  	}  	clear_bit(KTHREAD_IS_PARKED, &self->flags);  	__set_current_state(TASK_RUNNING); @@ -256,11 +269,16 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),  }  EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu) +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)  { +	/* Must have done schedule() in kthread() before we set_task_cpu */ +	if (!wait_task_inactive(p, state)) { +		WARN_ON(1); +		return; +	}  	/* It's safe because the task is inactive. */  	do_set_cpus_allowed(p, cpumask_of(cpu)); -	p->flags |= PF_THREAD_BOUND; +	p->flags |= PF_NO_SETAFFINITY;  }  /** @@ -274,12 +292,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)   */  void kthread_bind(struct task_struct *p, unsigned int cpu)  { -	/* Must have done schedule() in kthread() before we set_task_cpu */ -	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { -		WARN_ON(1); -		return; -	} -	__kthread_bind(p, cpu); +	__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);  }  EXPORT_SYMBOL(kthread_bind); @@ -311,17 +324,20 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),  	return p;  } -static struct kthread *task_get_live_kthread(struct task_struct *k) +static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)  { -	struct kthread *kthread; - -	get_task_struct(k); -	kthread = to_kthread(k); -	/* It might have exited */ -	barrier(); -	if (k->vfork_done != NULL) -		return kthread; -	return NULL; +	clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); +	/* +	 * We clear the IS_PARKED bit here as we don't wait +	 * until the task has left the park code. So if we'd +	 * park before that happens we'd see the IS_PARKED bit +	 * which might be about to be cleared. 
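
The kprobes hunks above drop the "must be called with kprobe_mutex held" convention and take the lock inside optimize_all_kprobes()/unoptimize_all_kprobes() instead, with a separate kprobe_sysctl_mutex serializing the proc handler. The shape of that pattern, sketched with pthreads:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
	static bool optimized;

	static void optimize_all(void)
	{
		pthread_mutex_lock(&state_lock);
		if (optimized)
			goto out;		/* already in the requested state */
		optimized = true;
		printf("globally optimized\n");
	out:
		pthread_mutex_unlock(&state_lock);
	}

	int main(void)
	{
		optimize_all();
		optimize_all();			/* second call is a locked no-op */
		return 0;
	}
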
+	 */ +	if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { +		if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) +			__kthread_bind(k, kthread->cpu, TASK_PARKED); +		wake_up_state(k, TASK_PARKED); +	}  }  /** @@ -334,23 +350,10 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)   */  void kthread_unpark(struct task_struct *k)  { -	struct kthread *kthread = task_get_live_kthread(k); +	struct kthread *kthread = to_live_kthread(k); -	if (kthread) { -		clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); -		/* -		 * We clear the IS_PARKED bit here as we don't wait -		 * until the task has left the park code. So if we'd -		 * park before that happens we'd see the IS_PARKED bit -		 * which might be about to be cleared. -		 */ -		if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { -			if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) -				__kthread_bind(k, kthread->cpu); -			wake_up_process(k); -		} -	} -	put_task_struct(k); +	if (kthread) +		__kthread_unpark(k, kthread);  }  /** @@ -367,7 +370,7 @@ void kthread_unpark(struct task_struct *k)   */  int kthread_park(struct task_struct *k)  { -	struct kthread *kthread = task_get_live_kthread(k); +	struct kthread *kthread = to_live_kthread(k);  	int ret = -ENOSYS;  	if (kthread) { @@ -380,7 +383,6 @@ int kthread_park(struct task_struct *k)  		}  		ret = 0;  	} -	put_task_struct(k);  	return ret;  } @@ -401,21 +403,23 @@ int kthread_park(struct task_struct *k)   */  int kthread_stop(struct task_struct *k)  { -	struct kthread *kthread = task_get_live_kthread(k); +	struct kthread *kthread;  	int ret;  	trace_sched_kthread_stop(k); + +	get_task_struct(k); +	kthread = to_live_kthread(k);  	if (kthread) {  		set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); -		clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); +		__kthread_unpark(k, kthread);  		wake_up_process(k);  		wait_for_completion(&kthread->exited);  	}  	ret = k->exit_code; -  	put_task_struct(k); -	trace_sched_kthread_stop_ret(ret); +	trace_sched_kthread_stop_ret(ret);  	return ret;  }  EXPORT_SYMBOL(kthread_stop); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 259db207b5d..6a3bccba7e7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)  unsigned long nr_stack_trace_entries;  static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static void print_lockdep_off(const char *bug_msg) +{ +	printk(KERN_DEBUG "%s\n", bug_msg); +	printk(KERN_DEBUG "turning off the locking correctness validator.\n"); +	printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); +} +  static int save_trace(struct stack_trace *trace)  {  	trace->nr_entries = 0; @@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)  		if (!debug_locks_off_graph_unlock())  			return 0; -		printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); -		printk("turning off the locking correctness validator.\n"); +		print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");  		dump_stack();  		return 0; @@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)  		}  		raw_local_irq_restore(flags); -		printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); -		printk("turning off the locking correctness validator.\n"); +		print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");  		dump_stack();  		return NULL;  	} @@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)  		if (!debug_locks_off_graph_unlock())  			return NULL; -		printk("BUG: MAX_LOCKDEP_ENTRIES too 
low!\n"); -		printk("turning off the locking correctness validator.\n"); +		print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");  		dump_stack();  		return NULL;  	} @@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,  	struct lock_class *class = hlock_class(hlock);  	struct list_head *hash_head = chainhashentry(chain_key);  	struct lock_chain *chain; -	struct held_lock *hlock_curr, *hlock_next; +	struct held_lock *hlock_curr;  	int i, j;  	/* @@ -2048,8 +2052,7 @@ cache_hit:  		if (!debug_locks_off_graph_unlock())  			return 0; -		printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); -		printk("turning off the locking correctness validator.\n"); +		print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");  		dump_stack();  		return 0;  	} @@ -2057,12 +2060,10 @@ cache_hit:  	chain->chain_key = chain_key;  	chain->irq_context = hlock->irq_context;  	/* Find the first held_lock of current chain */ -	hlock_next = hlock;  	for (i = curr->lockdep_depth - 1; i >= 0; i--) {  		hlock_curr = curr->held_locks + i; -		if (hlock_curr->irq_context != hlock_next->irq_context) +		if (hlock_curr->irq_context != hlock->irq_context)  			break; -		hlock_next = hlock;  	}  	i++;  	chain->depth = curr->lockdep_depth + 1 - i; @@ -3190,9 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  #endif  	if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {  		debug_locks_off(); -		printk("BUG: MAX_LOCK_DEPTH too low, depth: %i  max: %lu!\n", +		print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); +		printk(KERN_DEBUG "depth: %i  max: %lu!\n",  		       curr->lockdep_depth, MAX_LOCK_DEPTH); -		printk("turning off the locking correctness validator.\n");  		lockdep_print_held_locks(current);  		debug_show_all_locks(); @@ -4088,7 +4089,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)  }  EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(void) +static void print_held_locks_bug(struct task_struct *curr)  {  	if (!debug_locks_off())  		return; @@ -4097,21 +4098,22 @@ static void print_held_locks_bug(void)  	printk("\n");  	printk("=====================================\n"); -	printk("[ BUG: %s/%d still has locks held! ]\n", -	       current->comm, task_pid_nr(current)); +	printk("[ BUG: lock held at task exit time! ]\n");  	print_kernel_ident();  	printk("-------------------------------------\n"); -	lockdep_print_held_locks(current); +	printk("%s/%d is exiting with locks still held!\n", +		curr->comm, task_pid_nr(curr)); +	lockdep_print_held_locks(curr); +  	printk("\nstack backtrace:\n");  	dump_stack();  } -void debug_check_no_locks_held(void) +void debug_check_no_locks_held(struct task_struct *task)  { -	if (unlikely(current->lockdep_depth > 0)) -		print_held_locks_bug(); +	if (unlikely(task->lockdep_depth > 0)) +		print_held_locks_bug(task);  } -EXPORT_SYMBOL_GPL(debug_check_no_locks_held);  void debug_show_all_locks(void)  { diff --git a/kernel/mutex.c b/kernel/mutex.c index 52f23011b6e..ad53a664f11 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -37,6 +37,12 @@  # include <asm/mutex.h>  #endif +/* + * A negative mutex count indicates that waiters are sleeping waiting for the + * mutex. 
+ */ +#define	MUTEX_SHOW_NO_WAITER(mutex)	(atomic_read(&(mutex)->count) >= 0) +  void  __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)  { @@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)  	spin_lock_init(&lock->wait_lock);  	INIT_LIST_HEAD(&lock->wait_list);  	mutex_clear_owner(lock); +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +	lock->spin_mlock = NULL; +#endif  	debug_mutex_init(lock, name, key);  } @@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)  EXPORT_SYMBOL(mutex_lock);  #endif +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * In order to avoid a stampede of mutex spinners from acquiring the mutex + * more or less simultaneously, the spinners need to acquire a MCS lock + * first before spinning on the owner field. + * + * We don't inline mspin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +struct mspin_node { +	struct mspin_node *next ; +	int		  locked;	/* 1 if lock acquired */ +}; +#define	MLOCK(mutex)	((struct mspin_node **)&((mutex)->spin_mlock)) + +static noinline +void mspin_lock(struct mspin_node **lock, struct mspin_node *node) +{ +	struct mspin_node *prev; + +	/* Init node */ +	node->locked = 0; +	node->next   = NULL; + +	prev = xchg(lock, node); +	if (likely(prev == NULL)) { +		/* Lock acquired */ +		node->locked = 1; +		return; +	} +	ACCESS_ONCE(prev->next) = node; +	smp_wmb(); +	/* Wait until the lock holder passes the lock down */ +	while (!ACCESS_ONCE(node->locked)) +		arch_mutex_cpu_relax(); +} + +static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) +{ +	struct mspin_node *next = ACCESS_ONCE(node->next); + +	if (likely(!next)) { +		/* +		 * Release the lock by setting it to NULL +		 */ +		if (cmpxchg(lock, node, NULL) == node) +			return; +		/* Wait until the next pointer is set */ +		while (!(next = ACCESS_ONCE(node->next))) +			arch_mutex_cpu_relax(); +	} +	ACCESS_ONCE(next->locked) = 1; +	smp_wmb(); +} + +/* + * Mutex spinning code migrated from kernel/sched/core.c + */ + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ +	if (lock->owner != owner) +		return false; + +	/* +	 * Ensure we emit the owner->on_cpu, dereference _after_ checking +	 * lock->owner still matches owner, if that fails, owner might +	 * point to free()d memory, if it still matches, the rcu_read_lock() +	 * ensures the memory stays valid. +	 */ +	barrier(); + +	return owner->on_cpu; +} + +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +static noinline +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ +	rcu_read_lock(); +	while (owner_running(lock, owner)) { +		if (need_resched()) +			break; + +		arch_mutex_cpu_relax(); +	} +	rcu_read_unlock(); + +	/* +	 * We break out the loop above on need_resched() and when the +	 * owner changed, which is a sign for heavy contention. Return +	 * success only when lock->owner is NULL. +	 */ +	return lock->owner == NULL; +} + +/* + * Initial check for entering the mutex spinning loop + */ +static inline int mutex_can_spin_on_owner(struct mutex *lock) +{ +	int retval = 1; + +	rcu_read_lock(); +	if (lock->owner) +		retval = lock->owner->on_cpu; +	rcu_read_unlock(); +	/* +	 * if lock->owner is not set, the mutex owner may have just acquired +	 * it and not set the owner yet or the mutex has been released. 
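
The mspin_lock()/mspin_unlock() pair above is an MCS-style queue lock: each spinner busy-waits on its own node, so only the head of the queue contends on the mutex word itself. A self-contained C11 sketch of the same idea (userspace; no arch_mutex_cpu_relax(), and memory ordering is left at the default seq_cst):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct mcs_node {
		_Atomic(struct mcs_node *)	next;
		atomic_bool			locked;
	};

	static void mcs_lock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
	{
		struct mcs_node *prev;

		atomic_store(&node->next, NULL);
		atomic_store(&node->locked, false);

		prev = atomic_exchange(lock, node);
		if (!prev)
			return;				/* queue was empty: lock acquired */

		atomic_store(&prev->next, node);	/* link behind the predecessor */
		while (!atomic_load(&node->locked))
			;				/* spin on our own node only */
	}

	static void mcs_unlock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
	{
		struct mcs_node *next = atomic_load(&node->next);

		if (!next) {
			struct mcs_node *expected = node;

			/* No known successor: try to swing the tail back to NULL. */
			if (atomic_compare_exchange_strong(lock, &expected,
							   (struct mcs_node *)NULL))
				return;
			/* A successor is mid-enqueue: wait for its next pointer. */
			while (!(next = atomic_load(&node->next)))
				;
		}
		atomic_store(&next->locked, true);	/* hand the lock down */
	}

	int main(void)
	{
		static _Atomic(struct mcs_node *) lock;
		struct mcs_node node;

		mcs_lock(&lock, &node);
		mcs_unlock(&lock, &node);
		return 0;
	}
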
+	 */ +	return retval; +} +#endif +  static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);  /** @@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  	 *  	 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock  	 * to serialize everything. +	 * +	 * The mutex spinners are queued up using MCS lock so that only one +	 * spinner can compete for the mutex. However, if mutex spinning isn't +	 * going to happen, there is no point in going through the lock/unlock +	 * overhead.  	 */ +	if (!mutex_can_spin_on_owner(lock)) +		goto slowpath;  	for (;;) {  		struct task_struct *owner; +		struct mspin_node  node;  		/*  		 * If there's an owner, wait for it to either  		 * release the lock or go to sleep.  		 */ +		mspin_lock(MLOCK(lock), &node);  		owner = ACCESS_ONCE(lock->owner); -		if (owner && !mutex_spin_on_owner(lock, owner)) +		if (owner && !mutex_spin_on_owner(lock, owner)) { +			mspin_unlock(MLOCK(lock), &node);  			break; +		} -		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { +		if ((atomic_read(&lock->count) == 1) && +		    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {  			lock_acquired(&lock->dep_map, ip);  			mutex_set_owner(lock); +			mspin_unlock(MLOCK(lock), &node);  			preempt_enable();  			return 0;  		} +		mspin_unlock(MLOCK(lock), &node);  		/*  		 * When there's no owner, we might have preempted between the @@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		 */  		arch_mutex_cpu_relax();  	} +slowpath:  #endif  	spin_lock_mutex(&lock->wait_lock, flags); @@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  	list_add_tail(&waiter.list, &lock->wait_list);  	waiter.task = task; -	if (atomic_xchg(&lock->count, -1) == 1) +	if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))  		goto done;  	lock_contended(&lock->dep_map, ip); @@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		 * that when we release the lock, we properly wake up the  		 * other waiters:  		 */ -		if (atomic_xchg(&lock->count, -1) == 1) +		if (MUTEX_SHOW_NO_WAITER(lock) && +		   (atomic_xchg(&lock->count, -1) == 1))  			break;  		/* diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c1c3dc1c602..bea15bdf82b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  	int nr;  	int rc;  	struct task_struct *task, *me = current; +	int init_pids = thread_group_leader(me) ? 1 : 2;  	/* Don't allow any more processes into the pid namespace */  	disable_pid_allocation(pid_ns); @@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  	 */  	for (;;) {  		set_current_state(TASK_UNINTERRUPTIBLE); -		if (pid_ns->nr_hashed == 1) +		if (pid_ns->nr_hashed == init_pids)  			break;  		schedule();  	} diff --git a/kernel/printk.c b/kernel/printk.c index 0b31715f335..376914e2869 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -49,13 +49,6 @@  #define CREATE_TRACE_POINTS  #include <trace/events/printk.h> -/* - * Architectures can override it: - */ -void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) -{ -} -  /* printk's without a loglevel use this.. */  #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -63,8 +56,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)  
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */  #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ -DECLARE_WAIT_QUEUE_HEAD(log_wait); -  int console_printk[4] = {  	DEFAULT_CONSOLE_LOGLEVEL,	/* console_loglevel */  	DEFAULT_MESSAGE_LOGLEVEL,	/* default_message_loglevel */ @@ -224,6 +215,7 @@ struct log {  static DEFINE_RAW_SPINLOCK(logbuf_lock);  #ifdef CONFIG_PRINTK +DECLARE_WAIT_QUEUE_HEAD(log_wait);  /* the next printk record to read by syslog(READ) or /proc/kmsg */  static u64 syslog_seq;  static u32 syslog_idx; @@ -609,7 +601,8 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)  		/* return error when data has vanished underneath us */  		if (user->seq < log_first_seq)  			ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; -		ret = POLLIN|POLLRDNORM; +		else +			ret = POLLIN|POLLRDNORM;  	}  	raw_spin_unlock_irq(&logbuf_lock); @@ -1266,7 +1259,7 @@ static void call_console_drivers(int level, const char *text, size_t len)  {  	struct console *con; -	trace_console(text, 0, len, len); +	trace_console(text, len);  	if (level >= console_loglevel && !ignore_loglevel)  		return; @@ -1724,6 +1717,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; }  #endif /* CONFIG_PRINTK */ +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +void early_vprintk(const char *fmt, va_list ap) +{ +	if (early_console) { +		char buf[512]; +		int n = vscnprintf(buf, sizeof(buf), fmt, ap); + +		early_console->write(early_console, buf, n); +	} +} + +asmlinkage void early_printk(const char *fmt, ...) +{ +	va_list ap; + +	va_start(ap, fmt); +	early_vprintk(fmt, ap); +	va_end(ap); +} +#endif +  static int __add_preferred_console(char *name, int idx, char *options,  				   char *brl_options)  { @@ -1957,45 +1973,6 @@ int is_console_locked(void)  	return console_locked;  } -/* - * Delayed printk version, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE		512 - -#define PRINTK_PENDING_WAKEUP	0x01 -#define PRINTK_PENDING_SCHED	0x02 - -static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); - -static void wake_up_klogd_work_func(struct irq_work *irq_work) -{ -	int pending = __this_cpu_xchg(printk_pending, 0); - -	if (pending & PRINTK_PENDING_SCHED) { -		char *buf = __get_cpu_var(printk_sched_buf); -		printk(KERN_WARNING "[sched_delayed] %s", buf); -	} - -	if (pending & PRINTK_PENDING_WAKEUP) -		wake_up_interruptible(&log_wait); -} - -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { -	.func = wake_up_klogd_work_func, -	.flags = IRQ_WORK_LAZY, -}; - -void wake_up_klogd(void) -{ -	preempt_disable(); -	if (waitqueue_active(&log_wait)) { -		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); -		irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); -	} -	preempt_enable(); -} -  static void console_cont_flush(char *text, size_t size)  {  	unsigned long flags; @@ -2458,6 +2435,44 @@ static int __init printk_late_init(void)  late_initcall(printk_late_init);  #if defined CONFIG_PRINTK +/* + * Delayed printk version, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE		512 + +#define PRINTK_PENDING_WAKEUP	0x01 +#define PRINTK_PENDING_SCHED	0x02 + +static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +static void wake_up_klogd_work_func(struct irq_work *irq_work) +{ +	int pending = __this_cpu_xchg(printk_pending, 0); + +	if (pending & PRINTK_PENDING_SCHED) { +		char *buf = 
__get_cpu_var(printk_sched_buf); +		printk(KERN_WARNING "[sched_delayed] %s", buf); +	} + +	if (pending & PRINTK_PENDING_WAKEUP) +		wake_up_interruptible(&log_wait); +} + +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { +	.func = wake_up_klogd_work_func, +	.flags = IRQ_WORK_LAZY, +}; + +void wake_up_klogd(void) +{ +	preempt_disable(); +	if (waitqueue_active(&log_wait)) { +		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); +		irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); +	} +	preempt_enable(); +}  int printk_sched(const char *fmt, ...)  { diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5b8ad827fd8..d8534308fd0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -64,7 +64,7 @@  static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];  static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(sname, cr) { \ +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \  	.level = { &sname##_state.node[0] }, \  	.call = cr, \  	.fqs_state = RCU_GP_IDLE, \ @@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];  	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \  	.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \  	.name = #sname, \ +	.abbr = sabbr, \  }  struct rcu_state rcu_sched_state = -	RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); +	RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);  DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);  DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);  static struct rcu_state *rcu_state; @@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;  module_param(jiffies_till_first_fqs, ulong, 0644);  module_param(jiffies_till_next_fqs, ulong, 0644); +static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, +				  struct rcu_data *rdp);  static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));  static void force_quiescent_state(struct rcu_state *rsp);  static int rcu_pending(int cpu); @@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)  	if (rcu_gp_in_progress(rsp))  		return 0;  /* No, a grace period is already in progress. */ +	if (rcu_nocb_needs_gp(rsp)) +		return 1;  /* Yes, a no-CBs CPU needs one. */  	if (!rdp->nxttail[RCU_NEXT_TAIL])  		return 0;  /* No, this is a no-CBs (or offline) CPU. */  	if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) @@ -1035,10 +1040,11 @@ static void init_callback_list(struct rcu_data *rdp)  {  	int i; +	if (init_nocb_callback_list(rdp)) +		return;  	rdp->nxtlist = NULL;  	for (i = 0; i < RCU_NEXT_SIZE; i++)  		rdp->nxttail[i] = &rdp->nxtlist; -	init_nocb_callback_list(rdp);  }  /* @@ -1071,6 +1077,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,  }  /* + * Trace-event helper function for rcu_start_future_gp() and + * rcu_nocb_wait_gp(). + */ +static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, +				unsigned long c, char *s) +{ +	trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, +				      rnp->completed, c, rnp->level, +				      rnp->grplo, rnp->grphi, s); +} + +/* + * Start some future grace period, as needed to handle newly arrived + * callbacks.  The required future grace periods are recorded in each + * rcu_node structure's ->need_future_gp field. 
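
The wake_up_klogd()/irq_work block that merely moves in the printk hunks above keeps the same deferred-wakeup pattern: producers OR a bit into a pending word, and the work callback drains every bit at once with an exchange. A single-threaded userspace model of that hand-off:

	#include <stdatomic.h>
	#include <stdio.h>

	#define PENDING_WAKEUP	0x01
	#define PENDING_SCHED	0x02

	static atomic_uint pending;

	static void producer(unsigned int bit)
	{
		atomic_fetch_or(&pending, bit);		/* cheap, callable from anywhere */
	}

	static void deferred_work(void)
	{
		unsigned int bits = atomic_exchange(&pending, 0);

		if (bits & PENDING_SCHED)
			printf("flush the sched-delayed message buffer\n");
		if (bits & PENDING_WAKEUP)
			printf("wake up log readers\n");
	}

	int main(void)
	{
		producer(PENDING_WAKEUP);
		producer(PENDING_SCHED);
		deferred_work();
		return 0;
	}
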
+ * + * The caller must hold the specified rcu_node structure's ->lock. + */ +static unsigned long __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +{ +	unsigned long c; +	int i; +	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + +	/* +	 * Pick up grace-period number for new callbacks.  If this +	 * grace period is already marked as needed, return to the caller. +	 */ +	c = rcu_cbs_completed(rdp->rsp, rnp); +	trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); +	if (rnp->need_future_gp[c & 0x1]) { +		trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); +		return c; +	} + +	/* +	 * If either this rcu_node structure or the root rcu_node structure +	 * believe that a grace period is in progress, then we must wait +	 * for the one following, which is in "c".  Because our request +	 * will be noticed at the end of the current grace period, we don't +	 * need to explicitly start one. +	 */ +	if (rnp->gpnum != rnp->completed || +	    ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { +		rnp->need_future_gp[c & 0x1]++; +		trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); +		return c; +	} + +	/* +	 * There might be no grace period in progress.  If we don't already +	 * hold it, acquire the root rcu_node structure's lock in order to +	 * start one (if needed). +	 */ +	if (rnp != rnp_root) +		raw_spin_lock(&rnp_root->lock); + +	/* +	 * Get a new grace-period number.  If there really is no grace +	 * period in progress, it will be smaller than the one we obtained +	 * earlier.  Adjust callbacks as needed.  Note that even no-CBs +	 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. +	 */ +	c = rcu_cbs_completed(rdp->rsp, rnp_root); +	for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) +		if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) +			rdp->nxtcompleted[i] = c; + +	/* +	 * If the needed for the required grace period is already +	 * recorded, trace and leave. +	 */ +	if (rnp_root->need_future_gp[c & 0x1]) { +		trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); +		goto unlock_out; +	} + +	/* Record the need for the future grace period. */ +	rnp_root->need_future_gp[c & 0x1]++; + +	/* If a grace period is not already in progress, start one. */ +	if (rnp_root->gpnum != rnp_root->completed) { +		trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); +	} else { +		trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); +		rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); +	} +unlock_out: +	if (rnp != rnp_root) +		raw_spin_unlock(&rnp_root->lock); +	return c; +} + +/* + * Clean up any old requests for the just-ended grace period.  Also return + * whether any additional grace periods have been requested.  Also invoke + * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads + * waiting for this grace period to complete. + */ +static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ +	int c = rnp->completed; +	int needmore; +	struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + +	rcu_nocb_gp_cleanup(rsp, rnp); +	rnp->need_future_gp[c & 0x1] = 0; +	needmore = rnp->need_future_gp[(c + 1) & 0x1]; +	trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); +	return needmore; +} + +/*   * If there is room, assign a ->completed number to any callbacks on   * this CPU that have not already been assigned.  
Also accelerate any   * callbacks that were previously assigned a ->completed number that has @@ -1129,6 +1249,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,  		rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];  		rdp->nxtcompleted[i] = c;  	} +	/* Record any needed additional grace periods. */ +	rcu_start_future_gp(rnp, rdp);  	/* Trace depending on how much we were able to accelerate. */  	if (!*rdp->nxttail[RCU_WAIT_TAIL]) @@ -1308,9 +1430,9 @@ static int rcu_gp_init(struct rcu_state *rsp)  		rdp = this_cpu_ptr(rsp->rda);  		rcu_preempt_check_blocked_tasks(rnp);  		rnp->qsmask = rnp->qsmaskinit; -		rnp->gpnum = rsp->gpnum; +		ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;  		WARN_ON_ONCE(rnp->completed != rsp->completed); -		rnp->completed = rsp->completed; +		ACCESS_ONCE(rnp->completed) = rsp->completed;  		if (rnp == rdp->mynode)  			rcu_start_gp_per_cpu(rsp, rnp, rdp);  		rcu_preempt_boost_start_gp(rnp); @@ -1319,7 +1441,8 @@ static int rcu_gp_init(struct rcu_state *rsp)  					    rnp->grphi, rnp->qsmask);  		raw_spin_unlock_irq(&rnp->lock);  #ifdef CONFIG_PROVE_RCU_DELAY -		if ((random32() % (rcu_num_nodes * 8)) == 0) +		if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && +		    system_state == SYSTEM_RUNNING)  			schedule_timeout_uninterruptible(2);  #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */  		cond_resched(); @@ -1361,6 +1484,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)  static void rcu_gp_cleanup(struct rcu_state *rsp)  {  	unsigned long gp_duration; +	int nocb = 0;  	struct rcu_data *rdp;  	struct rcu_node *rnp = rcu_get_root(rsp); @@ -1390,17 +1514,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)  	 */  	rcu_for_each_node_breadth_first(rsp, rnp) {  		raw_spin_lock_irq(&rnp->lock); -		rnp->completed = rsp->gpnum; +		ACCESS_ONCE(rnp->completed) = rsp->gpnum; +		rdp = this_cpu_ptr(rsp->rda); +		if (rnp == rdp->mynode) +			__rcu_process_gp_end(rsp, rnp, rdp); +		nocb += rcu_future_gp_cleanup(rsp, rnp);  		raw_spin_unlock_irq(&rnp->lock);  		cond_resched();  	}  	rnp = rcu_get_root(rsp);  	raw_spin_lock_irq(&rnp->lock); +	rcu_nocb_gp_set(rnp, nocb);  	rsp->completed = rsp->gpnum; /* Declare grace period done. */  	trace_rcu_grace_period(rsp->name, rsp->completed, "end");  	rsp->fqs_state = RCU_GP_IDLE;  	rdp = this_cpu_ptr(rsp->rda); +	rcu_advance_cbs(rsp, rnp, rdp);  /* Reduce false positives below. */  	if (cpu_needs_another_gp(rsp, rdp))  		rsp->gp_flags = 1;  	raw_spin_unlock_irq(&rnp->lock); @@ -1476,57 +1606,62 @@ static int __noreturn rcu_gp_kthread(void *arg)  /*   * Start a new RCU grace period if warranted, re-initializing the hierarchy   * in preparation for detecting the next grace period.  The caller must hold - * the root node's ->lock, which is released before return.  Hard irqs must - * be disabled. + * the root node's ->lock and hard irqs must be disabled.   *   * Note that it is legal for a dying CPU (which is marked as offline) to   * invoke this function.  This can happen when the dying CPU reports its   * quiescent state.   
*/  static void -rcu_start_gp(struct rcu_state *rsp, unsigned long flags) -	__releases(rcu_get_root(rsp)->lock) +rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, +		      struct rcu_data *rdp)  { -	struct rcu_data *rdp = this_cpu_ptr(rsp->rda); -	struct rcu_node *rnp = rcu_get_root(rsp); - -	if (!rsp->gp_kthread || -	    !cpu_needs_another_gp(rsp, rdp)) { +	if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {  		/*  		 * Either we have not yet spawned the grace-period  		 * task, this CPU does not need another grace period,  		 * or a grace period is already in progress.  		 * Either way, don't start a new grace period.  		 */ -		raw_spin_unlock_irqrestore(&rnp->lock, flags);  		return;  	} - -	/* -	 * Because there is no grace period in progress right now, -	 * any callbacks we have up to this point will be satisfied -	 * by the next grace period.  So this is a good place to -	 * assign a grace period number to recently posted callbacks. -	 */ -	rcu_accelerate_cbs(rsp, rnp, rdp); -  	rsp->gp_flags = RCU_GP_FLAG_INIT; -	raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ - -	/* Ensure that CPU is aware of completion of last grace period. */ -	rcu_process_gp_end(rsp, rdp); -	local_irq_restore(flags);  	/* Wake up rcu_gp_kthread() to start the grace period. */  	wake_up(&rsp->gp_wq);  }  /* + * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's + * callbacks.  Note that rcu_start_gp_advanced() cannot do this because it + * is invoked indirectly from rcu_advance_cbs(), which would result in + * endless recursion -- or would do so if it wasn't for the self-deadlock + * that is encountered beforehand. + */ +static void +rcu_start_gp(struct rcu_state *rsp) +{ +	struct rcu_data *rdp = this_cpu_ptr(rsp->rda); +	struct rcu_node *rnp = rcu_get_root(rsp); + +	/* +	 * If there is no grace period in progress right now, any +	 * callbacks we have up to this point will be satisfied by the +	 * next grace period.  Also, advancing the callbacks reduces the +	 * probability of false positives from cpu_needs_another_gp() +	 * resulting in pointless grace periods.  So, advance callbacks +	 * then start the grace period! +	 */ +	rcu_advance_cbs(rsp, rnp, rdp); +	rcu_start_gp_advanced(rsp, rnp, rdp); +} + +/*   * Report a full set of quiescent states to the specified rcu_state   * data structure.  This involves cleaning up after the prior grace   * period and letting rcu_start_gp() start up the next grace period - * if one is needed.  Note that the caller must hold rnp->lock, as - * required by rcu_start_gp(), which will release it. + * if one is needed.  Note that the caller must hold rnp->lock, which + * is released before return.   */  static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)  	__releases(rcu_get_root(rsp)->lock) @@ -2124,7 +2259,8 @@ __rcu_process_callbacks(struct rcu_state *rsp)  	local_irq_save(flags);  	if (cpu_needs_another_gp(rsp, rdp)) {  		raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. 
*/ -		rcu_start_gp(rsp, flags);  /* releases above lock */ +		rcu_start_gp(rsp); +		raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);  	} else {  		local_irq_restore(flags);  	} @@ -2169,7 +2305,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)  static void invoke_rcu_core(void)  { -	raise_softirq(RCU_SOFTIRQ); +	if (cpu_online(smp_processor_id())) +		raise_softirq(RCU_SOFTIRQ);  }  /* @@ -2204,11 +2341,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,  		/* Start a new grace period if one not already started. */  		if (!rcu_gp_in_progress(rsp)) { -			unsigned long nestflag;  			struct rcu_node *rnp_root = rcu_get_root(rsp); -			raw_spin_lock_irqsave(&rnp_root->lock, nestflag); -			rcu_start_gp(rsp, nestflag);  /* rlses rnp_root->lock */ +			raw_spin_lock(&rnp_root->lock); +			rcu_start_gp(rsp); +			raw_spin_unlock(&rnp_root->lock);  		} else {  			/* Give the grace period a kick. */  			rdp->blimit = LONG_MAX; @@ -2628,19 +2765,27 @@ static int rcu_pending(int cpu)  }  /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. + * Return true if the specified CPU has any callback.  If all_lazy is + * non-NULL, store an indication of whether all callbacks are lazy. + * (If there are no callbacks, all of them are deemed to be lazy.)   */ -static int rcu_cpu_has_callbacks(int cpu) +static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)  { +	bool al = true; +	bool hc = false; +	struct rcu_data *rdp;  	struct rcu_state *rsp; -	/* RCU callbacks either ready or pending? */ -	for_each_rcu_flavor(rsp) -		if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) -			return 1; -	return 0; +	for_each_rcu_flavor(rsp) { +		rdp = per_cpu_ptr(rsp->rda, cpu); +		if (rdp->qlen != rdp->qlen_lazy) +			al = false; +		if (rdp->nxtlist) +			hc = true; +	} +	if (all_lazy) +		*all_lazy = al; +	return hc;  }  /* @@ -2859,7 +3004,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  	rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;  	atomic_set(&rdp->dynticks->dynticks,  		   (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); -	rcu_prepare_for_idle_init(cpu);  	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */  	/* Add CPU to rcu_node bitmasks. */ @@ -2909,7 +3053,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,  	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);  	struct rcu_node *rnp = rdp->mynode;  	struct rcu_state *rsp; -	int ret = NOTIFY_OK;  	trace_rcu_utilization("Start CPU hotplug");  	switch (action) { @@ -2923,21 +3066,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,  		rcu_boost_kthread_setaffinity(rnp, -1);  		break;  	case CPU_DOWN_PREPARE: -		if (nocb_cpu_expendable(cpu)) -			rcu_boost_kthread_setaffinity(rnp, cpu); -		else -			ret = NOTIFY_BAD; +		rcu_boost_kthread_setaffinity(rnp, cpu);  		break;  	case CPU_DYING:  	case CPU_DYING_FROZEN: -		/* -		 * The whole machine is "stopped" except this CPU, so we can -		 * touch any data without introducing corruption. We send the -		 * dying CPU's callbacks to an arbitrarily chosen online CPU. 
-		 */  		for_each_rcu_flavor(rsp)  			rcu_cleanup_dying_cpu(rsp); -		rcu_cleanup_after_idle(cpu);  		break;  	case CPU_DEAD:  	case CPU_DEAD_FROZEN: @@ -2950,7 +3084,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,  		break;  	}  	trace_rcu_utilization("End CPU hotplug"); -	return ret; +	return NOTIFY_OK;  }  /* @@ -3085,6 +3219,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,  			}  			rnp->level = i;  			INIT_LIST_HEAD(&rnp->blkd_tasks); +			rcu_init_one_nocb(rnp);  		}  	} @@ -3170,8 +3305,7 @@ void __init rcu_init(void)  	rcu_init_one(&rcu_sched_state, &rcu_sched_data);  	rcu_init_one(&rcu_bh_state, &rcu_bh_data);  	__rcu_init_preempt(); -	rcu_init_nocb(); -	 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);  	/*  	 * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c896b5045d9..14ee40795d6 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,18 +88,13 @@ struct rcu_dynticks {  	int dynticks_nmi_nesting;   /* Track NMI nesting level. */  	atomic_t dynticks;	    /* Even value for idle, else odd. */  #ifdef CONFIG_RCU_FAST_NO_HZ -	int dyntick_drain;	    /* Prepare-for-idle state variable. */ -	unsigned long dyntick_holdoff; -				    /* No retries for the jiffy of failure. */ -	struct timer_list idle_gp_timer; -				    /* Wake up CPU sleeping with callbacks. */ -	unsigned long idle_gp_timer_expires; -				    /* When to wake up CPU (for repost). */ -	bool idle_first_pass;	    /* First pass of attempt to go idle? */ +	bool all_lazy;		    /* Are all CPU's CBs lazy? */  	unsigned long nonlazy_posted;  				    /* # times non-lazy CBs posted to CPU. */  	unsigned long nonlazy_posted_snap;  				    /* idle-period nonlazy_posted snapshot. */ +	unsigned long last_accelerate; +				    /* Last jiffy CBs were accelerated. */  	int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */  #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */  }; @@ -134,9 +129,6 @@ struct rcu_node {  				/*  elements that need to drain to allow the */  				/*  current expedited grace period to */  				/*  complete (only for TREE_PREEMPT_RCU). */ -	atomic_t wakemask;	/* CPUs whose kthread needs to be awakened. */ -				/*  Since this has meaning only for leaf */ -				/*  rcu_node structures, 32 bits suffices. */  	unsigned long qsmaskinit;  				/* Per-GP initial value for qsmask & expmask. */  	unsigned long grpmask;	/* Mask to apply to parent qsmask. */ @@ -196,6 +188,12 @@ struct rcu_node {  				/* Refused to boost: not sure why, though. */  				/*  This can happen due to race conditions. */  #endif /* #ifdef CONFIG_RCU_BOOST */ +#ifdef CONFIG_RCU_NOCB_CPU +	wait_queue_head_t nocb_gp_wq[2]; +				/* Place for rcu_nocb_kthread() to wait GP. */ +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ +	int need_future_gp[2]; +				/* Counts of upcoming no-CB GP requests. */  	raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;  } ____cacheline_internodealigned_in_smp; @@ -328,6 +326,11 @@ struct rcu_data {  	struct task_struct *nocb_kthread;  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ +	/* 8) RCU CPU stall data. */ +#ifdef CONFIG_RCU_CPU_STALL_INFO +	unsigned int softirq_snap;	/* Snapshot of softirq activity. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ +  	int cpu;  	struct rcu_state *rsp;  }; @@ -375,12 +378,6 @@ struct rcu_state {  	struct rcu_data __percpu *rda;		/* pointer of percu rcu_data. */  	void (*call)(struct rcu_head *head,	/* call_rcu() flavor. 
*/  		     void (*func)(struct rcu_head *head)); -#ifdef CONFIG_RCU_NOCB_CPU -	void (*call_remote)(struct rcu_head *head, -		     void (*func)(struct rcu_head *head)); -						/* call_rcu() flavor, but for */ -						/*  placing on remote CPU. */ -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */  	/* The following fields are guarded by the root rcu_node's lock. */ @@ -443,6 +440,7 @@ struct rcu_state {  	unsigned long gp_max;			/* Maximum GP duration in */  						/*  jiffies. */  	char *name;				/* Name of structure. */ +	char abbr;				/* Abbreviated name. */  	struct list_head flavors;		/* List of RCU flavors. */  }; @@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,  						 struct rcu_node *rnp);  #endif /* #ifdef CONFIG_RCU_BOOST */  static void __cpuinit rcu_prepare_kthreads(int cpu); -static void rcu_prepare_for_idle_init(int cpu);  static void rcu_cleanup_after_idle(int cpu);  static void rcu_prepare_for_idle(int cpu);  static void rcu_idle_count_callbacks_posted(void); @@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);  static void print_cpu_stall_info_end(void);  static void zero_cpu_stall_ticks(struct rcu_data *rdp);  static void increment_cpu_stall_ticks(void); +static int rcu_nocb_needs_gp(struct rcu_state *rsp); +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static void rcu_init_one_nocb(struct rcu_node *rnp);  static bool is_nocb_cpu(int cpu);  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,  			    bool lazy);  static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,  				      struct rcu_data *rdp); -static bool nocb_cpu_expendable(int cpu);  static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);  static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); -static void init_nocb_callback_list(struct rcu_data *rdp); -static void __init rcu_init_nocb(void); +static bool init_nocb_callback_list(struct rcu_data *rdp);  #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1cc7e17ff9..d084ae3f281 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -85,11 +85,21 @@ static void __init rcu_bootup_announce_oddness(void)  	if (nr_cpu_ids != NR_CPUS)  		printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);  #ifdef CONFIG_RCU_NOCB_CPU +#ifndef CONFIG_RCU_NOCB_CPU_NONE +	if (!have_rcu_nocb_mask) { +		alloc_bootmem_cpumask_var(&rcu_nocb_mask); +		have_rcu_nocb_mask = true; +	} +#ifdef CONFIG_RCU_NOCB_CPU_ZERO +	pr_info("\tExperimental no-CBs CPU 0\n"); +	cpumask_set_cpu(0, rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ +#ifdef CONFIG_RCU_NOCB_CPU_ALL +	pr_info("\tExperimental no-CBs for all CPUs\n"); +	cpumask_setall(rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */  	if (have_rcu_nocb_mask) { -		if (cpumask_test_cpu(0, rcu_nocb_mask)) { -			cpumask_clear_cpu(0, rcu_nocb_mask); -			pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); -		}  		cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);  		pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);  		if (rcu_nocb_poll) @@ -101,7 +111,7 @@ static void __init rcu_bootup_announce_oddness(void)  #ifdef CONFIG_TREE_PREEMPT_RCU  struct rcu_state rcu_preempt_state = -	RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); +	
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);  DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);  static struct rcu_state *rcu_state = &rcu_preempt_state; @@ -1533,14 +1543,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)  int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)  {  	*delta_jiffies = ULONG_MAX; -	return rcu_cpu_has_callbacks(cpu); -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ +	return rcu_cpu_has_callbacks(cpu, NULL);  }  /* @@ -1577,16 +1580,6 @@ static void rcu_idle_count_callbacks_posted(void)   *   * The following three proprocessor symbols control this state machine:   * - * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt - *	to satisfy RCU.  Beyond this point, it is better to incur a periodic - *	scheduling-clock interrupt than to loop through the state machine - *	at full power. - * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are - *	optional if RCU does not need anything immediately from this - *	CPU, even if this CPU still has RCU callbacks queued.  The first - *	times through the state machine are mandatory: we need to give - *	the state machine a chance to communicate a quiescent state - *	to the RCU core.   * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted   *	to sleep in dyntick-idle mode with RCU callbacks pending.  This   *	is sized to be roughly one RCU grace period.  Those energy-efficiency @@ -1602,186 +1595,108 @@ static void rcu_idle_count_callbacks_posted(void)   * adjustment, they can be converted into kernel config parameters, though   * making the state machine smarter might be a better option.   */ -#define RCU_IDLE_FLUSHES 5		/* Number of dyntick-idle tries. */ -#define RCU_IDLE_OPT_FLUSHES 3		/* Optional dyntick-idle tries. */  #define RCU_IDLE_GP_DELAY 4		/* Roughly one grace period. */  #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */ -extern int tick_nohz_enabled; - -/* - * Does the specified flavor of RCU have non-lazy callbacks pending on - * the specified CPU?  Both RCU flavor and CPU are specified by the - * rcu_data structure. - */ -static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) -{ -	return rdp->qlen != rdp->qlen_lazy; -} +static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; +module_param(rcu_idle_gp_delay, int, 0644); +static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; +module_param(rcu_idle_lazy_gp_delay, int, 0644); -#ifdef CONFIG_TREE_PREEMPT_RCU +extern int tick_nohz_enabled;  /* - * Are there non-lazy RCU-preempt callbacks?  (There cannot be if there - * is no RCU-preempt in the kernel.) + * Try to advance callbacks for all flavors of RCU on the current CPU. + * Afterwards, if there are any callbacks ready for immediate invocation, + * return true.   
*/ -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +static bool rcu_try_advance_all_cbs(void)  { -	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - -	return __rcu_cpu_has_nonlazy_callbacks(rdp); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +	bool cbs_ready = false; +	struct rcu_data *rdp; +	struct rcu_node *rnp; +	struct rcu_state *rsp; -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) -{ -	return 0; -} +	for_each_rcu_flavor(rsp) { +		rdp = this_cpu_ptr(rsp->rda); +		rnp = rdp->mynode; -#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ +		/* +		 * Don't bother checking unless a grace period has +		 * completed since we last checked and there are +		 * callbacks not yet ready to invoke. +		 */ +		if (rdp->completed != rnp->completed && +		    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) +			rcu_process_gp_end(rsp, rdp); -/* - * Does any flavor of RCU have non-lazy callbacks on the specified CPU? - */ -static bool rcu_cpu_has_nonlazy_callbacks(int cpu) -{ -	return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || -	       __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || -	       rcu_preempt_cpu_has_nonlazy_callbacks(cpu); +		if (cpu_has_callbacks_ready_to_invoke(rdp)) +			cbs_ready = true; +	} +	return cbs_ready;  }  /* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode.  Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it.  After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! + * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready + * to invoke.  If the CPU has callbacks, try to advance them.  Tell the + * caller to set the timeout based on whether or not there are non-lazy + * callbacks.   * - * The delta_jiffies argument is used to store the time when RCU is - * going to need the CPU again if it still has callbacks.  The reason - * for this is that rcu_prepare_for_idle() might need to post a timer, - * but if so, it will do so after tick_nohz_stop_sched_tick() has set - * the wakeup time for this CPU.  This means that RCU's timer can be - * delayed until the wakeup time, which defeats the purpose of posting - * a timer. + * The caller must have disabled interrupts.   */ -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +int rcu_needs_cpu(int cpu, unsigned long *dj)  {  	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); -	/* Flag a new idle sojourn to the idle-entry state machine. */ -	rdtp->idle_first_pass = 1; +	/* Snapshot to detect later posting of non-lazy callback. */ +	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; +  	/* If no callbacks, RCU doesn't need the CPU. */ -	if (!rcu_cpu_has_callbacks(cpu)) { -		*delta_jiffies = ULONG_MAX; +	if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { +		*dj = ULONG_MAX;  		return 0;  	} -	if (rdtp->dyntick_holdoff == jiffies) { -		/* RCU recently tried and failed, so don't try again. */ -		*delta_jiffies = 1; + +	/* Attempt to advance callbacks. */ +	if (rcu_try_advance_all_cbs()) { +		/* Some ready to invoke, so initiate later invocation. */ +		invoke_rcu_core();  		return 1;  	} -	/* Set up for the possibility that RCU will post a timer. 
*/ -	if (rcu_cpu_has_nonlazy_callbacks(cpu)) { -		*delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, -					  RCU_IDLE_GP_DELAY) - jiffies; +	rdtp->last_accelerate = jiffies; + +	/* Request timer delay depending on laziness, and round. */ +	if (rdtp->all_lazy) { +		*dj = round_up(rcu_idle_gp_delay + jiffies, +			       rcu_idle_gp_delay) - jiffies;  	} else { -		*delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; -		*delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; +		*dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;  	}  	return 0;  }  /* - * Handler for smp_call_function_single().  The only point of this - * handler is to wake the CPU up, so the handler does only tracing. - */ -void rcu_idle_demigrate(void *unused) -{ -	trace_rcu_prep_idle("Demigrate"); -} - -/* - * Timer handler used to force CPU to start pushing its remaining RCU - * callbacks in the case where it entered dyntick-idle mode with callbacks - * pending.  The hander doesn't really need to do anything because the - * real work is done upon re-entry to idle, or by the next scheduling-clock - * interrupt should idle not be re-entered. - * - * One special case: the timer gets migrated without awakening the CPU - * on which the timer was scheduled on.  In this case, we must wake up - * that CPU.  We do so with smp_call_function_single(). - */ -static void rcu_idle_gp_timer_func(unsigned long cpu_in) -{ -	int cpu = (int)cpu_in; - -	trace_rcu_prep_idle("Timer"); -	if (cpu != smp_processor_id()) -		smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); -	else -		WARN_ON_ONCE(1); /* Getting here can hang the system... */ -} - -/* - * Initialize the timer used to pull CPUs out of dyntick-idle mode. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ -	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - -	rdtp->dyntick_holdoff = jiffies - 1; -	setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); -	rdtp->idle_gp_timer_expires = jiffies - 1; -	rdtp->idle_first_pass = 1; -} - -/* - * Clean up for exit from idle.  Because we are exiting from idle, there - * is no longer any point to ->idle_gp_timer, so cancel it.  This will - * do nothing if this timer is not active, so just cancel it unconditionally. - */ -static void rcu_cleanup_after_idle(int cpu) -{ -	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - -	del_timer(&rdtp->idle_gp_timer); -	trace_rcu_prep_idle("Cleanup after idle"); -	rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); -} - -/* - * Check to see if any RCU-related work can be done by the current CPU, - * and if so, schedule a softirq to get it done.  This function is part - * of the RCU implementation; it is -not- an exported member of the RCU API. - * - * The idea is for the current CPU to clear out all work required by the - * RCU core for the current grace period, so that this CPU can be permitted - * to enter dyntick-idle mode.  In some cases, it will need to be awakened - * at the end of the grace period by whatever CPU ends the grace period. - * This allows CPUs to go dyntick-idle more quickly, and to reduce the - * number of wakeups by a modest integer factor. - * - * Because it is not legal to invoke rcu_process_callbacks() with irqs - * disabled, we do one pass of force_quiescent_state(), then do a - * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later.  The ->dyntick_drain field controls the sequencing. + * Prepare a CPU for idle from an RCU perspective.  
The first major task + * is to sense whether nohz mode has been enabled or disabled via sysfs. + * The second major task is to check to see if a non-lazy callback has + * arrived at a CPU that previously had only lazy callbacks.  The third + * major task is to accelerate (that is, assign grace-period numbers to) + * any recently arrived callbacks.   *   * The caller must have disabled interrupts.   */  static void rcu_prepare_for_idle(int cpu)  { -	struct timer_list *tp; +	struct rcu_data *rdp;  	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); +	struct rcu_node *rnp; +	struct rcu_state *rsp;  	int tne;  	/* Handle nohz enablement switches conservatively. */  	tne = ACCESS_ONCE(tick_nohz_enabled);  	if (tne != rdtp->tick_nohz_enabled_snap) { -		if (rcu_cpu_has_callbacks(cpu)) +		if (rcu_cpu_has_callbacks(cpu, NULL))  			invoke_rcu_core(); /* force nohz to see update. */  		rdtp->tick_nohz_enabled_snap = tne;  		return; @@ -1789,125 +1704,56 @@ static void rcu_prepare_for_idle(int cpu)  	if (!tne)  		return; -	/* Adaptive-tick mode, where usermode execution is idle to RCU. */ -	if (!is_idle_task(current)) { -		rdtp->dyntick_holdoff = jiffies - 1; -		if (rcu_cpu_has_nonlazy_callbacks(cpu)) { -			trace_rcu_prep_idle("User dyntick with callbacks"); -			rdtp->idle_gp_timer_expires = -				round_up(jiffies + RCU_IDLE_GP_DELAY, -					 RCU_IDLE_GP_DELAY); -		} else if (rcu_cpu_has_callbacks(cpu)) { -			rdtp->idle_gp_timer_expires = -				round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); -			trace_rcu_prep_idle("User dyntick with lazy callbacks"); -		} else { -			return; -		} -		tp = &rdtp->idle_gp_timer; -		mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); +	/* If this is a no-CBs CPU, no callbacks, just return. */ +	if (is_nocb_cpu(cpu))  		return; -	}  	/* -	 * If this is an idle re-entry, for example, due to use of -	 * RCU_NONIDLE() or the new idle-loop tracing API within the idle -	 * loop, then don't take any state-machine actions, unless the -	 * momentary exit from idle queued additional non-lazy callbacks. -	 * Instead, repost the ->idle_gp_timer if this CPU has callbacks -	 * pending. +	 * If a non-lazy callback arrived at a CPU having only lazy +	 * callbacks, invoke RCU core for the side-effect of recalculating +	 * idle duration on re-entry to idle.  	 */ -	if (!rdtp->idle_first_pass && -	    (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { -		if (rcu_cpu_has_callbacks(cpu)) { -			tp = &rdtp->idle_gp_timer; -			mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); -		} +	if (rdtp->all_lazy && +	    rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { +		invoke_rcu_core();  		return;  	} -	rdtp->idle_first_pass = 0; -	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;  	/* -	 * If there are no callbacks on this CPU, enter dyntick-idle mode. -	 * Also reset state to avoid prejudicing later attempts. +	 * If we have not yet accelerated this jiffy, accelerate all +	 * callbacks on this CPU.  	 */ -	if (!rcu_cpu_has_callbacks(cpu)) { -		rdtp->dyntick_holdoff = jiffies - 1; -		rdtp->dyntick_drain = 0; -		trace_rcu_prep_idle("No callbacks"); +	if (rdtp->last_accelerate == jiffies)  		return; +	rdtp->last_accelerate = jiffies; +	for_each_rcu_flavor(rsp) { +		rdp = per_cpu_ptr(rsp->rda, cpu); +		if (!*rdp->nxttail[RCU_DONE_TAIL]) +			continue; +		rnp = rdp->mynode; +		raw_spin_lock(&rnp->lock); /* irqs already disabled. */ +		rcu_accelerate_cbs(rsp, rnp, rdp); +		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */  	} +} -	/* -	 * If in holdoff mode, just return.  
We will presumably have -	 * refrained from disabling the scheduling-clock tick. -	 */ -	if (rdtp->dyntick_holdoff == jiffies) { -		trace_rcu_prep_idle("In holdoff"); -		return; -	} +/* + * Clean up for exit from idle.  Attempt to advance callbacks based on + * any grace periods that elapsed while the CPU was idle, and if any + * callbacks are now ready to invoke, initiate invocation. + */ +static void rcu_cleanup_after_idle(int cpu) +{ +	struct rcu_data *rdp; +	struct rcu_state *rsp; -	/* Check and update the ->dyntick_drain sequencing. */ -	if (rdtp->dyntick_drain <= 0) { -		/* First time through, initialize the counter. */ -		rdtp->dyntick_drain = RCU_IDLE_FLUSHES; -	} else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && -		   !rcu_pending(cpu) && -		   !local_softirq_pending()) { -		/* Can we go dyntick-idle despite still having callbacks? */ -		rdtp->dyntick_drain = 0; -		rdtp->dyntick_holdoff = jiffies; -		if (rcu_cpu_has_nonlazy_callbacks(cpu)) { -			trace_rcu_prep_idle("Dyntick with callbacks"); -			rdtp->idle_gp_timer_expires = -				round_up(jiffies + RCU_IDLE_GP_DELAY, -					 RCU_IDLE_GP_DELAY); -		} else { -			rdtp->idle_gp_timer_expires = -				round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); -			trace_rcu_prep_idle("Dyntick with lazy callbacks"); -		} -		tp = &rdtp->idle_gp_timer; -		mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); -		rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; -		return; /* Nothing more to do immediately. */ -	} else if (--(rdtp->dyntick_drain) <= 0) { -		/* We have hit the limit, so time to give up. */ -		rdtp->dyntick_holdoff = jiffies; -		trace_rcu_prep_idle("Begin holdoff"); -		invoke_rcu_core();  /* Force the CPU out of dyntick-idle. */ +	if (is_nocb_cpu(cpu))  		return; -	} - -	/* -	 * Do one step of pushing the remaining RCU callbacks through -	 * the RCU core state machine. -	 */ -#ifdef CONFIG_TREE_PREEMPT_RCU -	if (per_cpu(rcu_preempt_data, cpu).nxtlist) { -		rcu_preempt_qs(cpu); -		force_quiescent_state(&rcu_preempt_state); -	} -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -	if (per_cpu(rcu_sched_data, cpu).nxtlist) { -		rcu_sched_qs(cpu); -		force_quiescent_state(&rcu_sched_state); -	} -	if (per_cpu(rcu_bh_data, cpu).nxtlist) { -		rcu_bh_qs(cpu); -		force_quiescent_state(&rcu_bh_state); -	} - -	/* -	 * If RCU callbacks are still pending, RCU still needs this CPU. -	 * So try forcing the callbacks through the grace period. -	 */ -	if (rcu_cpu_has_callbacks(cpu)) { -		trace_rcu_prep_idle("More callbacks"); -		invoke_rcu_core(); -	} else { -		trace_rcu_prep_idle("Callbacks drained"); +	rcu_try_advance_all_cbs(); +	for_each_rcu_flavor(rsp) { +		rdp = per_cpu_ptr(rsp->rda, cpu); +		if (cpu_has_callbacks_ready_to_invoke(rdp)) +			invoke_rcu_core();  	}  } @@ -2015,16 +1861,13 @@ early_initcall(rcu_register_oom_notifier);  static void print_cpu_stall_fast_no_hz(char *cp, int cpu)  {  	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); -	struct timer_list *tltp = &rdtp->idle_gp_timer; -	char c; +	unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; -	c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; -	if (timer_pending(tltp)) -		sprintf(cp, "drain=%d %c timer=%lu", -			rdtp->dyntick_drain, c, tltp->expires - jiffies); -	else -		sprintf(cp, "drain=%d %c timer not pending", -			rdtp->dyntick_drain, c); +	sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", +		rdtp->last_accelerate & 0xffff, jiffies & 0xffff, +		ulong2long(nlpd), +		rdtp->all_lazy ? 'L' : '.', +		rdtp->tick_nohz_enabled_snap ? '.' 
: 'D');  }  #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -2070,10 +1913,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)  		ticks_value = rsp->gpnum - rdp->gpnum;  	}  	print_cpu_stall_fast_no_hz(fast_no_hz, cpu); -	printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", +	printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",  	       cpu, ticks_value, ticks_title,  	       atomic_read(&rdtp->dynticks) & 0xfff,  	       rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, +	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),  	       fast_no_hz);  } @@ -2087,6 +1931,7 @@ static void print_cpu_stall_info_end(void)  static void zero_cpu_stall_ticks(struct rcu_data *rdp)  {  	rdp->ticks_this_gp = 0; +	rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());  }  /* Increment ->ticks_this_gp for all flavors of RCU. */ @@ -2165,6 +2010,47 @@ static int __init parse_rcu_nocb_poll(char *arg)  }  early_param("rcu_nocb_poll", parse_rcu_nocb_poll); +/* + * Do any no-CBs CPUs need another grace period? + * + * Interrupts must be disabled.  If the caller does not hold the root + * rnp_node structure's ->lock, the results are advisory only. + */ +static int rcu_nocb_needs_gp(struct rcu_state *rsp) +{ +	struct rcu_node *rnp = rcu_get_root(rsp); + +	return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; +} + +/* + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended + * grace period. + */ +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ +	wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); +} + +/* + * Set the root rcu_node structure's ->need_future_gp field + * based on the sum of those of all rcu_node structures.  This does + * double-count the root rcu_node structure's requests, but this + * is necessary to handle the possibility of a rcu_nocb_kthread() + * having awakened during the time that the rcu_node structures + * were being updated for the end of the previous grace period. + */ +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ +	rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ +	init_waitqueue_head(&rnp->nocb_gp_wq[0]); +	init_waitqueue_head(&rnp->nocb_gp_wq[1]); +} +  /* Is the specified CPU a no-CPUs CPU? */  static bool is_nocb_cpu(int cpu)  { @@ -2227,6 +2113,13 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,  	if (!is_nocb_cpu(rdp->cpu))  		return 0;  	__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); +	if (__is_kfree_rcu_offset((unsigned long)rhp->func)) +		trace_rcu_kfree_callback(rdp->rsp->name, rhp, +					 (unsigned long)rhp->func, +					 rdp->qlen_lazy, rdp->qlen); +	else +		trace_rcu_callback(rdp->rsp->name, rhp, +				   rdp->qlen_lazy, rdp->qlen);  	return 1;  } @@ -2265,95 +2158,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,  }  /* - * There must be at least one non-no-CBs CPU in operation at any given - * time, because no-CBs CPUs are not capable of initiating grace periods - * independently.  This function therefore complains if the specified - * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to - * avoid offlining the last such CPU.  (Recursion is a wonderful thing, - * but you have to have a base case!) + * If necessary, kick off a new grace period, and either way wait + * for a subsequent grace period to complete.   
*/ -static bool nocb_cpu_expendable(int cpu) +static void rcu_nocb_wait_gp(struct rcu_data *rdp)  { -	cpumask_var_t non_nocb_cpus; -	int ret; +	unsigned long c; +	bool d; +	unsigned long flags; +	struct rcu_node *rnp = rdp->mynode; + +	raw_spin_lock_irqsave(&rnp->lock, flags); +	c = rcu_start_future_gp(rnp, rdp); +	raw_spin_unlock_irqrestore(&rnp->lock, flags);  	/* -	 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, -	 * then offlining this CPU is harmless.  Let it happen. +	 * Wait for the grace period.  Do so interruptibly to avoid messing +	 * up the load average.  	 */ -	if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) -		return 1; - -	/* If no memory, play it safe and keep the CPU around. */ -	if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) -		return 0; -	cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); -	cpumask_clear_cpu(cpu, non_nocb_cpus); -	ret = !cpumask_empty(non_nocb_cpus); -	free_cpumask_var(non_nocb_cpus); -	return ret; -} - -/* - * Helper structure for remote registry of RCU callbacks. - * This is needed for when a no-CBs CPU needs to start a grace period. - * If it just invokes call_rcu(), the resulting callback will be queued, - * which can result in deadlock. - */ -struct rcu_head_remote { -	struct rcu_head *rhp; -	call_rcu_func_t *crf; -	void (*func)(struct rcu_head *rhp); -}; - -/* - * Register a callback as specified by the rcu_head_remote struct. - * This function is intended to be invoked via smp_call_function_single(). - */ -static void call_rcu_local(void *arg) -{ -	struct rcu_head_remote *rhrp = -		container_of(arg, struct rcu_head_remote, rhp); - -	rhrp->crf(rhrp->rhp, rhrp->func); -} - -/* - * Set up an rcu_head_remote structure and the invoke call_rcu_local() - * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via - * smp_call_function_single(). - */ -static void invoke_crf_remote(struct rcu_head *rhp, -			      void (*func)(struct rcu_head *rhp), -			      call_rcu_func_t crf) -{ -	struct rcu_head_remote rhr; - -	rhr.rhp = rhp; -	rhr.crf = crf; -	rhr.func = func; -	smp_call_function_single(0, call_rcu_local, &rhr, 1); -} - -/* - * Helper functions to be passed to wait_rcu_gp(), each of which - * invokes invoke_crf_remote() to register a callback appropriately. - */ -static void __maybe_unused -call_rcu_preempt_remote(struct rcu_head *rhp, -			void (*func)(struct rcu_head *rhp)) -{ -	invoke_crf_remote(rhp, func, call_rcu); -} -static void call_rcu_bh_remote(struct rcu_head *rhp, -			       void (*func)(struct rcu_head *rhp)) -{ -	invoke_crf_remote(rhp, func, call_rcu_bh); -} -static void call_rcu_sched_remote(struct rcu_head *rhp, -				  void (*func)(struct rcu_head *rhp)) -{ -	invoke_crf_remote(rhp, func, call_rcu_sched); +	trace_rcu_future_gp(rnp, rdp, c, "StartWait"); +	for (;;) { +		wait_event_interruptible( +			rnp->nocb_gp_wq[c & 0x1], +			(d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); +		if (likely(d)) +			break; +		flush_signals(current); +		trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); +	} +	trace_rcu_future_gp(rnp, rdp, c, "EndWait"); +	smp_mb(); /* Ensure that CB invocation happens after GP end. */  }  /* @@ -2390,7 +2224,7 @@ static int rcu_nocb_kthread(void *arg)  		cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);  		ACCESS_ONCE(rdp->nocb_p_count) += c;  		ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; -		wait_rcu_gp(rdp->rsp->call_remote); +		rcu_nocb_wait_gp(rdp);  		/* Each pass through the following loop invokes a callback. 
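[Editor's note] The wait loop in rcu_nocb_wait_gp() just above compares free-running unsigned grace-period counters with ULONG_CMP_GE(), so the test keeps working when the counters wrap around. A tiny standalone sketch of a wraparound-safe check in the same spirit (illustration only, not the kernel macro):

#include <limits.h>
#include <stdio.h>

/* True once 'completed' has caught up with (or passed) target 'c',
 * interpreting the difference modulo ULONG_MAX + 1. */
static int gp_done(unsigned long completed, unsigned long c)
{
	return completed - c <= ULONG_MAX / 2;
}

int main(void)
{
	printf("%d\n", gp_done(5, 7));         /* 0: grace period 7 not reached yet  */
	printf("%d\n", gp_done(7, 7));         /* 1: reached                         */
	printf("%d\n", gp_done(1, ULONG_MAX)); /* 1: counter wrapped past the target */
	return 0;
}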
*/  		trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); @@ -2436,32 +2270,41 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)  		return;  	for_each_cpu(cpu, rcu_nocb_mask) {  		rdp = per_cpu_ptr(rsp->rda, cpu); -		t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); +		t = kthread_run(rcu_nocb_kthread, rdp, +				"rcuo%c/%d", rsp->abbr, cpu);  		BUG_ON(IS_ERR(t));  		ACCESS_ONCE(rdp->nocb_kthread) = t;  	}  }  /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ -static void init_nocb_callback_list(struct rcu_data *rdp) +static bool init_nocb_callback_list(struct rcu_data *rdp)  {  	if (rcu_nocb_mask == NULL ||  	    !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) -		return; +		return false;  	rdp->nxttail[RCU_NEXT_TAIL] = NULL; +	return true; +} + +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static int rcu_nocb_needs_gp(struct rcu_state *rsp) +{ +	return 0;  } -/* Initialize the ->call_remote fields in the rcu_state structures. */ -static void __init rcu_init_nocb(void) +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)  { -#ifdef CONFIG_PREEMPT_RCU -	rcu_preempt_state.call_remote = call_rcu_preempt_remote; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ -	rcu_bh_state.call_remote = call_rcu_bh_remote; -	rcu_sched_state.call_remote = call_rcu_sched_remote;  } -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ +}  static bool is_nocb_cpu(int cpu)  { @@ -2480,11 +2323,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,  	return 0;  } -static bool nocb_cpu_expendable(int cpu) -{ -	return 1; -} -  static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)  {  } @@ -2493,12 +2331,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)  {  } -static void init_nocb_callback_list(struct rcu_data *rdp) -{ -} - -static void __init rcu_init_nocb(void) +static bool init_nocb_callback_list(struct rcu_data *rdp)  { +	return false;  }  #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0d095dcaa67..49099e81c87 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,8 +46,6 @@  #define RCU_TREE_NONCORE  #include "rcutree.h" -#define ulong2long(a) (*(long *)(&(a))) -  static int r_open(struct inode *inode, struct file *file,  					const struct seq_operations *op)  { diff --git a/kernel/resource.c b/kernel/resource.c index 73f35d4b30b..d7386986e10 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -21,6 +21,7 @@  #include <linux/seq_file.h>  #include <linux/device.h>  #include <linux/pfn.h> +#include <linux/mm.h>  #include <asm/io.h> @@ -50,6 +51,14 @@ struct resource_constraint {  static DEFINE_RWLOCK(resource_lock); +/* + * For memory hotplug, there is no way to free resource entries allocated + * by boot mem after the system is up. So for reusing the resource entry + * we need to remember the resource. 
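[Editor's note] The comment above motivates the alloc_resource()/free_resource() pair that follows: resource entries carved out by the boot-time allocator cannot be kfree()'d once the system is up, so they are parked on a small free list and handed out again for later requests. A rough userspace model of that recycling pattern is sketched below; locking is omitted, a from_bootmem flag stands in for the kernel's PageSlab() test, and the names are illustrative rather than kernel code.

#include <stdlib.h>
#include <string.h>

struct res_entry {
	struct res_entry *sibling;	/* free-list link */
	int from_bootmem;		/* stands in for the PageSlab() check */
	/* ... payload fields ... */
};

static struct res_entry *free_list;	/* recycled boot-time entries */

static void free_entry(struct res_entry *e)
{
	if (!e)
		return;
	if (e->from_bootmem) {
		/* Can't hand it back to the allocator: keep it for reuse. */
		e->sibling = free_list;
		free_list = e;
	} else {
		free(e);
	}
}

static struct res_entry *alloc_entry(void)
{
	struct res_entry *e = free_list;

	if (e) {
		free_list = e->sibling;
		memset(e, 0, sizeof(*e));
		e->from_bootmem = 1;	/* recycled entries keep this property */
	} else {
		e = calloc(1, sizeof(*e));
	}
	return e;
}

int main(void)
{
	static struct res_entry boot_entry = { .from_bootmem = 1 };
	struct res_entry *e;

	free_entry(&boot_entry);	/* goes onto the reuse list, not free() */
	e = alloc_entry();		/* the recycled entry is handed back out */
	free_entry(e);
	return 0;
}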
+ */ +static struct resource *bootmem_resource_free; +static DEFINE_SPINLOCK(bootmem_resource_lock); +  static void *r_next(struct seq_file *m, void *v, loff_t *pos)  {  	struct resource *p = v; @@ -151,6 +160,40 @@ __initcall(ioresources_init);  #endif /* CONFIG_PROC_FS */ +static void free_resource(struct resource *res) +{ +	if (!res) +		return; + +	if (!PageSlab(virt_to_head_page(res))) { +		spin_lock(&bootmem_resource_lock); +		res->sibling = bootmem_resource_free; +		bootmem_resource_free = res; +		spin_unlock(&bootmem_resource_lock); +	} else { +		kfree(res); +	} +} + +static struct resource *alloc_resource(gfp_t flags) +{ +	struct resource *res = NULL; + +	spin_lock(&bootmem_resource_lock); +	if (bootmem_resource_free) { +		res = bootmem_resource_free; +		bootmem_resource_free = res->sibling; +	} +	spin_unlock(&bootmem_resource_lock); + +	if (res) +		memset(res, 0, sizeof(struct resource)); +	else +		res = kzalloc(sizeof(struct resource), flags); + +	return res; +} +  /* Return the conflict entry if you can't request it */  static struct resource * __request_resource(struct resource *root, struct resource *new)  { @@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)  	write_unlock(&resource_lock);  } -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments.  Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. - */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) +static int __adjust_resource(struct resource *res, resource_size_t start, +				resource_size_t size)  {  	struct resource *tmp, *parent = res->parent;  	resource_size_t end = start + size - 1;  	int result = -EBUSY; -	write_lock(&resource_lock); -  	if (!parent)  		goto skip; @@ -751,6 +783,26 @@ skip:  	result = 0;   out: +	return result; +} + +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * + * Given an existing resource, change its start and size to match the + * arguments.  Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. 
+ */ +int adjust_resource(struct resource *res, resource_size_t start, +			resource_size_t size) +{ +	int result; + +	write_lock(&resource_lock); +	result = __adjust_resource(res, start, size);  	write_unlock(&resource_lock);  	return result;  } @@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root,  {  	struct resource *parent = root;  	struct resource *conflict; -	struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); +	struct resource *res = alloc_resource(GFP_ATOMIC);  	struct resource *next_res = NULL;  	if (!res) @@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root,  		/* conflict covered whole area */  		if (conflict->start <= res->start &&  				conflict->end >= res->end) { -			kfree(res); +			free_resource(res);  			WARN_ON(next_res);  			break;  		} @@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root,  			end = res->end;  			res->end = conflict->start - 1;  			if (conflict->end < end) { -				next_res = kzalloc(sizeof(*next_res), -						GFP_ATOMIC); +				next_res = alloc_resource(GFP_ATOMIC);  				if (!next_res) { -					kfree(res); +					free_resource(res);  					break;  				}  				next_res->name = name; @@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent,  				   const char *name, int flags)  {  	DECLARE_WAITQUEUE(wait, current); -	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); +	struct resource *res = alloc_resource(GFP_KERNEL);  	if (!res)  		return NULL; @@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent,  			continue;  		}  		/* Uhhuh, that didn't work out.. */ -		kfree(res); +		free_resource(res);  		res = NULL;  		break;  	} @@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start,  		return -EBUSY;  	release_resource(res); -	kfree(res); +	free_resource(res);  	return 0;  }  EXPORT_SYMBOL(__check_region); @@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start,  			write_unlock(&resource_lock);  			if (res->flags & IORESOURCE_MUXED)  				wake_up(&muxed_resource_wait); -			kfree(res); +			free_resource(res);  			return;  		}  		p = &res->sibling; @@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start,  }  EXPORT_SYMBOL(__release_region); +#ifdef CONFIG_MEMORY_HOTREMOVE +/** + * release_mem_region_adjustable - release a previously reserved memory region + * @parent: parent resource descriptor + * @start: resource start address + * @size: resource region size + * + * This interface is intended for memory hot-delete.  The requested region + * is released from a currently busy memory resource.  The requested region + * must either match exactly or fit into a single busy resource entry.  In + * the latter case, the remaining resource is adjusted accordingly. + * Existing children of the busy memory resource must be immutable in the + * request. + * + * Note: + * - Additional release conditions, such as overlapping region, can be + *   supported after they are confirmed as valid cases. + * - When a busy memory resource gets split into two entries, the code + *   assumes that all children remain in the lower address entry for + *   simplicity.  Enhance this logic when necessary. 
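[Editor's note] The body of release_mem_region_adjustable() that follows reduces to an interval case analysis on the busy entry containing the requested range: free the whole entry, trim it from the front, trim it from the back, or split it into two entries. A small illustrative sketch of just that classification (resource-tree walking, locking and error handling left out; the names are made up, not kernel code):

#include <stdio.h>

enum release_action { REMOVE_WHOLE, TRIM_FRONT, TRIM_BACK, SPLIT_IN_TWO };

/* The request [start, end] is assumed to lie within the busy entry [r_start, r_end]. */
static enum release_action classify(unsigned long r_start, unsigned long r_end,
				    unsigned long start, unsigned long end)
{
	if (r_start == start && r_end == end)
		return REMOVE_WHOLE;	/* free the whole entry */
	if (r_start == start)
		return TRIM_FRONT;	/* entry becomes [end + 1, r_end] */
	if (r_end == end)
		return TRIM_BACK;	/* entry becomes [r_start, start - 1] */
	return SPLIT_IN_TWO;		/* [r_start, start - 1] and [end + 1, r_end] */
}

int main(void)
{
	/* Releasing 0x2000-0x2fff from a busy region 0x1000-0x3fff needs a split. */
	printf("%d\n", classify(0x1000, 0x3fff, 0x2000, 0x2fff));
	return 0;
}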
+ */ +int release_mem_region_adjustable(struct resource *parent, +			resource_size_t start, resource_size_t size) +{ +	struct resource **p; +	struct resource *res; +	struct resource *new_res; +	resource_size_t end; +	int ret = -EINVAL; + +	end = start + size - 1; +	if ((start < parent->start) || (end > parent->end)) +		return ret; + +	/* The alloc_resource() result gets checked later */ +	new_res = alloc_resource(GFP_KERNEL); + +	p = &parent->child; +	write_lock(&resource_lock); + +	while ((res = *p)) { +		if (res->start >= end) +			break; + +		/* look for the next resource if it does not fit into */ +		if (res->start > start || res->end < end) { +			p = &res->sibling; +			continue; +		} + +		if (!(res->flags & IORESOURCE_MEM)) +			break; + +		if (!(res->flags & IORESOURCE_BUSY)) { +			p = &res->child; +			continue; +		} + +		/* found the target resource; let's adjust accordingly */ +		if (res->start == start && res->end == end) { +			/* free the whole entry */ +			*p = res->sibling; +			free_resource(res); +			ret = 0; +		} else if (res->start == start && res->end != end) { +			/* adjust the start */ +			ret = __adjust_resource(res, end + 1, +						res->end - end); +		} else if (res->start != start && res->end == end) { +			/* adjust the end */ +			ret = __adjust_resource(res, res->start, +						start - res->start); +		} else { +			/* split into two entries */ +			if (!new_res) { +				ret = -ENOMEM; +				break; +			} +			new_res->name = res->name; +			new_res->start = end + 1; +			new_res->end = res->end; +			new_res->flags = res->flags; +			new_res->parent = res->parent; +			new_res->sibling = res->sibling; +			new_res->child = NULL; + +			ret = __adjust_resource(res, res->start, +						start - res->start); +			if (ret) +				break; +			res->sibling = new_res; +			new_res = NULL; +		} + +		break; +	} + +	write_unlock(&resource_lock); +	free_resource(new_res); +	return ret; +} +#endif	/* CONFIG_MEMORY_HOTREMOVE */ +  /*   * Managed region resource   */ diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 7890b10084a..1d96dd0d93c 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -14,6 +14,7 @@  #include <linux/spinlock.h>  #include <linux/timer.h>  #include <linux/freezer.h> +#include <linux/stat.h>  #include "rtmutex.h" @@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at  	return curr - buf;  } -static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); -static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); +static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); +static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);  static struct bus_type rttest_subsys = {  	.name = "rttest", diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c685e31492d..c3ae1446461 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)  	u64 this_clock, remote_clock;  	u64 *ptr, old_val, val; +#if BITS_PER_LONG != 64 +again: +	/* +	 * Careful here: The local and the remote clock values need to +	 * be read out atomic as we need to compare the values and +	 * then update either the local or the remote side. So the +	 * cmpxchg64 below only protects one readout. +	 * +	 * We must reread via sched_clock_local() in the retry case on +	 * 32bit as an NMI could use sched_clock_local() via the +	 * tracer and hit between the readout of +	 * the low32bit and the high 32bit portion. 
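[Editor's note] The comment here describes the 32-bit readout problem that the cmpxchg64(&scd->clock, 0, 0) call just below addresses: a compare-and-swap whose expected and desired values are both zero never modifies a non-zero clock, yet always returns an untorn copy of it. The hypothetical C11 snippet below only mirrors that idiom for illustration; in portable C11 a plain atomic_load() would already be atomic, and none of these names come from the kernel.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t remote_clock_word;

static uint64_t read_u64_untorn(_Atomic uint64_t *p)
{
	uint64_t expected = 0;

	/*
	 * If *p == 0 this stores 0 (a no-op); otherwise the CAS fails and
	 * copies the current value of *p into 'expected'.  Either way,
	 * 'expected' ends up holding an atomic snapshot of *p.
	 */
	(void)atomic_compare_exchange_strong(p, &expected, 0);
	return expected;
}

int main(void)
{
	atomic_store(&remote_clock_word, 0x0123456789abcdefULL);
	printf("%llx\n", (unsigned long long)read_u64_untorn(&remote_clock_word));
	return 0;
}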
+	 */ +	this_clock = sched_clock_local(my_scd); +	/* +	 * We must enforce atomic readout on 32bit, otherwise the +	 * update on the remote cpu can hit inbetween the readout of +	 * the low32bit and the high 32bit portion. +	 */ +	remote_clock = cmpxchg64(&scd->clock, 0, 0); +#else +	/* +	 * On 64bit the read of [my]scd->clock is atomic versus the +	 * update, so we can avoid the above 32bit dance. +	 */  	sched_clock_local(my_scd);  again:  	this_clock = my_scd->clock;  	remote_clock = scd->clock; +#endif  	/*  	 * Use the opportunity that we have both locks diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cb49b2ab0e1..ebdb1954121 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1498,8 +1498,10 @@ static void try_to_wake_up_local(struct task_struct *p)  {  	struct rq *rq = task_rq(p); -	BUG_ON(rq != this_rq()); -	BUG_ON(p == current); +	if (WARN_ON_ONCE(rq != this_rq()) || +	    WARN_ON_ONCE(p == current)) +		return; +  	lockdep_assert_held(&rq->lock);  	if (!raw_spin_trylock(&p->pi_lock)) { @@ -2997,51 +2999,6 @@ void __sched schedule_preempt_disabled(void)  	preempt_disable();  } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ -	if (lock->owner != owner) -		return false; - -	/* -	 * Ensure we emit the owner->on_cpu, dereference _after_ checking -	 * lock->owner still matches owner, if that fails, owner might -	 * point to free()d memory, if it still matches, the rcu_read_lock() -	 * ensures the memory stays valid. -	 */ -	barrier(); - -	return owner->on_cpu; -} - -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ -	if (!sched_feat(OWNER_SPIN)) -		return 0; - -	rcu_read_lock(); -	while (owner_running(lock, owner)) { -		if (need_resched()) -			break; - -		arch_mutex_cpu_relax(); -	} -	rcu_read_unlock(); - -	/* -	 * We break out the loop above on need_resched() and when the -	 * owner changed, which is a sign for heavy contention. Return -	 * success only when lock->owner is NULL. -	 */ -	return lock->owner == NULL; -} -#endif -  #ifdef CONFIG_PREEMPT  /*   * this is the entry point to schedule() from in-kernel preemption @@ -4130,6 +4087,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  	get_task_struct(p);  	rcu_read_unlock(); +	if (p->flags & PF_NO_SETAFFINITY) { +		retval = -EINVAL; +		goto out_put_task; +	}  	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {  		retval = -ENOMEM;  		goto out_put_task; @@ -4777,11 +4738,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)  		goto out;  	} -	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { -		ret = -EINVAL; -		goto out; -	} -  	do_set_cpus_allowed(p, new_mask);  	/* Can the task run on the task's current CPU? 
If so, we're done */ @@ -5003,7 +4959,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)  }  static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1;  static void  set_table_entry(struct ctl_table *entry, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 33508dc78d0..ea32f02bf2c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -294,7 +294,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)  	t = tsk;  	do { -		task_cputime(tsk, &utime, &stime); +		task_cputime(t, &utime, &stime);  		times->utime += utime;  		times->stime += stime;  		times->sum_exec_runtime += task_sched_runtime(t); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b5395..99399f8e479 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)  SCHED_FEAT(LB_BIAS, true)  /* - * Spin-wait on mutex acquisition when the mutex owner is running on - * another cpu -- assumes that when the owner is running, it will soon - * release the lock. Decreases scheduling overhead. - */ -SCHED_FEAT(OWNER_SPIN, true) - -/*   * Decrement CPU power based on time not spent running tasks   */  SCHED_FEAT(NONTASK_POWER, true) diff --git a/kernel/signal.c b/kernel/signal.c index 2ec870a4c3c..598dc06be42 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)  		if (force_default || ka->sa.sa_handler != SIG_IGN)  			ka->sa.sa_handler = SIG_DFL;  		ka->sa.sa_flags = 0; +#ifdef __ARCH_HAS_SA_RESTORER +		ka->sa.sa_restorer = NULL; +#endif  		sigemptyset(&ka->sa.sa_mask);  		ka++;  	} @@ -2682,7 +2685,7 @@ static int do_sigpending(void *set, unsigned long sigsetsize)  /**   *  sys_rt_sigpending - examine a pending signal that has been raised   *			while blocked - *  @set: stores pending signals + *  @uset: stores pending signals   *  @sigsetsize: size of sigset_t type or larger   */  SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) @@ -2945,7 +2948,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)  static int do_tkill(pid_t tgid, pid_t pid, int sig)  { -	struct siginfo info; +	struct siginfo info = {};  	info.si_signo = sig;  	info.si_errno = 0; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index b9bde572782..02fc5c93367 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -131,7 +131,7 @@ static int smpboot_thread_fn(void *data)  			continue;  		} -		//BUG_ON(td->cpu != smp_processor_id()); +		BUG_ON(td->cpu != smp_processor_id());  		/* Check for state change setup */  		switch (td->status) { @@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)  	}  	get_task_struct(tsk);  	*per_cpu_ptr(ht->store, cpu) = tsk; -	if (ht->create) -		ht->create(cpu); +	if (ht->create) { +		/* +		 * Make sure that the task has actually scheduled out +		 * into park position, before calling the create +		 * callback. At least the migration thread callback +		 * requires that the task is off the runqueue. 
+		 */ +		if (!wait_task_inactive(tsk, TASK_PARKED)) +			WARN_ON(1); +		else +			ht->create(cpu); +	}  	return 0;  } @@ -209,6 +219,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp  {  	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); +	if (ht->pre_unpark) +		ht->pre_unpark(cpu);  	kthread_unpark(tsk);  } diff --git a/kernel/softirq.c b/kernel/softirq.c index b4d252fd195..14d7758074a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -323,18 +323,10 @@ void irq_enter(void)  static inline void invoke_softirq(void)  { -	if (!force_irqthreads) { -#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED +	if (!force_irqthreads)  		__do_softirq(); -#else -		do_softirq(); -#endif -	} else { -		__local_bh_disable((unsigned long)__builtin_return_address(0), -				SOFTIRQ_OFFSET); +	else  		wakeup_softirqd(); -		__local_bh_enable(SOFTIRQ_OFFSET); -	}  }  /* @@ -342,9 +334,15 @@ static inline void invoke_softirq(void)   */  void irq_exit(void)  { +#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED +	local_irq_disable(); +#else +	WARN_ON_ONCE(!irqs_disabled()); +#endif +  	account_irq_exit_time(current);  	trace_hardirq_exit(); -	sub_preempt_count(IRQ_EXIT_OFFSET); +	sub_preempt_count(HARDIRQ_OFFSET);  	if (!in_interrupt() && local_softirq_pending())  		invoke_softirq(); @@ -354,7 +352,6 @@ void irq_exit(void)  		tick_nohz_irq_exit();  #endif  	rcu_irq_exit(); -	sched_preempt_enable_no_resched();  }  /* diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 95d178c62d5..c09f2955ae3 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -336,7 +336,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {  	.create			= cpu_stop_create,  	.setup			= cpu_stop_unpark,  	.park			= cpu_stop_park, -	.unpark			= cpu_stop_unpark, +	.pre_unpark		= cpu_stop_unpark,  	.selfparking		= true,  }; diff --git a/kernel/sys.c b/kernel/sys.c index 81f56445fba..0da73cf73e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -324,7 +324,6 @@ void kernel_restart_prepare(char *cmd)  	system_state = SYSTEM_RESTART;  	usermodehelper_disable();  	device_shutdown(); -	syscore_shutdown();  }  /** @@ -370,6 +369,7 @@ void kernel_restart(char *cmd)  {  	kernel_restart_prepare(cmd);  	disable_nonboot_cpus(); +	syscore_shutdown();  	if (!cmd)  		printk(KERN_EMERG "Restarting system.\n");  	else @@ -395,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state)  void kernel_halt(void)  {  	kernel_shutdown_prepare(SYSTEM_HALT); +	disable_nonboot_cpus();  	syscore_shutdown();  	printk(KERN_EMERG "System halted.\n");  	kmsg_dump(KMSG_DUMP_HALT); @@ -2185,9 +2186,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,  char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static int __orderly_poweroff(void) +static int __orderly_poweroff(bool force)  { -	int argc;  	char **argv;  	static char *envp[] = {  		"HOME=/", @@ -2196,20 +2196,40 @@ static int __orderly_poweroff(void)  	};  	int ret; -	argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); -	if (argv == NULL) { +	argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); +	if (argv) { +		ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); +		argv_free(argv); +	} else {  		printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", -		       __func__, poweroff_cmd); -		return -ENOMEM; +					 __func__, poweroff_cmd); +		ret = -ENOMEM;  	} -	ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, -				      NULL, NULL, NULL); -	argv_free(argv); +	if (ret && force) { +		printk(KERN_WARNING 
"Failed to start orderly shutdown: " +					"forcing the issue\n"); +		/* +		 * I guess this should try to kick off some daemon to sync and +		 * poweroff asap.  Or not even bother syncing if we're doing an +		 * emergency shutdown? +		 */ +		emergency_sync(); +		kernel_power_off(); +	}  	return ret;  } +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ +	__orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); +  /**   * orderly_poweroff - Trigger an orderly system poweroff   * @force: force poweroff if command execution fails @@ -2219,21 +2239,9 @@ static int __orderly_poweroff(void)   */  int orderly_poweroff(bool force)  { -	int ret = __orderly_poweroff(); - -	if (ret && force) { -		printk(KERN_WARNING "Failed to start orderly shutdown: " -		       "forcing the issue\n"); - -		/* -		 * I guess this should try to kick off some daemon to sync and -		 * poweroff asap.  Or not even bother syncing if we're doing an -		 * emergency shutdown? -		 */ -		emergency_sync(); -		kernel_power_off(); -	} - -	return ret; +	if (force) /* do not override the pending "true" */ +		poweroff_force = true; +	schedule_work(&poweroff_work); +	return 0;  }  EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index afc1dc60f3f..9edcf456e0f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit;  #endif  extern int pid_max;  extern int pid_max_min, pid_max_max; -extern int sysctl_drop_caches;  extern int percpu_pagelist_fraction;  extern int compat_log;  extern int latencytop_enabled; @@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = {  		.extra2		= &one,  	},  #endif +	{ +		.procname	= "user_reserve_kbytes", +		.data		= &sysctl_user_reserve_kbytes, +		.maxlen		= sizeof(sysctl_user_reserve_kbytes), +		.mode		= 0644, +		.proc_handler	= proc_doulongvec_minmax, +	}, +	{ +		.procname	= "admin_reserve_kbytes", +		.data		= &sysctl_admin_reserve_kbytes, +		.maxlen		= sizeof(sysctl_admin_reserve_kbytes), +		.mode		= 0644, +		.proc_handler	= proc_doulongvec_minmax, +	},  	{ }  }; diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index f8b11a28317..12d6ebbfdd8 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -365,7 +365,7 @@ int init_test_probes(void)  	target2 = kprobe_target2;  	do { -		rand1 = random32(); +		rand1 = prandom_u32();  	} while (rand1 <= div_factor);  	printk(KERN_INFO "Kprobe smoke test started\n"); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2fb8cb88df8..7f32fe0e52c 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)   */  int tick_check_broadcast_device(struct clock_event_device *dev)  { -	if ((tick_broadcast_device.evtdev && +	if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || +	    (tick_broadcast_device.evtdev &&  	     tick_broadcast_device.evtdev->rating >= dev->rating) ||  	     (dev->features & CLOCK_EVT_FEAT_C3STOP))  		return 0; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 192473b2279..5e9efd4b83a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -176,6 +176,8 @@ config IRQSOFF_TRACER  	select GENERIC_TRACER  	select TRACER_MAX_TRACE  	select RING_BUFFER_ALLOW_SWAP +	select TRACER_SNAPSHOT +	select TRACER_SNAPSHOT_PER_CPU_SWAP  	help  	  This option measures the time spent in irqs-off critical  	  sections, with microsecond 
accuracy. @@ -198,6 +200,8 @@ config PREEMPT_TRACER  	select GENERIC_TRACER  	select TRACER_MAX_TRACE  	select RING_BUFFER_ALLOW_SWAP +	select TRACER_SNAPSHOT +	select TRACER_SNAPSHOT_PER_CPU_SWAP  	help  	  This option measures the time spent in preemption-off critical  	  sections, with microsecond accuracy. @@ -217,6 +221,7 @@ config SCHED_TRACER  	select GENERIC_TRACER  	select CONTEXT_SWITCH_TRACER  	select TRACER_MAX_TRACE +	select TRACER_SNAPSHOT  	help  	  This tracer tracks the latency of the highest priority task  	  to be scheduled in, starting from the point it has woken up. @@ -248,6 +253,27 @@ config TRACER_SNAPSHOT  	      echo 1 > /sys/kernel/debug/tracing/snapshot  	      cat snapshot +config TRACER_SNAPSHOT_PER_CPU_SWAP +        bool "Allow snapshot to swap per CPU" +	depends on TRACER_SNAPSHOT +	select RING_BUFFER_ALLOW_SWAP +	help +	  Allow doing a snapshot of a single CPU buffer instead of a +	  full swap (all buffers). If this is set, then the following is +	  allowed: + +	      echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot + +	  After which, only the tracing buffer for CPU 2 was swapped with +	  the main tracing buffer, and the other CPU buffers remain the same. + +	  When this is enabled, this adds a little more overhead to the +	  trace recording, as it needs to add some checks to synchronize +	  recording with swaps. But this does not affect the performance +	  of the overall system. This is enabled by default when the preempt +	  or irq latency tracers are enabled, as those need to swap as well +	  and already adds the overhead (plus a lot more). +  config TRACE_BRANCH_PROFILING  	bool  	select GENERIC_TRACER @@ -414,24 +440,28 @@ config PROBE_EVENTS  	def_bool n  config DYNAMIC_FTRACE -	bool "enable/disable ftrace tracepoints dynamically" +	bool "enable/disable function tracing dynamically"  	depends on FUNCTION_TRACER  	depends on HAVE_DYNAMIC_FTRACE  	default y  	help -          This option will modify all the calls to ftrace dynamically -	  (will patch them out of the binary image and replace them -	  with a No-Op instruction) as they are called. A table is -	  created to dynamically enable them again. +	  This option will modify all the calls to function tracing +	  dynamically (will patch them out of the binary image and +	  replace them with a No-Op instruction) on boot up. During +	  compile time, a table is made of all the locations that ftrace +	  can function trace, and this table is linked into the kernel +	  image. When this is enabled, functions can be individually +	  enabled, and the functions not enabled will not affect +	  performance of the system. + +	  See the files in /sys/kernel/debug/tracing: +	    available_filter_functions +	    set_ftrace_filter +	    set_ftrace_notrace  	  This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but  	  otherwise has native performance as long as no tracing is active. -	  The changes to the code are done by a kernel thread that -	  wakes up once a second and checks to see if any ftrace calls -	  were made. If so, it runs stop_machine (stops all CPUS) -	  and modifies the code to jump over the call to ftrace. -  config DYNAMIC_FTRACE_WITH_REGS  	def_bool y  	depends on DYNAMIC_FTRACE @@ -520,6 +550,29 @@ config RING_BUFFER_BENCHMARK  	  If unsure, say N. +config RING_BUFFER_STARTUP_TEST +       bool "Ring buffer startup self test" +       depends on RING_BUFFER +       help +         Run a simple self test on the ring buffer on boot up. 
Late in the +	 kernel boot sequence, the test will start that kicks off +	 a thread per cpu. Each thread will write various size events +	 into the ring buffer. Another thread is created to send IPIs +	 to each of the threads, where the IPI handler will also write +	 to the ring buffer, to test/stress the nesting ability. +	 If any anomalies are discovered, a warning will be displayed +	 and all ring buffers will be disabled. + +	 The test runs for 10 seconds. This will slow your boot time +	 by at least 10 more seconds. + +	 At the end of the test, statics and more checks are done. +	 It will output the stats of each per cpu buffer. What +	 was written, the sizes, what was read, what was lost, and +	 other similar details. + +	 If unsure, say N +  endif # FTRACE  endif # TRACING_SUPPORT diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9e5b8c272ee..ed58a3216a6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,  	bool blk_tracer = blk_tracer_enabled;  	if (blk_tracer) { -		buffer = blk_tr->buffer; +		buffer = blk_tr->trace_buffer.buffer;  		pc = preempt_count();  		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,  						  sizeof(*t) + len, @@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,  	if (blk_tracer) {  		tracing_record_cmdline(current); -		buffer = blk_tr->buffer; +		buffer = blk_tr->trace_buffer.buffer;  		pc = preempt_count();  		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,  						  sizeof(*t) + pdu_len, @@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,  				      struct request_queue *q,  				      struct request *rq)  { -	struct blk_trace *bt = q->blk_trace; - -	/* if control ever passes through here, it's a request based driver */ -	if (unlikely(bt && !bt->rq_based)) -		bt->rq_based = true; -  	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);  } @@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,  	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);  } -static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) +static void blk_add_trace_bio_complete(void *ignore, +				       struct request_queue *q, struct bio *bio, +				       int error)  { -	struct request_queue *q; -	struct blk_trace *bt; - -	if (!bio->bi_bdev) -		return; - -	q = bdev_get_queue(bio->bi_bdev); -	bt = q->blk_trace; - -	/* -	 * Request based drivers will generate both rq and bio completions. -	 * Ignore bio ones. 
-	 */ -	if (likely(!bt) || bt->rq_based) -		return; -  	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);  } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ab25b88aae5..8a5c017bb50 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -66,7 +66,7 @@  static struct ftrace_ops ftrace_list_end __read_mostly = {  	.func		= ftrace_stub, -	.flags		= FTRACE_OPS_FL_RECURSION_SAFE, +	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,  };  /* ftrace_enabled is a method to turn ftrace on or off */ @@ -486,7 +486,6 @@ struct ftrace_profile_stat {  #define PROFILES_PER_PAGE					\  	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits __read_mostly;  static int ftrace_profile_enabled __read_mostly;  /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ @@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);  static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); -#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ +#define FTRACE_PROFILE_HASH_BITS 10 +#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)  static void *  function_stat_next(void *v, int idx) @@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)  	pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); -	for (i = 0; i < pages; i++) { +	for (i = 1; i < pages; i++) {  		pg->next = (void *)get_zeroed_page(GFP_KERNEL);  		if (!pg->next)  			goto out_free; @@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)  		free_page(tmp);  	} -	free_page((unsigned long)stat->pages);  	stat->pages = NULL;  	stat->start = NULL; @@ -725,13 +724,6 @@ static int ftrace_profile_init_cpu(int cpu)  	if (!stat->hash)  		return -ENOMEM; -	if (!ftrace_profile_bits) { -		size--; - -		for (; size; size >>= 1) -			ftrace_profile_bits++; -	} -  	/* Preallocate the function profiling pages */  	if (ftrace_profile_pages_init(stat) < 0) {  		kfree(stat->hash); @@ -764,7 +756,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)  	struct hlist_head *hhd;  	unsigned long key; -	key = hash_long(ip, ftrace_profile_bits); +	key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);  	hhd = &stat->hash[key];  	if (hlist_empty(hhd)) @@ -783,7 +775,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,  {  	unsigned long key; -	key = hash_long(rec->ip, ftrace_profile_bits); +	key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);  	hlist_add_head_rcu(&rec->node, &stat->hash[key]);  } @@ -1053,6 +1045,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)  static struct pid * const ftrace_swapper_pid = &init_struct_pid; +loff_t +ftrace_filter_lseek(struct file *file, loff_t offset, int whence) +{ +	loff_t ret; + +	if (file->f_mode & FMODE_READ) +		ret = seq_lseek(file, offset, whence); +	else +		file->f_pos = ret = 1; + +	return ret; +} +  #ifdef CONFIG_DYNAMIC_FTRACE  #ifndef CONFIG_FTRACE_MCOUNT_RECORD @@ -1067,7 +1072,7 @@ struct ftrace_func_probe {  	unsigned long		flags;  	unsigned long		ip;  	void			*data; -	struct rcu_head		rcu; +	struct list_head	free_list;  };  struct ftrace_func_entry { @@ -1317,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,  	struct hlist_head *hhd;  	struct ftrace_hash *old_hash;  	struct ftrace_hash *new_hash; -	unsigned long key;  	int size = src->count;  	int bits = 0;  	int ret; @@ -1360,10 +1364,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,  	for (i = 0; i < size; i++) {  		
hhd = &src->buckets[i];  		hlist_for_each_entry_safe(entry, tn, hhd, hlist) { -			if (bits > 0) -				key = hash_long(entry->ip, bits); -			else -				key = 0;  			remove_hash_entry(src, entry);  			__add_hash_entry(new_hash, entry);  		} @@ -2613,7 +2613,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)   * routine, you can use ftrace_filter_write() for the write   * routine if @flag has FTRACE_ITER_FILTER set, or   * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_regex_lseek() should be used as the lseek routine, and + * ftrace_filter_lseek() should be used as the lseek routine, and   * release must call ftrace_regex_release().   */  int @@ -2697,19 +2697,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)  				 inode, file);  } -loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int whence) -{ -	loff_t ret; - -	if (file->f_mode & FMODE_READ) -		ret = seq_lseek(file, offset, whence); -	else -		file->f_pos = ret = 1; - -	return ret; -} -  static int ftrace_match(char *str, char *regex, int len, int type)  {  	int matched = 0; @@ -2974,28 +2961,27 @@ static void __disable_ftrace_function_probe(void)  } -static void ftrace_free_entry_rcu(struct rcu_head *rhp) +static void ftrace_free_entry(struct ftrace_func_probe *entry)  { -	struct ftrace_func_probe *entry = -		container_of(rhp, struct ftrace_func_probe, rcu); -  	if (entry->ops->free) -		entry->ops->free(&entry->data); +		entry->ops->free(entry->ops, entry->ip, &entry->data);  	kfree(entry);  } -  int  register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  			      void *data)  {  	struct ftrace_func_probe *entry; +	struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; +	struct ftrace_hash *hash;  	struct ftrace_page *pg;  	struct dyn_ftrace *rec;  	int type, len, not;  	unsigned long key;  	int count = 0;  	char *search; +	int ret;  	type = filter_parse_regex(glob, strlen(glob), &search, ¬);  	len = strlen(search); @@ -3006,8 +2992,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  	mutex_lock(&ftrace_lock); -	if (unlikely(ftrace_disabled)) +	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); +	if (!hash) { +		count = -ENOMEM; +		goto out_unlock; +	} + +	if (unlikely(ftrace_disabled)) { +		count = -ENODEV;  		goto out_unlock; +	}  	do_for_each_ftrace_rec(pg, rec) { @@ -3031,14 +3025,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  		 * for each function we find. We call the callback  		 * to give the caller an opportunity to do so.  		 
*/ -		if (ops->callback) { -			if (ops->callback(rec->ip, &entry->data) < 0) { +		if (ops->init) { +			if (ops->init(ops, rec->ip, &entry->data) < 0) {  				/* caller does not like this func */  				kfree(entry);  				continue;  			}  		} +		ret = enter_record(hash, rec, 0); +		if (ret < 0) { +			kfree(entry); +			count = ret; +			goto out_unlock; +		} +  		entry->ops = ops;  		entry->ip = rec->ip; @@ -3046,10 +3047,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  		hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);  	} while_for_each_ftrace_rec(); + +	ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); +	if (ret < 0) +		count = ret; +  	__enable_ftrace_function_probe();   out_unlock:  	mutex_unlock(&ftrace_lock); +	free_ftrace_hash(hash);  	return count;  } @@ -3063,7 +3070,12 @@ static void  __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  				  void *data, int flags)  { +	struct ftrace_func_entry *rec_entry;  	struct ftrace_func_probe *entry; +	struct ftrace_func_probe *p; +	struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; +	struct list_head free_list; +	struct ftrace_hash *hash;  	struct hlist_node *tmp;  	char str[KSYM_SYMBOL_LEN];  	int type = MATCH_FULL; @@ -3084,6 +3096,14 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  	}  	mutex_lock(&ftrace_lock); + +	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); +	if (!hash) +		/* Hmm, should report this somehow */ +		goto out_unlock; + +	INIT_LIST_HEAD(&free_list); +  	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {  		struct hlist_head *hhd = &ftrace_func_hash[i]; @@ -3104,12 +3124,30 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  					continue;  			} -			hlist_del(&entry->node); -			call_rcu(&entry->rcu, ftrace_free_entry_rcu); +			rec_entry = ftrace_lookup_ip(hash, entry->ip); +			/* It is possible more than one entry had this ip */ +			if (rec_entry) +				free_hash_entry(hash, rec_entry); + +			hlist_del_rcu(&entry->node); +			list_add(&entry->free_list, &free_list);  		}  	}  	__disable_ftrace_function_probe(); +	/* +	 * Remove after the disable is called. Otherwise, if the last +	 * probe is removed, a null hash means *all enabled*. 
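As an illustration (not from the patch itself), a probe-ops client written against the per-ip init/free signatures used in these hunks might look like the sketch below. Only the callbacks visible here are spelled out; the probe handler and print callback are elided, and the "vfs_*" glob is just an example.

/* Illustrative only; matches the ops->init()/ops->free() calls above. */
#include <linux/ftrace.h>
#include <linux/kernel.h>

static int demo_probe_init(struct ftrace_probe_ops *ops,
			   unsigned long ip, void **data)
{
	*data = NULL;			/* per-ip private state, if any */
	return 0;			/* returning < 0 rejects this function */
}

static void demo_probe_free(struct ftrace_probe_ops *ops,
			    unsigned long ip, void **data)
{
	/* release whatever demo_probe_init() attached to *data */
}

static struct ftrace_probe_ops demo_probe_ops = {
	.init	= demo_probe_init,
	.free	= demo_probe_free,
	/* .func / .print omitted for brevity */
};

static void demo_attach(void)
{
	static char glob[] = "vfs_*";
	int count = register_ftrace_function_probe(glob, &demo_probe_ops, NULL);

	if (count <= 0)
		pr_warn("probe registration failed: %d\n", count);
}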
+	 */ +	ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); +	synchronize_sched(); +	list_for_each_entry_safe(entry, p, &free_list, free_list) { +		list_del(&entry->free_list); +		ftrace_free_entry(entry); +	} +		 + out_unlock:  	mutex_unlock(&ftrace_lock); +	free_ftrace_hash(hash);  }  void @@ -3441,14 +3479,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;  static int __init set_ftrace_notrace(char *str)  { -	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); +	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);  	return 1;  }  __setup("ftrace_notrace=", set_ftrace_notrace);  static int __init set_ftrace_filter(char *str)  { -	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); +	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);  	return 1;  }  __setup("ftrace_filter=", set_ftrace_filter); @@ -3571,7 +3609,7 @@ static const struct file_operations ftrace_filter_fops = {  	.open = ftrace_filter_open,  	.read = seq_read,  	.write = ftrace_filter_write, -	.llseek = ftrace_regex_lseek, +	.llseek = ftrace_filter_lseek,  	.release = ftrace_regex_release,  }; @@ -3579,7 +3617,7 @@ static const struct file_operations ftrace_notrace_fops = {  	.open = ftrace_notrace_open,  	.read = seq_read,  	.write = ftrace_notrace_write, -	.llseek = ftrace_regex_lseek, +	.llseek = ftrace_filter_lseek,  	.release = ftrace_regex_release,  }; @@ -3737,7 +3775,8 @@ out:  	if (fail)  		return -EINVAL; -	ftrace_graph_filter_enabled = 1; +	ftrace_graph_filter_enabled = !!(*idx); +  	return 0;  } @@ -3784,8 +3823,8 @@ static const struct file_operations ftrace_graph_fops = {  	.open		= ftrace_graph_open,  	.read		= seq_read,  	.write		= ftrace_graph_write, +	.llseek		= ftrace_filter_lseek,  	.release	= ftrace_graph_release, -	.llseek		= seq_lseek,  };  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -4131,7 +4170,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,  	preempt_disable_notrace();  	trace_recursion_set(TRACE_CONTROL_BIT);  	do_for_each_ftrace_op(op, ftrace_control_list) { -		if (!ftrace_function_local_disabled(op) && +		if (!(op->flags & FTRACE_OPS_FL_STUB) && +		    !ftrace_function_local_disabled(op) &&  		    ftrace_ops_test(op, ip))  			op->func(ip, parent_ip, op, regs);  	} while_for_each_ftrace_op(op); @@ -4439,7 +4479,7 @@ static const struct file_operations ftrace_pid_fops = {  	.open		= ftrace_pid_open,  	.write		= ftrace_pid_write,  	.read		= seq_read, -	.llseek		= seq_lseek, +	.llseek		= ftrace_filter_lseek,  	.release	= ftrace_pid_release,  }; @@ -4555,12 +4595,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,  		ftrace_startup_sysctl();  		/* we are starting ftrace again */ -		if (ftrace_ops_list != &ftrace_list_end) { -			if (ftrace_ops_list->next == &ftrace_list_end) -				ftrace_trace_function = ftrace_ops_list->func; -			else -				ftrace_trace_function = ftrace_ops_list_func; -		} +		if (ftrace_ops_list != &ftrace_list_end) +			update_ftrace_function();  	} else {  		/* stopping ftrace calls (just send to ftrace_stub) */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6989df2ba19..b59aea2c48c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,13 +8,16 @@  #include <linux/trace_clock.h>  #include <linux/trace_seq.h>  #include <linux/spinlock.h> +#include <linux/irq_work.h>  #include <linux/debugfs.h>  #include <linux/uaccess.h>  #include <linux/hardirq.h> +#include <linux/kthread.h>	/* for self test */  #include <linux/kmemcheck.h>  #include <linux/module.h>  #include 
<linux/percpu.h>  #include <linux/mutex.h> +#include <linux/delay.h>  #include <linux/slab.h>  #include <linux/init.h>  #include <linux/hash.h> @@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)  	return ret;  } +struct rb_irq_work { +	struct irq_work			work; +	wait_queue_head_t		waiters; +	bool				waiters_pending; +}; +  /*   * head_page == tail_page && head == tail then buffer is empty.   */ @@ -478,6 +487,8 @@ struct ring_buffer_per_cpu {  	struct list_head		new_pages; /* new pages to add */  	struct work_struct		update_pages_work;  	struct completion		update_done; + +	struct rb_irq_work		irq_work;  };  struct ring_buffer { @@ -497,6 +508,8 @@ struct ring_buffer {  	struct notifier_block		cpu_notify;  #endif  	u64				(*clock)(void); + +	struct rb_irq_work		irq_work;  };  struct ring_buffer_iter { @@ -508,6 +521,118 @@ struct ring_buffer_iter {  	u64				read_stamp;  }; +/* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * + * Schedules a delayed work to wake up any task that is blocked on the + * ring buffer waiters queue. + */ +static void rb_wake_up_waiters(struct irq_work *work) +{ +	struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + +	wake_up_all(&rbwork->waiters); +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +void ring_buffer_wait(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	DEFINE_WAIT(wait); +	struct rb_irq_work *work; + +	/* +	 * Depending on what the caller is waiting for, either any +	 * data in any cpu buffer, or a specific buffer, put the +	 * caller on the appropriate wait queue. +	 */ +	if (cpu == RING_BUFFER_ALL_CPUS) +		work = &buffer->irq_work; +	else { +		cpu_buffer = buffer->buffers[cpu]; +		work = &cpu_buffer->irq_work; +	} + + +	prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + +	/* +	 * The events can happen in critical sections where +	 * checking a work queue can cause deadlocks. +	 * After adding a task to the queue, this flag is set +	 * only to notify events to try to wake up the queue +	 * using irq_work. +	 * +	 * We don't clear it even if the buffer is no longer +	 * empty. The flag only causes the next event to run +	 * irq_work to do the work queue wake up. The worse +	 * that can happen if we race with !trace_empty() is that +	 * an event will cause an irq_work to try to wake up +	 * an empty queue. +	 * +	 * There's no reason to protect this flag either, as +	 * the work queue and irq_work logic will do the necessary +	 * synchronization for the wake ups. The only thing +	 * that is necessary is that the wake up happens after +	 * a task has been queued. It's OK for spurious wake ups. 
+	 */ +	work->waiters_pending = true; + +	if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || +	    (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) +		schedule(); + +	finish_wait(&work->waiters, &wait); +} + +/** + * ring_buffer_poll_wait - poll on buffer input + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + * + * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * zero otherwise. + */ +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +			  struct file *filp, poll_table *poll_table) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	struct rb_irq_work *work; + +	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || +	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) +		return POLLIN | POLLRDNORM; + +	if (cpu == RING_BUFFER_ALL_CPUS) +		work = &buffer->irq_work; +	else { +		cpu_buffer = buffer->buffers[cpu]; +		work = &cpu_buffer->irq_work; +	} + +	work->waiters_pending = true; +	poll_wait(filp, &work->waiters, poll_table); + +	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || +	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) +		return POLLIN | POLLRDNORM; +	return 0; +} +  /* buffer may be either ring_buffer or ring_buffer_per_cpu */  #define RB_WARN_ON(b, cond)						\  	({								\ @@ -1063,6 +1188,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)  	cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;  	INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);  	init_completion(&cpu_buffer->update_done); +	init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); +	init_waitqueue_head(&cpu_buffer->irq_work.waiters);  	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),  			    GFP_KERNEL, cpu_to_node(cpu)); @@ -1158,6 +1285,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,  	buffer->clock = trace_clock_local;  	buffer->reader_lock_key = key; +	init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); +	init_waitqueue_head(&buffer->irq_work.waiters); +  	/* need at least two pages */  	if (nr_pages < 2)  		nr_pages = 2; @@ -1553,11 +1683,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,  			if (!cpu_buffer->nr_pages_to_update)  				continue; -			if (cpu_online(cpu)) +			/* The update must run on the CPU that is being updated. */ +			preempt_disable(); +			if (cpu == smp_processor_id() || !cpu_online(cpu)) { +				rb_update_pages(cpu_buffer); +				cpu_buffer->nr_pages_to_update = 0; +			} else { +				/* +				 * Can not disable preemption for schedule_work_on() +				 * on PREEMPT_RT. +				 */ +				preempt_enable();  				schedule_work_on(cpu,  						&cpu_buffer->update_pages_work); -			else -				rb_update_pages(cpu_buffer); +				preempt_disable(); +			} +			preempt_enable();  		}  		/* wait for all the updates to complete */ @@ -1595,12 +1736,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,  		get_online_cpus(); -		if (cpu_online(cpu_id)) { +		preempt_disable(); +		/* The update must run on the CPU that is being updated. 
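Stepping back to the wait/poll interfaces added earlier in this file: a blocking consumer built on them could look like the sketch below (illustrative only). ring_buffer_wait(), RING_BUFFER_ALL_CPUS, ring_buffer_consume() and ring_buffer_event_length() are all visible in this series; ring_buffer_poll_wait() would slot into a file's .poll method in the same way.

/* Illustrative only; sleep until any cpu buffer has data, then drain it. */
#include <linux/ring_buffer.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>

static void demo_drain(struct ring_buffer *buffer)
{
	struct ring_buffer_event *event;
	unsigned long lost;
	int len;
	int cpu;

	/* block until at least one cpu buffer becomes non-empty */
	ring_buffer_wait(buffer, RING_BUFFER_ALL_CPUS);

	for_each_online_cpu(cpu) {
		while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
			len = ring_buffer_event_length(event);
			pr_info("cpu %d: event of %d bytes (%lu lost)\n",
				cpu, len, lost);
		}
	}
}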
*/ +		if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) +			rb_update_pages(cpu_buffer); +		else { +			/* +			 * Can not disable preemption for schedule_work_on() +			 * on PREEMPT_RT. +			 */ +			preempt_enable();  			schedule_work_on(cpu_id,  					 &cpu_buffer->update_pages_work);  			wait_for_completion(&cpu_buffer->update_done); -		} else -			rb_update_pages(cpu_buffer); +			preempt_disable(); +		} +		preempt_enable();  		cpu_buffer->nr_pages_to_update = 0;  		put_online_cpus(); @@ -2612,6 +2763,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,  	rb_end_commit(cpu_buffer);  } +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ +	if (buffer->irq_work.waiters_pending) { +		buffer->irq_work.waiters_pending = false; +		/* irq_work_queue() supplies it's own memory barriers */ +		irq_work_queue(&buffer->irq_work.work); +	} + +	if (cpu_buffer->irq_work.waiters_pending) { +		cpu_buffer->irq_work.waiters_pending = false; +		/* irq_work_queue() supplies it's own memory barriers */ +		irq_work_queue(&cpu_buffer->irq_work.work); +	} +} +  /**   * ring_buffer_unlock_commit - commit a reserved   * @buffer: The buffer to commit to @@ -2631,6 +2798,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,  	rb_commit(cpu_buffer, event); +	rb_wakeups(buffer, cpu_buffer); +  	trace_recursive_unlock();  	preempt_enable_notrace(); @@ -2803,6 +2972,8 @@ int ring_buffer_write(struct ring_buffer *buffer,  	rb_commit(cpu_buffer, event); +	rb_wakeups(buffer, cpu_buffer); +  	ret = 0;   out:  	preempt_enable_notrace(); @@ -4467,3 +4638,320 @@ static int rb_cpu_notify(struct notifier_block *self,  	return NOTIFY_OK;  }  #endif + +#ifdef CONFIG_RING_BUFFER_STARTUP_TEST +/* + * This is a basic integrity check of the ring buffer. + * Late in the boot cycle this test will run when configured in. + * It will kick off a thread per CPU that will go into a loop + * writing to the per cpu ring buffer various sizes of data. + * Some of the data will be large items, some small. + * + * Another thread is created that goes into a spin, sending out + * IPIs to the other CPUs to also write into the ring buffer. + * this is to test the nesting ability of the buffer. + * + * Basic stats are recorded and reported. If something in the + * ring buffer should happen that's not expected, a big warning + * is displayed and all ring buffers are disabled. 
+ */ +static struct task_struct *rb_threads[NR_CPUS] __initdata; + +struct rb_test_data { +	struct ring_buffer	*buffer; +	unsigned long		events; +	unsigned long		bytes_written; +	unsigned long		bytes_alloc; +	unsigned long		bytes_dropped; +	unsigned long		events_nested; +	unsigned long		bytes_written_nested; +	unsigned long		bytes_alloc_nested; +	unsigned long		bytes_dropped_nested; +	int			min_size_nested; +	int			max_size_nested; +	int			max_size; +	int			min_size; +	int			cpu; +	int			cnt; +}; + +static struct rb_test_data rb_data[NR_CPUS] __initdata; + +/* 1 meg per cpu */ +#define RB_TEST_BUFFER_SIZE	1048576 + +static char rb_string[] __initdata = +	"abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" +	"?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" +	"!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; + +static bool rb_test_started __initdata; + +struct rb_item { +	int size; +	char str[]; +}; + +static __init int rb_write_something(struct rb_test_data *data, bool nested) +{ +	struct ring_buffer_event *event; +	struct rb_item *item; +	bool started; +	int event_len; +	int size; +	int len; +	int cnt; + +	/* Have nested writes different that what is written */ +	cnt = data->cnt + (nested ? 27 : 0); + +	/* Multiply cnt by ~e, to make some unique increment */ +	size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + +	len = size + sizeof(struct rb_item); + +	started = rb_test_started; +	/* read rb_test_started before checking buffer enabled */ +	smp_rmb(); + +	event = ring_buffer_lock_reserve(data->buffer, len); +	if (!event) { +		/* Ignore dropped events before test starts. */ +		if (started) { +			if (nested) +				data->bytes_dropped += len; +			else +				data->bytes_dropped_nested += len; +		} +		return len; +	} + +	event_len = ring_buffer_event_length(event); + +	if (RB_WARN_ON(data->buffer, event_len < len)) +		goto out; + +	item = ring_buffer_event_data(event); +	item->size = size; +	memcpy(item->str, rb_string, size); + +	if (nested) { +		data->bytes_alloc_nested += event_len; +		data->bytes_written_nested += len; +		data->events_nested++; +		if (!data->min_size_nested || len < data->min_size_nested) +			data->min_size_nested = len; +		if (len > data->max_size_nested) +			data->max_size_nested = len; +	} else { +		data->bytes_alloc += event_len; +		data->bytes_written += len; +		data->events++; +		if (!data->min_size || len < data->min_size) +			data->max_size = len; +		if (len > data->max_size) +			data->max_size = len; +	} + + out: +	ring_buffer_unlock_commit(data->buffer, event); + +	return 0; +} + +static __init int rb_test(void *arg) +{ +	struct rb_test_data *data = arg; + +	while (!kthread_should_stop()) { +		rb_write_something(data, false); +		data->cnt++; + +		set_current_state(TASK_INTERRUPTIBLE); +		/* Now sleep between a min of 100-300us and a max of 1ms */ +		usleep_range(((data->cnt % 3) + 1) * 100, 1000); +	} + +	return 0; +} + +static __init void rb_ipi(void *ignore) +{ +	struct rb_test_data *data; +	int cpu = smp_processor_id(); + +	data = &rb_data[cpu]; +	rb_write_something(data, true); +} + +static __init int rb_hammer_test(void *arg) +{ +	while (!kthread_should_stop()) { + +		/* Send an IPI to all cpus to write data! 
*/ +		smp_call_function(rb_ipi, NULL, 1); +		/* No sleep, but for non preempt, let others run */ +		schedule(); +	} + +	return 0; +} + +static __init int test_ringbuffer(void) +{ +	struct task_struct *rb_hammer; +	struct ring_buffer *buffer; +	int cpu; +	int ret = 0; + +	pr_info("Running ring buffer tests...\n"); + +	buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); +	if (WARN_ON(!buffer)) +		return 0; + +	/* Disable buffer so that threads can't write to it yet */ +	ring_buffer_record_off(buffer); + +	for_each_online_cpu(cpu) { +		rb_data[cpu].buffer = buffer; +		rb_data[cpu].cpu = cpu; +		rb_data[cpu].cnt = cpu; +		rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], +						 "rbtester/%d", cpu); +		if (WARN_ON(!rb_threads[cpu])) { +			pr_cont("FAILED\n"); +			ret = -1; +			goto out_free; +		} + +		kthread_bind(rb_threads[cpu], cpu); + 		wake_up_process(rb_threads[cpu]); +	} + +	/* Now create the rb hammer! */ +	rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); +	if (WARN_ON(!rb_hammer)) { +		pr_cont("FAILED\n"); +		ret = -1; +		goto out_free; +	} + +	ring_buffer_record_on(buffer); +	/* +	 * Show buffer is enabled before setting rb_test_started. +	 * Yes there's a small race window where events could be +	 * dropped and the thread wont catch it. But when a ring +	 * buffer gets enabled, there will always be some kind of +	 * delay before other CPUs see it. Thus, we don't care about +	 * those dropped events. We care about events dropped after +	 * the threads see that the buffer is active. +	 */ +	smp_wmb(); +	rb_test_started = true; + +	set_current_state(TASK_INTERRUPTIBLE); +	/* Just run for 10 seconds */; +	schedule_timeout(10 * HZ); + +	kthread_stop(rb_hammer); + + out_free: +	for_each_online_cpu(cpu) { +		if (!rb_threads[cpu]) +			break; +		kthread_stop(rb_threads[cpu]); +	} +	if (ret) { +		ring_buffer_free(buffer); +		return ret; +	} + +	/* Report! 
*/ +	pr_info("finished\n"); +	for_each_online_cpu(cpu) { +		struct ring_buffer_event *event; +		struct rb_test_data *data = &rb_data[cpu]; +		struct rb_item *item; +		unsigned long total_events; +		unsigned long total_dropped; +		unsigned long total_written; +		unsigned long total_alloc; +		unsigned long total_read = 0; +		unsigned long total_size = 0; +		unsigned long total_len = 0; +		unsigned long total_lost = 0; +		unsigned long lost; +		int big_event_size; +		int small_event_size; + +		ret = -1; + +		total_events = data->events + data->events_nested; +		total_written = data->bytes_written + data->bytes_written_nested; +		total_alloc = data->bytes_alloc + data->bytes_alloc_nested; +		total_dropped = data->bytes_dropped + data->bytes_dropped_nested; + +		big_event_size = data->max_size + data->max_size_nested; +		small_event_size = data->min_size + data->min_size_nested; + +		pr_info("CPU %d:\n", cpu); +		pr_info("              events:    %ld\n", total_events); +		pr_info("       dropped bytes:    %ld\n", total_dropped); +		pr_info("       alloced bytes:    %ld\n", total_alloc); +		pr_info("       written bytes:    %ld\n", total_written); +		pr_info("       biggest event:    %d\n", big_event_size); +		pr_info("      smallest event:    %d\n", small_event_size); + +		if (RB_WARN_ON(buffer, total_dropped)) +			break; + +		ret = 0; + +		while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { +			total_lost += lost; +			item = ring_buffer_event_data(event); +			total_len += ring_buffer_event_length(event); +			total_size += item->size + sizeof(struct rb_item); +			if (memcmp(&item->str[0], rb_string, item->size) != 0) { +				pr_info("FAILED!\n"); +				pr_info("buffer had: %.*s\n", item->size, item->str); +				pr_info("expected:   %.*s\n", item->size, rb_string); +				RB_WARN_ON(buffer, 1); +				ret = -1; +				break; +			} +			total_read++; +		} +		if (ret) +			break; + +		ret = -1; + +		pr_info("         read events:   %ld\n", total_read); +		pr_info("         lost events:   %ld\n", total_lost); +		pr_info("        total events:   %ld\n", total_lost + total_read); +		pr_info("  recorded len bytes:   %ld\n", total_len); +		pr_info(" recorded size bytes:   %ld\n", total_size); +		if (total_lost) +			pr_info(" With dropped events, record len and size may not match\n" +				" alloced and written from above\n"); +		if (!total_lost) { +			if (RB_WARN_ON(buffer, total_len != total_alloc || +				       total_size != total_written)) +				break; +		} +		if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) +			break; + +		ret = 0; +	} +	if (!ret) +		pr_info("Ring buffer PASSED!\n"); + +	ring_buffer_free(buffer); +	return 0; +} + +late_initcall(test_ringbuffer); +#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c2e2c231037..ae6fa2d1cdf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,7 +1,7 @@  /*   * ring buffer based function tracer   * - * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>   * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>   *   * Originally taken from the RT patch by: @@ -19,7 +19,6 @@  #include <linux/seq_file.h>  #include <linux/notifier.h>  #include <linux/irqflags.h> -#include <linux/irq_work.h>  #include <linux/debugfs.h>  #include <linux/pagemap.h>  #include <linux/hardirq.h> @@ -48,7 +47,7 @@   * On boot up, the ring buffer is set to the minimum size, so that   * we do not waste memory on systems that 
are not using tracing.   */ -int ring_buffer_expanded; +bool ring_buffer_expanded;  /*   * We need to change this state when a selftest is running. @@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)  static DEFINE_PER_CPU(bool, trace_cmdline_save);  /* - * When a reader is waiting for data, then this variable is - * set to true. - */ -static bool trace_wakeup_needed; - -static struct irq_work trace_work_wakeup; - -/*   * Kill all tracing for good (never come back).   * It is initialized to 1 but will turn to zero if the initialization   * of the tracer is successful. But that is the only place that sets @@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf);  static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;  static char *default_bootup_tracer; +static bool allocate_snapshot; +  static int __init set_cmdline_ftrace(char *str)  { -	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); +	strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);  	default_bootup_tracer = bootup_tracer_buf;  	/* We are using ftrace early, expand it */ -	ring_buffer_expanded = 1; +	ring_buffer_expanded = true;  	return 1;  }  __setup("ftrace=", set_cmdline_ftrace); @@ -156,13 +149,22 @@ static int __init set_ftrace_dump_on_oops(char *str)  }  __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init boot_alloc_snapshot(char *str) +{ +	allocate_snapshot = true; +	/* We also need the main ring buffer expanded */ +	ring_buffer_expanded = true; +	return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); +  static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;  static char *trace_boot_options __initdata;  static int __init set_trace_boot_options(char *str)  { -	strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); +	strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);  	trace_boot_options = trace_boot_options_buf;  	return 0;  } @@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec)   */  static struct trace_array	global_trace; -static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +LIST_HEAD(ftrace_trace_arrays);  int filter_current_check_discard(struct ring_buffer *buffer,  				 struct ftrace_event_call *call, void *rec, @@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu)  	u64 ts;  	/* Early boot up does not have a buffer yet */ -	if (!global_trace.buffer) +	if (!global_trace.trace_buffer.buffer)  		return trace_clock_local(); -	ts = ring_buffer_time_stamp(global_trace.buffer, cpu); -	ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); +	ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); +	ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);  	return ts;  } -/* - * The max_tr is used to snapshot the global_trace when a maximum - * latency is reached. Some tracers will use this to store a maximum - * trace while it continues examining live traces. - * - * The buffers for the max_tr are set up the same as the global_trace. - * When a snapshot is taken, the link list of the max_tr is swapped - * with the link list of the global_trace and the buffers are reset for - * the global_trace so the tracing can continue. - */ -static struct trace_array	max_tr; - -static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); -  int tracing_is_enabled(void)  {  	return tracing_is_on(); @@ -249,9 +237,6 @@ static unsigned long		trace_buf_size = TRACE_BUF_SIZE_DEFAULT;  /* trace_types holds a link list of available tracers. 
*/  static struct tracer		*trace_types __read_mostly; -/* current_trace points to the tracer that is currently active */ -static struct tracer		*current_trace __read_mostly = &nop_trace; -  /*   * trace_types_lock is used to protect the trace_types list.   */ @@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);  static inline void trace_access_lock(int cpu)  { -	if (cpu == TRACE_PIPE_ALL_CPU) { +	if (cpu == RING_BUFFER_ALL_CPUS) {  		/* gain it for accessing the whole ring buffer. */  		down_write(&all_cpu_access_lock);  	} else {  		/* gain it for accessing a cpu ring buffer. */ -		/* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ +		/* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */  		down_read(&all_cpu_access_lock);  		/* Secondly block other access to this @cpu ring buffer. */ @@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu)  static inline void trace_access_unlock(int cpu)  { -	if (cpu == TRACE_PIPE_ALL_CPU) { +	if (cpu == RING_BUFFER_ALL_CPUS) {  		up_write(&all_cpu_access_lock);  	} else {  		mutex_unlock(&per_cpu(cpu_access_lock, cpu)); @@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void)  #endif -/* trace_wait is a waitqueue for tasks blocked on trace_poll */ -static DECLARE_WAIT_QUEUE_HEAD(trace_wait); -  /* trace_flags holds trace_options default values */  unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |  	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |  	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | -	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; - -static int trace_stop_count; -static DEFINE_RAW_SPINLOCK(tracing_start_lock); - -/** - * trace_wake_up - wake up tasks waiting for trace input - * - * Schedules a delayed work to wake up any task that is blocked on the - * trace_wait queue. These is used with trace_poll for tasks polling the - * trace. - */ -static void trace_wake_up(struct irq_work *work) -{ -	wake_up_all(&trace_wait); - -} +	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;  /**   * tracing_on - enable tracing buffers @@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work)   */  void tracing_on(void)  { -	if (global_trace.buffer) -		ring_buffer_record_on(global_trace.buffer); +	if (global_trace.trace_buffer.buffer) +		ring_buffer_record_on(global_trace.trace_buffer.buffer);  	/*  	 * This flag is only looked at when buffers haven't been  	 * allocated yet. We don't really care about the race @@ -385,6 +351,196 @@ void tracing_on(void)  EXPORT_SYMBOL_GPL(tracing_on);  /** + * __trace_puts - write a constant string into the trace buffer. + * @ip:	   The address of the caller + * @str:   The constant string to write + * @size:  The size of the string. 
+ */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ +	struct ring_buffer_event *event; +	struct ring_buffer *buffer; +	struct print_entry *entry; +	unsigned long irq_flags; +	int alloc; + +	alloc = sizeof(*entry) + size + 2; /* possible \n added */ + +	local_save_flags(irq_flags); +	buffer = global_trace.trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,  +					  irq_flags, preempt_count()); +	if (!event) +		return 0; + +	entry = ring_buffer_event_data(event); +	entry->ip = ip; + +	memcpy(&entry->buf, str, size); + +	/* Add a newline if necessary */ +	if (entry->buf[size - 1] != '\n') { +		entry->buf[size] = '\n'; +		entry->buf[size + 1] = '\0'; +	} else +		entry->buf[size] = '\0'; + +	__buffer_unlock_commit(buffer, event); + +	return size; +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip:	   The address of the caller + * @str:   The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ +	struct ring_buffer_event *event; +	struct ring_buffer *buffer; +	struct bputs_entry *entry; +	unsigned long irq_flags; +	int size = sizeof(struct bputs_entry); + +	local_save_flags(irq_flags); +	buffer = global_trace.trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, +					  irq_flags, preempt_count()); +	if (!event) +		return 0; + +	entry = ring_buffer_event_data(event); +	entry->ip			= ip; +	entry->str			= str; + +	__buffer_unlock_commit(buffer, event); + +	return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + +#ifdef CONFIG_TRACER_SNAPSHOT +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ +	struct trace_array *tr = &global_trace; +	struct tracer *tracer = tr->current_trace; +	unsigned long flags; + +	if (in_nmi()) { +		internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); +		internal_trace_puts("*** snapshot is being ignored        ***\n"); +		return; +	} + +	if (!tr->allocated_snapshot) { +		internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); +		internal_trace_puts("*** stopping trace here!   
***\n"); +		tracing_off(); +		return; +	} + +	/* Note, snapshot can not be used when the tracer uses it */ +	if (tracer->use_max_tr) { +		internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); +		internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); +		return; +	} + +	local_irq_save(flags); +	update_max_tr(tr, current, smp_processor_id()); +	local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); + +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, +					struct trace_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); + +static int alloc_snapshot(struct trace_array *tr) +{ +	int ret; + +	if (!tr->allocated_snapshot) { + +		/* allocate spare buffer */ +		ret = resize_buffer_duplicate_size(&tr->max_buffer, +				   &tr->trace_buffer, RING_BUFFER_ALL_CPUS); +		if (ret < 0) +			return ret; + +		tr->allocated_snapshot = true; +	} + +	return 0; +} + +void free_snapshot(struct trace_array *tr) +{ +	/* +	 * We don't free the ring buffer. instead, resize it because +	 * The max_tr ring buffer has some state (e.g. ring->clock) and +	 * we want preserve it. +	 */ +	ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); +	set_buffer_entries(&tr->max_buffer, 1); +	tracing_reset_online_cpus(&tr->max_buffer); +	tr->allocated_snapshot = false; +} + +/** + * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to trace_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + */ +void tracing_snapshot_alloc(void) +{ +	struct trace_array *tr = &global_trace; +	int ret; + +	ret = alloc_snapshot(tr); +	if (WARN_ON(ret < 0)) +		return; + +	tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#else +void tracing_snapshot(void) +{ +	WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); +void tracing_snapshot_alloc(void) +{ +	/* Give warning */ +	tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#endif /* CONFIG_TRACER_SNAPSHOT */ + +/**   * tracing_off - turn off tracing buffers   *   * This function stops the tracing buffers from recording data. @@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on);   */  void tracing_off(void)  { -	if (global_trace.buffer) -		ring_buffer_record_off(global_trace.buffer); +	if (global_trace.trace_buffer.buffer) +		ring_buffer_record_off(global_trace.trace_buffer.buffer);  	/*  	 * This flag is only looked at when buffers haven't been  	 * allocated yet. 
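Returning to the snapshot entry points defined just above: a sketch (not from the patch) of how a caller might use them. tracing_snapshot_alloc() may sleep, so it belongs where sleeping is safe; once the spare buffer exists, tracing_snapshot() can freeze a copy of the live trace from a hot or atomic path.

/* Illustrative only; both functions are exported by the hunks above. */
#include <linux/kernel.h>
#include <linux/init.h>

static int __init demo_snapshot_setup(void)
{
	tracing_snapshot_alloc();	/* allocates the spare buffer, then snapshots */
	return 0;
}

static void demo_hit_rare_condition(void)
{
	/* safe once the spare buffer is allocated; swaps live and snapshot buffers */
	tracing_snapshot();
}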
We don't really care about the race @@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off);   */  int tracing_is_on(void)  { -	if (global_trace.buffer) -		return ring_buffer_record_is_on(global_trace.buffer); +	if (global_trace.trace_buffer.buffer) +		return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);  	return !global_trace.buffer_disabled;  }  EXPORT_SYMBOL_GPL(tracing_is_on); @@ -479,6 +635,7 @@ static const char *trace_options[] = {  	"disable_on_free",  	"irq-info",  	"markers", +	"function-trace",  	NULL  }; @@ -490,6 +647,8 @@ static struct {  	{ trace_clock_local,	"local",	1 },  	{ trace_clock_global,	"global",	1 },  	{ trace_clock_counter,	"counter",	0 }, +	{ trace_clock_jiffies,	"uptime",	1 }, +	{ trace_clock,		"perf",		1 },  	ARCH_TRACE_CLOCKS  }; @@ -670,13 +829,14 @@ unsigned long __read_mostly	tracing_max_latency;  static void  __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  { -	struct trace_array_cpu *data = tr->data[cpu]; -	struct trace_array_cpu *max_data; +	struct trace_buffer *trace_buf = &tr->trace_buffer; +	struct trace_buffer *max_buf = &tr->max_buffer; +	struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); +	struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); -	max_tr.cpu = cpu; -	max_tr.time_start = data->preempt_timestamp; +	max_buf->cpu = cpu; +	max_buf->time_start = data->preempt_timestamp; -	max_data = max_tr.data[cpu];  	max_data->saved_latency = tracing_max_latency;  	max_data->critical_start = data->critical_start;  	max_data->critical_end = data->critical_end; @@ -704,23 +864,24 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  void  update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  { -	struct ring_buffer *buf = tr->buffer; +	struct ring_buffer *buf; -	if (trace_stop_count) +	if (tr->stop_count)  		return;  	WARN_ON_ONCE(!irqs_disabled()); -	if (!current_trace->allocated_snapshot) { +	if (!tr->allocated_snapshot) {  		/* Only the nop tracer should hit this when disabling */ -		WARN_ON_ONCE(current_trace != &nop_trace); +		WARN_ON_ONCE(tr->current_trace != &nop_trace);  		return;  	}  	arch_spin_lock(&ftrace_max_lock); -	tr->buffer = max_tr.buffer; -	max_tr.buffer = buf; +	buf = tr->trace_buffer.buffer; +	tr->trace_buffer.buffer = tr->max_buffer.buffer; +	tr->max_buffer.buffer = buf;  	__update_max_tr(tr, tsk, cpu);  	arch_spin_unlock(&ftrace_max_lock); @@ -739,16 +900,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  {  	int ret; -	if (trace_stop_count) +	if (tr->stop_count)  		return;  	WARN_ON_ONCE(!irqs_disabled()); -	if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) +	if (!tr->allocated_snapshot) { +		/* Only the nop tracer should hit this when disabling */ +		WARN_ON_ONCE(tr->current_trace != &nop_trace);  		return; +	}  	arch_spin_lock(&ftrace_max_lock); -	ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); +	ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);  	if (ret == -EBUSY) {  		/* @@ -757,7 +921,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  		 * the max trace buffer (no one writes directly to it)  		 * and flag that it failed.  		 
*/ -		trace_array_printk(&max_tr, _THIS_IP_, +		trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,  			"Failed to swap buffers due to commit in progress\n");  	} @@ -770,37 +934,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  static void default_wait_pipe(struct trace_iterator *iter)  { -	DEFINE_WAIT(wait); +	/* Iterators are static, they should be filled or empty */ +	if (trace_buffer_iter(iter, iter->cpu_file)) +		return; -	prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); +	ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); +} + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int run_tracer_selftest(struct tracer *type) +{ +	struct trace_array *tr = &global_trace; +	struct tracer *saved_tracer = tr->current_trace; +	int ret; + +	if (!type->selftest || tracing_selftest_disabled) +		return 0;  	/* -	 * The events can happen in critical sections where -	 * checking a work queue can cause deadlocks. -	 * After adding a task to the queue, this flag is set -	 * only to notify events to try to wake up the queue -	 * using irq_work. -	 * -	 * We don't clear it even if the buffer is no longer -	 * empty. The flag only causes the next event to run -	 * irq_work to do the work queue wake up. The worse -	 * that can happen if we race with !trace_empty() is that -	 * an event will cause an irq_work to try to wake up -	 * an empty queue. -	 * -	 * There's no reason to protect this flag either, as -	 * the work queue and irq_work logic will do the necessary -	 * synchronization for the wake ups. The only thing -	 * that is necessary is that the wake up happens after -	 * a task has been queued. It's OK for spurious wake ups. +	 * Run a selftest on this tracer. +	 * Here we reset the trace buffer, and set the current +	 * tracer to be this tracer. The tracer can then run some +	 * internal tracing to verify that everything is in order. +	 * If we fail, we do not register this tracer.  	 */ -	trace_wakeup_needed = true; +	tracing_reset_online_cpus(&tr->trace_buffer); -	if (trace_empty(iter)) -		schedule(); +	tr->current_trace = type; + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (type->use_max_tr) { +		/* If we expanded the buffers, make sure the max is expanded too */ +		if (ring_buffer_expanded) +			ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, +					   RING_BUFFER_ALL_CPUS); +		tr->allocated_snapshot = true; +	} +#endif -	finish_wait(&trace_wait, &wait); +	/* the test is responsible for initializing and enabling */ +	pr_info("Testing tracer %s: ", type->name); +	ret = type->selftest(type, tr); +	/* the test is responsible for resetting too */ +	tr->current_trace = saved_tracer; +	if (ret) { +		printk(KERN_CONT "FAILED!\n"); +		/* Add the warning after printing 'FAILED' */ +		WARN_ON(1); +		return -1; +	} +	/* Only reset on passing, to avoid touching corrupted buffers */ +	tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (type->use_max_tr) { +		tr->allocated_snapshot = false; + +		/* Shrink the max buffer again */ +		if (ring_buffer_expanded) +			ring_buffer_resize(tr->max_buffer.buffer, 1, +					   RING_BUFFER_ALL_CPUS); +	} +#endif + +	printk(KERN_CONT "PASSED\n"); +	return 0;  } +#else +static inline int run_tracer_selftest(struct tracer *type) +{ +	return 0; +} +#endif /* CONFIG_FTRACE_STARTUP_TEST */  /**   * register_tracer - register a tracer with the ftrace system. 
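run_tracer_selftest() above boils down to a save/substitute/restore dance around the candidate tracer; stripped of the buffer-resize and max-buffer handling it is roughly the following sketch (the demo_ prefix is not in the tree):

static int demo_run_selftest(struct trace_array *tr, struct tracer *type)
{
	struct tracer *saved = tr->current_trace;
	int ret;

	tracing_reset_online_cpus(&tr->trace_buffer);	/* start from a clean buffer */
	tr->current_trace = type;			/* let the candidate tracer run */
	ret = type->selftest(type, tr);			/* tracer checks its own output */
	tr->current_trace = saved;			/* restore whether it passed or not */
	return ret;
}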
@@ -847,57 +1052,9 @@ int register_tracer(struct tracer *type)  	if (!type->wait_pipe)  		type->wait_pipe = default_wait_pipe; - -#ifdef CONFIG_FTRACE_STARTUP_TEST -	if (type->selftest && !tracing_selftest_disabled) { -		struct tracer *saved_tracer = current_trace; -		struct trace_array *tr = &global_trace; - -		/* -		 * Run a selftest on this tracer. -		 * Here we reset the trace buffer, and set the current -		 * tracer to be this tracer. The tracer can then run some -		 * internal tracing to verify that everything is in order. -		 * If we fail, we do not register this tracer. -		 */ -		tracing_reset_online_cpus(tr); - -		current_trace = type; - -		if (type->use_max_tr) { -			/* If we expanded the buffers, make sure the max is expanded too */ -			if (ring_buffer_expanded) -				ring_buffer_resize(max_tr.buffer, trace_buf_size, -						   RING_BUFFER_ALL_CPUS); -			type->allocated_snapshot = true; -		} - -		/* the test is responsible for initializing and enabling */ -		pr_info("Testing tracer %s: ", type->name); -		ret = type->selftest(type, tr); -		/* the test is responsible for resetting too */ -		current_trace = saved_tracer; -		if (ret) { -			printk(KERN_CONT "FAILED!\n"); -			/* Add the warning after printing 'FAILED' */ -			WARN_ON(1); -			goto out; -		} -		/* Only reset on passing, to avoid touching corrupted buffers */ -		tracing_reset_online_cpus(tr); - -		if (type->use_max_tr) { -			type->allocated_snapshot = false; - -			/* Shrink the max buffer again */ -			if (ring_buffer_expanded) -				ring_buffer_resize(max_tr.buffer, 1, -						   RING_BUFFER_ALL_CPUS); -		} - -		printk(KERN_CONT "PASSED\n"); -	} -#endif +	ret = run_tracer_selftest(type); +	if (ret < 0) +		goto out;  	type->next = trace_types;  	trace_types = type; @@ -917,7 +1074,7 @@ int register_tracer(struct tracer *type)  	tracing_set_tracer(type->name);  	default_bootup_tracer = NULL;  	/* disable other selftests, since this will break it. 
*/ -	tracing_selftest_disabled = 1; +	tracing_selftest_disabled = true;  #ifdef CONFIG_FTRACE_STARTUP_TEST  	printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",  	       type->name); @@ -927,9 +1084,9 @@ int register_tracer(struct tracer *type)  	return ret;  } -void tracing_reset(struct trace_array *tr, int cpu) +void tracing_reset(struct trace_buffer *buf, int cpu)  { -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = buf->buffer;  	if (!buffer)  		return; @@ -943,9 +1100,9 @@ void tracing_reset(struct trace_array *tr, int cpu)  	ring_buffer_record_enable(buffer);  } -void tracing_reset_online_cpus(struct trace_array *tr) +void tracing_reset_online_cpus(struct trace_buffer *buf)  { -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = buf->buffer;  	int cpu;  	if (!buffer) @@ -956,7 +1113,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)  	/* Make sure all commits have finished */  	synchronize_sched(); -	tr->time_start = ftrace_now(tr->cpu); +	buf->time_start = ftrace_now(buf->cpu);  	for_each_online_cpu(cpu)  		ring_buffer_reset_cpu(buffer, cpu); @@ -966,12 +1123,21 @@ void tracing_reset_online_cpus(struct trace_array *tr)  void tracing_reset_current(int cpu)  { -	tracing_reset(&global_trace, cpu); +	tracing_reset(&global_trace.trace_buffer, cpu);  } -void tracing_reset_current_online_cpus(void) +void tracing_reset_all_online_cpus(void)  { -	tracing_reset_online_cpus(&global_trace); +	struct trace_array *tr; + +	mutex_lock(&trace_types_lock); +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE +		tracing_reset_online_cpus(&tr->max_buffer); +#endif +	} +	mutex_unlock(&trace_types_lock);  }  #define SAVED_CMDLINES 128 @@ -994,7 +1160,7 @@ static void trace_init_cmdlines(void)  int is_tracing_stopped(void)  { -	return trace_stop_count; +	return global_trace.stop_count;  }  /** @@ -1026,12 +1192,12 @@ void tracing_start(void)  	if (tracing_disabled)  		return; -	raw_spin_lock_irqsave(&tracing_start_lock, flags); -	if (--trace_stop_count) { -		if (trace_stop_count < 0) { +	raw_spin_lock_irqsave(&global_trace.start_lock, flags); +	if (--global_trace.stop_count) { +		if (global_trace.stop_count < 0) {  			/* Someone screwed up their debugging */  			WARN_ON_ONCE(1); -			trace_stop_count = 0; +			global_trace.stop_count = 0;  		}  		goto out;  	} @@ -1039,19 +1205,52 @@ void tracing_start(void)  	/* Prevent the buffers from switching */  	arch_spin_lock(&ftrace_max_lock); -	buffer = global_trace.buffer; +	buffer = global_trace.trace_buffer.buffer;  	if (buffer)  		ring_buffer_record_enable(buffer); -	buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	buffer = global_trace.max_buffer.buffer;  	if (buffer)  		ring_buffer_record_enable(buffer); +#endif  	arch_spin_unlock(&ftrace_max_lock);  	ftrace_start();   out: -	raw_spin_unlock_irqrestore(&tracing_start_lock, flags); +	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_start_tr(struct trace_array *tr) +{ +	struct ring_buffer *buffer; +	unsigned long flags; + +	if (tracing_disabled) +		return; + +	/* If global, we need to also start the max tracer */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		return tracing_start(); + +	raw_spin_lock_irqsave(&tr->start_lock, flags); + +	if (--tr->stop_count) { +		if (tr->stop_count < 0) { +			/* Someone screwed up their debugging */ +			WARN_ON_ONCE(1); +			tr->stop_count = 0; +		} +		goto out; +	} + +	
buffer = tr->trace_buffer.buffer; +	if (buffer) +		ring_buffer_record_enable(buffer); + + out: +	raw_spin_unlock_irqrestore(&tr->start_lock, flags);  }  /** @@ -1066,25 +1265,48 @@ void tracing_stop(void)  	unsigned long flags;  	ftrace_stop(); -	raw_spin_lock_irqsave(&tracing_start_lock, flags); -	if (trace_stop_count++) +	raw_spin_lock_irqsave(&global_trace.start_lock, flags); +	if (global_trace.stop_count++)  		goto out;  	/* Prevent the buffers from switching */  	arch_spin_lock(&ftrace_max_lock); -	buffer = global_trace.buffer; +	buffer = global_trace.trace_buffer.buffer;  	if (buffer)  		ring_buffer_record_disable(buffer); -	buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	buffer = global_trace.max_buffer.buffer;  	if (buffer)  		ring_buffer_record_disable(buffer); +#endif  	arch_spin_unlock(&ftrace_max_lock);   out: -	raw_spin_unlock_irqrestore(&tracing_start_lock, flags); +	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_stop_tr(struct trace_array *tr) +{ +	struct ring_buffer *buffer; +	unsigned long flags; + +	/* If global, we need to also stop the max tracer */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		return tracing_stop(); + +	raw_spin_lock_irqsave(&tr->start_lock, flags); +	if (tr->stop_count++) +		goto out; + +	buffer = tr->trace_buffer.buffer; +	if (buffer) +		ring_buffer_record_disable(buffer); + + out: +	raw_spin_unlock_irqrestore(&tr->start_lock, flags);  }  void trace_stop_cmdline_recording(void); @@ -1217,11 +1439,6 @@ void  __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)  {  	__this_cpu_write(trace_cmdline_save, true); -	if (trace_wakeup_needed) { -		trace_wakeup_needed = false; -		/* irq_work_queue() supplies it's own memory barriers */ -		irq_work_queue(&trace_work_wakeup); -	}  	ring_buffer_unlock_commit(buffer, event);  } @@ -1245,11 +1462,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,  EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);  struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, +			  struct ftrace_event_file *ftrace_file, +			  int type, unsigned long len, +			  unsigned long flags, int pc) +{ +	*current_rb = ftrace_file->tr->trace_buffer.buffer; +	return trace_buffer_lock_reserve(*current_rb, +					 type, len, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); + +struct ring_buffer_event *  trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,  				  int type, unsigned long len,  				  unsigned long flags, int pc)  { -	*current_rb = global_trace.buffer; +	*current_rb = global_trace.trace_buffer.buffer;  	return trace_buffer_lock_reserve(*current_rb,  					 type, len, flags, pc);  } @@ -1288,7 +1517,7 @@ trace_function(struct trace_array *tr,  	       int pc)  {  	struct ftrace_event_call *call = &event_function; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct ftrace_entry *entry; @@ -1429,13 +1658,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,  void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,  		   int pc)  { -	__ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); +	__ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);  }  /**   * trace_dump_stack - record a stack back trace in the trace buffer + * @skip: Number of functions to skip (helper handlers)   */ -void trace_dump_stack(void) +void trace_dump_stack(int skip)  
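tracing_stop()/tracing_start() and the new per-array *_tr variants above nest through stop_count, so only the outermost start actually re-enables recording. An illustrative pairing, assuming the usual linux/kernel.h declarations (the counts in the comments are just for orientation):

void demo_nested_stop(void)
{
	tracing_stop();		/* stop_count 0 -> 1: recording disabled */
	tracing_stop();		/* stop_count 1 -> 2: still disabled */
	tracing_start();	/* stop_count 2 -> 1: still disabled */
	tracing_start();	/* stop_count 1 -> 0: recording re-enabled */
}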
{  	unsigned long flags; @@ -1444,8 +1674,13 @@ void trace_dump_stack(void)  	local_save_flags(flags); -	/* skipping 3 traces, seems to get us at the caller of this function */ -	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); +	/* +	 * Skip 3 more, seems to get us at the caller of +	 * this function. +	 */ +	skip += 3; +	__ftrace_trace_stack(global_trace.trace_buffer.buffer, +			     flags, skip, preempt_count(), NULL);  }  static DEFINE_PER_CPU(int, user_stack_count); @@ -1615,7 +1850,7 @@ void trace_printk_init_buffers(void)  	 * directly here. If the global_trace.buffer is already  	 * allocated here, then this was called by module code.  	 */ -	if (global_trace.buffer) +	if (global_trace.trace_buffer.buffer)  		tracing_start_cmdline_record();  } @@ -1675,7 +1910,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  	local_save_flags(flags);  	size = sizeof(*entry) + sizeof(u32) * len; -	buffer = tr->buffer; +	buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,  					  flags, pc);  	if (!event) @@ -1698,27 +1933,12 @@ out:  }  EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_array_printk(struct trace_array *tr, -		       unsigned long ip, const char *fmt, ...) -{ -	int ret; -	va_list ap; - -	if (!(trace_flags & TRACE_ITER_PRINTK)) -		return 0; - -	va_start(ap, fmt); -	ret = trace_array_vprintk(tr, ip, fmt, ap); -	va_end(ap); -	return ret; -} - -int trace_array_vprintk(struct trace_array *tr, -			unsigned long ip, const char *fmt, va_list args) +static int +__trace_array_vprintk(struct ring_buffer *buffer, +		      unsigned long ip, const char *fmt, va_list args)  {  	struct ftrace_event_call *call = &event_print;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer;  	int len = 0, size, pc;  	struct print_entry *entry;  	unsigned long flags; @@ -1746,7 +1966,6 @@ int trace_array_vprintk(struct trace_array *tr,  	local_save_flags(flags);  	size = sizeof(*entry) + len + 1; -	buffer = tr->buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,  					  flags, pc);  	if (!event) @@ -1767,6 +1986,42 @@ int trace_array_vprintk(struct trace_array *tr,  	return len;  } +int trace_array_vprintk(struct trace_array *tr, +			unsigned long ip, const char *fmt, va_list args) +{ +	return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); +} + +int trace_array_printk(struct trace_array *tr, +		       unsigned long ip, const char *fmt, ...) +{ +	int ret; +	va_list ap; + +	if (!(trace_flags & TRACE_ITER_PRINTK)) +		return 0; + +	va_start(ap, fmt); +	ret = trace_array_vprintk(tr, ip, fmt, ap); +	va_end(ap); +	return ret; +} + +int trace_array_printk_buf(struct ring_buffer *buffer, +			   unsigned long ip, const char *fmt, ...) 
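Since trace_dump_stack() now takes a skip count, callers can hide wrapper frames from the recorded backtrace. A hypothetical pair of callers; the exact top frame depends on the internal skip of 3 noted above, so the comments are approximate:

void demo_report(void)
{
	trace_dump_stack(0);	/* backtrace starts roughly at this call site */
}

void demo_report_via_helper(void)
{
	trace_dump_stack(1);	/* drop one extra frame, e.g. a wrapper like this one */
}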
+{ +	int ret; +	va_list ap; + +	if (!(trace_flags & TRACE_ITER_PRINTK)) +		return 0; + +	va_start(ap, fmt); +	ret = __trace_array_vprintk(buffer, ip, fmt, ap); +	va_end(ap); +	return ret; +} +  int trace_vprintk(unsigned long ip, const char *fmt, va_list args)  {  	return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -1792,7 +2047,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,  	if (buf_iter)  		event = ring_buffer_iter_peek(buf_iter, ts);  	else -		event = ring_buffer_peek(iter->tr->buffer, cpu, ts, +		event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,  					 lost_events);  	if (event) { @@ -1807,7 +2062,7 @@ static struct trace_entry *  __find_next_entry(struct trace_iterator *iter, int *ent_cpu,  		  unsigned long *missing_events, u64 *ent_ts)  { -	struct ring_buffer *buffer = iter->tr->buffer; +	struct ring_buffer *buffer = iter->trace_buffer->buffer;  	struct trace_entry *ent, *next = NULL;  	unsigned long lost_events = 0, next_lost = 0;  	int cpu_file = iter->cpu_file; @@ -1820,7 +2075,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,  	 * If we are in a per_cpu trace file, don't bother by iterating over  	 * all cpu and peek directly.  	 */ -	if (cpu_file > TRACE_PIPE_ALL_CPU) { +	if (cpu_file > RING_BUFFER_ALL_CPUS) {  		if (ring_buffer_empty_cpu(buffer, cpu_file))  			return NULL;  		ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); @@ -1884,7 +2139,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)  static void trace_consume(struct trace_iterator *iter)  { -	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, +	ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,  			    &iter->lost_events);  } @@ -1917,13 +2172,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)  void tracing_iter_reset(struct trace_iterator *iter, int cpu)  { -	struct trace_array *tr = iter->tr;  	struct ring_buffer_event *event;  	struct ring_buffer_iter *buf_iter;  	unsigned long entries = 0;  	u64 ts; -	tr->data[cpu]->skipped_entries = 0; +	per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;  	buf_iter = trace_buffer_iter(iter, cpu);  	if (!buf_iter) @@ -1937,13 +2191,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)  	 * by the timestamp being before the start of the buffer.  	 */  	while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { -		if (ts >= iter->tr->time_start) +		if (ts >= iter->trace_buffer->time_start)  			break;  		entries++;  		ring_buffer_read(buf_iter, NULL);  	} -	tr->data[cpu]->skipped_entries = entries; +	per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;  }  /* @@ -1953,6 +2207,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)  static void *s_start(struct seq_file *m, loff_t *pos)  {  	struct trace_iterator *iter = m->private; +	struct trace_array *tr = iter->tr;  	int cpu_file = iter->cpu_file;  	void *p = NULL;  	loff_t l = 0; @@ -1965,12 +2220,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)  	 * will point to the same string as current_trace->name.  	 
*/  	mutex_lock(&trace_types_lock); -	if (unlikely(current_trace && iter->trace->name != current_trace->name)) -		*iter->trace = *current_trace; +	if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock); +#ifdef CONFIG_TRACER_MAX_TRACE  	if (iter->snapshot && iter->trace->use_max_tr)  		return ERR_PTR(-EBUSY); +#endif  	if (!iter->snapshot)  		atomic_inc(&trace_record_cmdline_disabled); @@ -1980,7 +2237,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)  		iter->cpu = 0;  		iter->idx = -1; -		if (cpu_file == TRACE_PIPE_ALL_CPU) { +		if (cpu_file == RING_BUFFER_ALL_CPUS) {  			for_each_tracing_cpu(cpu)  				tracing_iter_reset(iter, cpu);  		} else @@ -2012,17 +2269,21 @@ static void s_stop(struct seq_file *m, void *p)  {  	struct trace_iterator *iter = m->private; +#ifdef CONFIG_TRACER_MAX_TRACE  	if (iter->snapshot && iter->trace->use_max_tr)  		return; +#endif  	if (!iter->snapshot)  		atomic_dec(&trace_record_cmdline_disabled); +  	trace_access_unlock(iter->cpu_file);  	trace_event_read_unlock();  }  static void -get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +get_total_entries(struct trace_buffer *buf, +		  unsigned long *total, unsigned long *entries)  {  	unsigned long count;  	int cpu; @@ -2031,19 +2292,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e  	*entries = 0;  	for_each_tracing_cpu(cpu) { -		count = ring_buffer_entries_cpu(tr->buffer, cpu); +		count = ring_buffer_entries_cpu(buf->buffer, cpu);  		/*  		 * If this buffer has skipped entries, then we hold all  		 * entries for the trace and we need to ignore the  		 * ones before the time stamp.  		 */ -		if (tr->data[cpu]->skipped_entries) { -			count -= tr->data[cpu]->skipped_entries; +		if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { +			count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;  			/* total is the same as the entries */  			*total += count;  		} else  			*total += count + -				ring_buffer_overrun_cpu(tr->buffer, cpu); +				ring_buffer_overrun_cpu(buf->buffer, cpu);  		*entries += count;  	}  } @@ -2060,27 +2321,27 @@ static void print_lat_help_header(struct seq_file *m)  	seq_puts(m, "#     \\   /      |||||  \\    |   /           \n");  } -static void print_event_info(struct trace_array *tr, struct seq_file *m) +static void print_event_info(struct trace_buffer *buf, struct seq_file *m)  {  	unsigned long total;  	unsigned long entries; -	get_total_entries(tr, &total, &entries); +	get_total_entries(buf, &total, &entries);  	seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu   #P:%d\n",  		   entries, total, num_online_cpus());  	seq_puts(m, "#\n");  } -static void print_func_help_header(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)  { -	print_event_info(tr, m); +	print_event_info(buf, m);  	seq_puts(m, "#           TASK-PID   CPU#      TIMESTAMP  FUNCTION\n");  	seq_puts(m, "#              | |       |          |         |\n");  } -static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)  { -	print_event_info(tr, m); +	print_event_info(buf, m);  	seq_puts(m, "#                              _-----=> irqs-off\n");  	seq_puts(m, "#                             / _----=> need-resched\n");  	seq_puts(m, "#                            | / 
_---=> hardirq/softirq\n"); @@ -2094,16 +2355,16 @@ void  print_trace_header(struct seq_file *m, struct trace_iterator *iter)  {  	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); -	struct trace_array *tr = iter->tr; -	struct trace_array_cpu *data = tr->data[tr->cpu]; -	struct tracer *type = current_trace; +	struct trace_buffer *buf = iter->trace_buffer; +	struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); +	struct tracer *type = iter->trace;  	unsigned long entries;  	unsigned long total;  	const char *name = "preemption";  	name = type->name; -	get_total_entries(tr, &total, &entries); +	get_total_entries(buf, &total, &entries);  	seq_printf(m, "# %s latency trace v1.1.5 on %s\n",  		   name, UTS_RELEASE); @@ -2114,7 +2375,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)  		   nsecs_to_usecs(data->saved_latency),  		   entries,  		   total, -		   tr->cpu, +		   buf->cpu,  #if defined(CONFIG_PREEMPT_NONE)  		   "server",  #elif defined(CONFIG_PREEMPT_VOLUNTARY) @@ -2165,7 +2426,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)  	if (cpumask_test_cpu(iter->cpu, iter->started))  		return; -	if (iter->tr->data[iter->cpu]->skipped_entries) +	if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)  		return;  	cpumask_set_cpu(iter->cpu, iter->started); @@ -2288,14 +2549,14 @@ int trace_empty(struct trace_iterator *iter)  	int cpu;  	/* If we are looking at one CPU buffer, only check that one */ -	if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { +	if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {  		cpu = iter->cpu_file;  		buf_iter = trace_buffer_iter(iter, cpu);  		if (buf_iter) {  			if (!ring_buffer_iter_empty(buf_iter))  				return 0;  		} else { -			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) +			if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))  				return 0;  		}  		return 1; @@ -2307,7 +2568,7 @@ int trace_empty(struct trace_iterator *iter)  			if (!ring_buffer_iter_empty(buf_iter))  				return 0;  		} else { -			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) +			if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))  				return 0;  		}  	} @@ -2331,6 +2592,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)  			return ret;  	} +	if (iter->ent->type == TRACE_BPUTS && +			trace_flags & TRACE_ITER_PRINTK && +			trace_flags & TRACE_ITER_PRINTK_MSGONLY) +		return trace_print_bputs_msg_only(iter); +  	if (iter->ent->type == TRACE_BPRINT &&  			trace_flags & TRACE_ITER_PRINTK &&  			trace_flags & TRACE_ITER_PRINTK_MSGONLY) @@ -2385,9 +2651,9 @@ void trace_default_header(struct seq_file *m)  	} else {  		if (!(trace_flags & TRACE_ITER_VERBOSE)) {  			if (trace_flags & TRACE_ITER_IRQ_INFO) -				print_func_help_header_irq(iter->tr, m); +				print_func_help_header_irq(iter->trace_buffer, m);  			else -				print_func_help_header(iter->tr, m); +				print_func_help_header(iter->trace_buffer, m);  		}  	}  } @@ -2400,6 +2666,50 @@ static void test_ftrace_alive(struct seq_file *m)  	seq_printf(m, "#          MAY BE MISSING FUNCTION EVENTS\n");  } +#ifdef CONFIG_TRACER_MAX_TRACE +static void show_snapshot_main_help(struct seq_file *m) +{ +	seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); +	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); +	seq_printf(m, "#                      Takes a snapshot of the main buffer.\n"); +	seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); +	
seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n"); +	seq_printf(m, "#                       is not a '0' or '1')\n"); +} + +static void show_snapshot_percpu_help(struct seq_file *m) +{ +	seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP +	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); +	seq_printf(m, "#                      Takes a snapshot of the main buffer for this cpu.\n"); +#else +	seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); +	seq_printf(m, "#                     Must use main snapshot file to allocate.\n"); +#endif +	seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); +	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n"); +	seq_printf(m, "#                       is not a '0' or '1')\n"); +} + +static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ +	if (iter->tr->allocated_snapshot) +		seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); +	else +		seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + +	seq_printf(m, "# Snapshot commands:\n"); +	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) +		show_snapshot_main_help(m); +	else +		show_snapshot_percpu_help(m); +} +#else +/* Should never be called */ +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } +#endif +  static int s_show(struct seq_file *m, void *v)  {  	struct trace_iterator *iter = v; @@ -2411,7 +2721,9 @@ static int s_show(struct seq_file *m, void *v)  			seq_puts(m, "#\n");  			test_ftrace_alive(m);  		} -		if (iter->trace && iter->trace->print_header) +		if (iter->snapshot && trace_empty(iter)) +			print_snapshot_help(m, iter); +		else if (iter->trace && iter->trace->print_header)  			iter->trace->print_header(m);  		else  			trace_default_header(m); @@ -2452,7 +2764,8 @@ static const struct seq_operations tracer_seq_ops = {  static struct trace_iterator *  __tracing_open(struct inode *inode, struct file *file, bool snapshot)  { -	long cpu_file = (long) inode->i_private; +	struct trace_cpu *tc = inode->i_private; +	struct trace_array *tr = tc->tr;  	struct trace_iterator *iter;  	int cpu; @@ -2477,26 +2790,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)  	if (!iter->trace)  		goto fail; -	*iter->trace = *current_trace; +	*iter->trace = *tr->current_trace;  	if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))  		goto fail; -	if (current_trace->print_max || snapshot) -		iter->tr = &max_tr; +	iter->tr = tr; + +#ifdef CONFIG_TRACER_MAX_TRACE +	/* Currently only the top directory has a snapshot */ +	if (tr->current_trace->print_max || snapshot) +		iter->trace_buffer = &tr->max_buffer;  	else -		iter->tr = &global_trace; +#endif +		iter->trace_buffer = &tr->trace_buffer;  	iter->snapshot = snapshot;  	iter->pos = -1;  	mutex_init(&iter->mutex); -	iter->cpu_file = cpu_file; +	iter->cpu_file = tc->cpu;  	/* Notify the tracer early; before we stop tracing. */  	if (iter->trace && iter->trace->open)  		iter->trace->open(iter);  	/* Annotate start of buffers if we had overruns */ -	if (ring_buffer_overruns(iter->tr->buffer)) +	if (ring_buffer_overruns(iter->trace_buffer->buffer))  		iter->iter_flags |= TRACE_FILE_ANNOTATE;  	/* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ @@ -2505,12 +2823,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)  	/* stop the trace while dumping if we are not opening "snapshot" */  	if (!iter->snapshot) -		tracing_stop(); +		tracing_stop_tr(tr); -	if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { +	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) {  			iter->buffer_iter[cpu] = -				ring_buffer_read_prepare(iter->tr->buffer, cpu); +				ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);  		}  		ring_buffer_read_prepare_sync();  		for_each_tracing_cpu(cpu) { @@ -2520,12 +2838,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)  	} else {  		cpu = iter->cpu_file;  		iter->buffer_iter[cpu] = -			ring_buffer_read_prepare(iter->tr->buffer, cpu); +			ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);  		ring_buffer_read_prepare_sync();  		ring_buffer_read_start(iter->buffer_iter[cpu]);  		tracing_iter_reset(iter, cpu);  	} +	tr->ref++; +  	mutex_unlock(&trace_types_lock);  	return iter; @@ -2552,14 +2872,20 @@ static int tracing_release(struct inode *inode, struct file *file)  {  	struct seq_file *m = file->private_data;  	struct trace_iterator *iter; +	struct trace_array *tr;  	int cpu;  	if (!(file->f_mode & FMODE_READ))  		return 0;  	iter = m->private; +	tr = iter->tr;  	mutex_lock(&trace_types_lock); + +	WARN_ON(!tr->ref); +	tr->ref--; +  	for_each_tracing_cpu(cpu) {  		if (iter->buffer_iter[cpu])  			ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -2570,7 +2896,7 @@ static int tracing_release(struct inode *inode, struct file *file)  	if (!iter->snapshot)  		/* reenable tracing if it was previously enabled */ -		tracing_start(); +		tracing_start_tr(tr);  	mutex_unlock(&trace_types_lock);  	mutex_destroy(&iter->mutex); @@ -2589,12 +2915,13 @@ static int tracing_open(struct inode *inode, struct file *file)  	/* If this file was open for write, then erase contents */  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC)) { -		long cpu = (long) inode->i_private; +		struct trace_cpu *tc = inode->i_private; +		struct trace_array *tr = tc->tr; -		if (cpu == TRACE_PIPE_ALL_CPU) -			tracing_reset_online_cpus(&global_trace); +		if (tc->cpu == RING_BUFFER_ALL_CPUS) +			tracing_reset_online_cpus(&tr->trace_buffer);  		else -			tracing_reset(&global_trace, cpu); +			tracing_reset(&tr->trace_buffer, tc->cpu);  	}  	if (file->f_mode & FMODE_READ) { @@ -2741,8 +3068,9 @@ static ssize_t  tracing_cpumask_write(struct file *filp, const char __user *ubuf,  		      size_t count, loff_t *ppos)  { -	int err, cpu; +	struct trace_array *tr = filp->private_data;  	cpumask_var_t tracing_cpumask_new; +	int err, cpu;  	if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))  		return -ENOMEM; @@ -2762,13 +3090,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,  		 */  		if (cpumask_test_cpu(cpu, tracing_cpumask) &&  				!cpumask_test_cpu(cpu, tracing_cpumask_new)) { -			atomic_inc(&global_trace.data[cpu]->disabled); -			ring_buffer_record_disable_cpu(global_trace.buffer, cpu); +			atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); +			ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);  		}  		if (!cpumask_test_cpu(cpu, tracing_cpumask) &&  				cpumask_test_cpu(cpu, tracing_cpumask_new)) { -			atomic_dec(&global_trace.data[cpu]->disabled); -			ring_buffer_record_enable_cpu(global_trace.buffer, cpu); +			atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); +			
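The tr->ref++ on open and tr->ref-- on release above pin the trace_array while any of its files are open; a sketch of the invariant that instance teardown elsewhere in this series presumably checks before removing an instance (helper name is illustrative):

static bool demo_instance_busy(struct trace_array *tr)
{
	return tr->ref != 0;	/* some trace file still has this instance open */
}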
ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);  		}  	}  	arch_spin_unlock(&ftrace_max_lock); @@ -2797,12 +3125,13 @@ static const struct file_operations tracing_cpumask_fops = {  static int tracing_trace_options_show(struct seq_file *m, void *v)  {  	struct tracer_opt *trace_opts; +	struct trace_array *tr = m->private;  	u32 tracer_flags;  	int i;  	mutex_lock(&trace_types_lock); -	tracer_flags = current_trace->flags->val; -	trace_opts = current_trace->flags->opts; +	tracer_flags = tr->current_trace->flags->val; +	trace_opts = tr->current_trace->flags->opts;  	for (i = 0; trace_options[i]; i++) {  		if (trace_flags & (1 << i)) @@ -2857,11 +3186,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)  	return -EINVAL;  } -static void set_tracer_flags(unsigned int mask, int enabled) +/* Some tracers require overwrite to stay enabled */ +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +{ +	if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) +		return -1; + +	return 0; +} + +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)  {  	/* do nothing if flag is already set */  	if (!!(trace_flags & mask) == !!enabled) -		return; +		return 0; + +	/* Give the tracer a chance to approve the change */ +	if (tr->current_trace->flag_changed) +		if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) +			return -EINVAL;  	if (enabled)  		trace_flags |= mask; @@ -2871,18 +3214,24 @@ static void set_tracer_flags(unsigned int mask, int enabled)  	if (mask == TRACE_ITER_RECORD_CMD)  		trace_event_enable_cmd_record(enabled); -	if (mask == TRACE_ITER_OVERWRITE) -		ring_buffer_change_overwrite(global_trace.buffer, enabled); +	if (mask == TRACE_ITER_OVERWRITE) { +		ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); +#ifdef CONFIG_TRACER_MAX_TRACE +		ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); +#endif +	}  	if (mask == TRACE_ITER_PRINTK)  		trace_printk_start_stop_comm(enabled); + +	return 0;  } -static int trace_set_options(char *option) +static int trace_set_options(struct trace_array *tr, char *option)  {  	char *cmp;  	int neg = 0; -	int ret = 0; +	int ret = -ENODEV;  	int i;  	cmp = strstrip(option); @@ -2892,19 +3241,20 @@ static int trace_set_options(char *option)  		cmp += 2;  	} +	mutex_lock(&trace_types_lock); +  	for (i = 0; trace_options[i]; i++) {  		if (strcmp(cmp, trace_options[i]) == 0) { -			set_tracer_flags(1 << i, !neg); +			ret = set_tracer_flag(tr, 1 << i, !neg);  			break;  		}  	}  	/* If no option could be set, test the specific tracer options */ -	if (!trace_options[i]) { -		mutex_lock(&trace_types_lock); -		ret = set_tracer_option(current_trace, cmp, neg); -		mutex_unlock(&trace_types_lock); -	} +	if (!trace_options[i]) +		ret = set_tracer_option(tr->current_trace, cmp, neg); + +	mutex_unlock(&trace_types_lock);  	return ret;  } @@ -2913,7 +3263,10 @@ static ssize_t  tracing_trace_options_write(struct file *filp, const char __user *ubuf,  			size_t cnt, loff_t *ppos)  { +	struct seq_file *m = filp->private_data; +	struct trace_array *tr = m->private;  	char buf[64]; +	int ret;  	if (cnt >= sizeof(buf))  		return -EINVAL; @@ -2923,7 +3276,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,  	buf[cnt] = 0; -	trace_set_options(buf); +	ret = trace_set_options(tr, buf); +	if (ret < 0) +		return ret;  	*ppos += cnt; @@ -2934,7 +3289,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)  {  	if 
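With set_tracer_flag() consulting the flag_changed hook above, a tracer can refuse an option change. A hypothetical tracer that keeps ring-buffer overwrite forced on by reusing trace_keep_overwrite(); the tracer name is made up and the header placement of the helper is assumed:

static struct tracer demo_latency_tracer __read_mostly = {
	.name		= "demo_latency",
	.use_max_tr	= true,
	.flag_changed	= trace_keep_overwrite,	/* veto clearing TRACE_ITER_OVERWRITE while enabled */
};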
(tracing_disabled)  		return -ENODEV; -	return single_open(file, tracing_trace_options_show, NULL); + +	return single_open(file, tracing_trace_options_show, inode->i_private);  }  static const struct file_operations tracing_iter_fops = { @@ -2947,20 +3303,84 @@ static const struct file_operations tracing_iter_fops = {  static const char readme_msg[] =  	"tracing mini-HOWTO:\n\n" -	"# mount -t debugfs nodev /sys/kernel/debug\n\n" -	"# cat /sys/kernel/debug/tracing/available_tracers\n" -	"wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" -	"# cat /sys/kernel/debug/tracing/current_tracer\n" -	"nop\n" -	"# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" -	"# cat /sys/kernel/debug/tracing/current_tracer\n" -	"wakeup\n" -	"# cat /sys/kernel/debug/tracing/trace_options\n" -	"noprint-parent nosym-offset nosym-addr noverbose\n" -	"# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" -	"# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" -	"# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" -	"# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" +	"# echo 0 > tracing_on : quick way to disable tracing\n" +	"# echo 1 > tracing_on : quick way to re-enable tracing\n\n" +	" Important files:\n" +	"  trace\t\t\t- The static contents of the buffer\n" +	"\t\t\t  To clear the buffer write into this file: echo > trace\n" +	"  trace_pipe\t\t- A consuming read to see the contents of the buffer\n" +	"  current_tracer\t- function and latency tracers\n" +	"  available_tracers\t- list of configured tracers for current_tracer\n" +	"  buffer_size_kb\t- view and modify size of per cpu buffer\n" +	"  buffer_total_size_kb  - view total size of all cpu buffers\n\n" +	"  trace_clock\t\t-change the clock used to order events\n" +	"       local:   Per cpu clock but may not be synced across CPUs\n" +	"      global:   Synced across CPUs but slows tracing down.\n" +	"     counter:   Not a clock, but just an increment\n" +	"      uptime:   Jiffy counter from time of boot\n" +	"        perf:   Same clock that perf events use\n" +#ifdef CONFIG_X86_64 +	"     x86-tsc:   TSC cycle counter\n" +#endif +	"\n  trace_marker\t\t- Writes into this file writes into the kernel buffer\n" +	"  tracing_cpumask\t- Limit which CPUs to trace\n" +	"  instances\t\t- Make sub-buffers with: mkdir instances/foo\n" +	"\t\t\t  Remove sub-buffer with rmdir\n" +	"  trace_options\t\t- Set format or modify how tracing happens\n" +	"\t\t\t  Disable an option by adding a suffix 'no' to the option name\n" +#ifdef CONFIG_DYNAMIC_FTRACE +	"\n  available_filter_functions - list of functions that can be filtered on\n" +	"  set_ftrace_filter\t- echo function name in here to only trace these functions\n" +	"            accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" +	"            modules: Can select a group via module\n" +	"             Format: :mod:<module-name>\n" +	"             example: echo :mod:ext3 > set_ftrace_filter\n" +	"            triggers: a command to perform when function is hit\n" +	"              Format: <function>:<trigger>[:count]\n" +	"             trigger: traceon, traceoff\n" +	"                      enable_event:<system>:<event>\n" +	"                      disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE +	"                      stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT +	"                      snapshot\n" +#endif +	"             example: echo do_fault:traceoff > set_ftrace_filter\n" +	"                      echo do_trap:traceoff:3 > set_ftrace_filter\n" +	"   
          The first one will disable tracing every time do_fault is hit\n" +	"             The second will disable tracing at most 3 times when do_trap is hit\n" +	"               The first time do trap is hit and it disables tracing, the counter\n" +	"               will decrement to 2. If tracing is already disabled, the counter\n" +	"               will not decrement. It only decrements when the trigger did work\n" +	"             To remove trigger without count:\n" +	"               echo '!<function>:<trigger> > set_ftrace_filter\n" +	"             To remove trigger with a count:\n" +	"               echo '!<function>:<trigger>:0 > set_ftrace_filter\n" +	"  set_ftrace_notrace\t- echo function name in here to never trace.\n" +	"            accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" +	"            modules: Can select a group via module command :mod:\n" +	"            Does not accept triggers\n" +#endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_TRACER +	"  set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n" +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	"  set_graph_function\t- Trace the nested calls of a function (function_graph)\n" +	"  max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT +	"\n  snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" +	"\t\t\t  Read the contents for more information\n" +#endif +#ifdef CONFIG_STACKTRACE +	"  stack_trace\t\t- Shows the max stack trace when active\n" +	"  stack_max_size\t- Shows current max stack size that was traced\n" +	"\t\t\t  Write into this file to reset the max size (trigger a new trace)\n" +#ifdef CONFIG_DYNAMIC_FTRACE +	"  stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" +#endif +#endif /* CONFIG_STACKTRACE */  ;  static ssize_t @@ -3032,11 +3452,12 @@ static ssize_t  tracing_set_trace_read(struct file *filp, char __user *ubuf,  		       size_t cnt, loff_t *ppos)  { +	struct trace_array *tr = filp->private_data;  	char buf[MAX_TRACER_SIZE+2];  	int r;  	mutex_lock(&trace_types_lock); -	r = sprintf(buf, "%s\n", current_trace->name); +	r = sprintf(buf, "%s\n", tr->current_trace->name);  	mutex_unlock(&trace_types_lock);  	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3044,43 +3465,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,  int tracer_init(struct tracer *t, struct trace_array *tr)  { -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	return t->init(tr);  } -static void set_buffer_entries(struct trace_array *tr, unsigned long val) +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)  {  	int cpu; +  	for_each_tracing_cpu(cpu) -		tr->data[cpu]->entries = val; +		per_cpu_ptr(buf->data, cpu)->entries = val;  } +#ifdef CONFIG_TRACER_MAX_TRACE  /* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct trace_array *tr, -					struct trace_array *size_tr, int cpu_id) +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, +					struct trace_buffer *size_buf, int cpu_id)  {  	int cpu, ret = 0;  	if (cpu_id == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) { -			ret = ring_buffer_resize(tr->buffer, -					size_tr->data[cpu]->entries, cpu); +			ret = ring_buffer_resize(trace_buf->buffer, +				 per_cpu_ptr(size_buf->data, cpu)->entries, cpu);  			if (ret < 0)  				break; -			tr->data[cpu]->entries = 
size_tr->data[cpu]->entries; +			per_cpu_ptr(trace_buf->data, cpu)->entries = +				per_cpu_ptr(size_buf->data, cpu)->entries;  		}  	} else { -		ret = ring_buffer_resize(tr->buffer, -					size_tr->data[cpu_id]->entries, cpu_id); +		ret = ring_buffer_resize(trace_buf->buffer, +				 per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);  		if (ret == 0) -			tr->data[cpu_id]->entries = -				size_tr->data[cpu_id]->entries; +			per_cpu_ptr(trace_buf->data, cpu_id)->entries = +				per_cpu_ptr(size_buf->data, cpu_id)->entries;  	}  	return ret;  } +#endif /* CONFIG_TRACER_MAX_TRACE */ -static int __tracing_resize_ring_buffer(unsigned long size, int cpu) +static int __tracing_resize_ring_buffer(struct trace_array *tr, +					unsigned long size, int cpu)  {  	int ret; @@ -3089,23 +3515,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)  	 * we use the size that was given, and we can forget about  	 * expanding it later.  	 */ -	ring_buffer_expanded = 1; +	ring_buffer_expanded = true;  	/* May be called before buffers are initialized */ -	if (!global_trace.buffer) +	if (!tr->trace_buffer.buffer)  		return 0; -	ret = ring_buffer_resize(global_trace.buffer, size, cpu); +	ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);  	if (ret < 0)  		return ret; -	if (!current_trace->use_max_tr) +#ifdef CONFIG_TRACER_MAX_TRACE +	if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || +	    !tr->current_trace->use_max_tr)  		goto out; -	ret = ring_buffer_resize(max_tr.buffer, size, cpu); +	ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);  	if (ret < 0) { -		int r = resize_buffer_duplicate_size(&global_trace, -						     &global_trace, cpu); +		int r = resize_buffer_duplicate_size(&tr->trace_buffer, +						     &tr->trace_buffer, cpu);  		if (r < 0) {  			/*  			 * AARGH! 
We are left with different @@ -3128,20 +3556,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)  	}  	if (cpu == RING_BUFFER_ALL_CPUS) -		set_buffer_entries(&max_tr, size); +		set_buffer_entries(&tr->max_buffer, size);  	else -		max_tr.data[cpu]->entries = size; +		per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;   out: +#endif /* CONFIG_TRACER_MAX_TRACE */ +  	if (cpu == RING_BUFFER_ALL_CPUS) -		set_buffer_entries(&global_trace, size); +		set_buffer_entries(&tr->trace_buffer, size);  	else -		global_trace.data[cpu]->entries = size; +		per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;  	return ret;  } -static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) +static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, +					  unsigned long size, int cpu_id)  {  	int ret = size; @@ -3155,7 +3586,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)  		}  	} -	ret = __tracing_resize_ring_buffer(size, cpu_id); +	ret = __tracing_resize_ring_buffer(tr, size, cpu_id);  	if (ret < 0)  		ret = -ENOMEM; @@ -3182,7 +3613,7 @@ int tracing_update_buffers(void)  	mutex_lock(&trace_types_lock);  	if (!ring_buffer_expanded) -		ret = __tracing_resize_ring_buffer(trace_buf_size, +		ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,  						RING_BUFFER_ALL_CPUS);  	mutex_unlock(&trace_types_lock); @@ -3192,7 +3623,7 @@ int tracing_update_buffers(void)  struct trace_option_dentry;  static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer); +create_trace_option_files(struct trace_array *tr, struct tracer *tracer);  static void  destroy_trace_option_files(struct trace_option_dentry *topts); @@ -3202,13 +3633,15 @@ static int tracing_set_tracer(const char *buf)  	static struct trace_option_dentry *topts;  	struct trace_array *tr = &global_trace;  	struct tracer *t; +#ifdef CONFIG_TRACER_MAX_TRACE  	bool had_max_tr; +#endif  	int ret = 0;  	mutex_lock(&trace_types_lock);  	if (!ring_buffer_expanded) { -		ret = __tracing_resize_ring_buffer(trace_buf_size, +		ret = __tracing_resize_ring_buffer(tr, trace_buf_size,  						RING_BUFFER_ALL_CPUS);  		if (ret < 0)  			goto out; @@ -3223,15 +3656,21 @@ static int tracing_set_tracer(const char *buf)  		ret = -EINVAL;  		goto out;  	} -	if (t == current_trace) +	if (t == tr->current_trace)  		goto out;  	trace_branch_disable(); -	if (current_trace->reset) -		current_trace->reset(tr); -	had_max_tr = current_trace->allocated_snapshot; -	current_trace = &nop_trace; +	tr->current_trace->enabled = false; + +	if (tr->current_trace->reset) +		tr->current_trace->reset(tr); + +	/* Current trace needs to be nop_trace before synchronize_sched */ +	tr->current_trace = &nop_trace; + +#ifdef CONFIG_TRACER_MAX_TRACE +	had_max_tr = tr->allocated_snapshot;  	if (had_max_tr && !t->use_max_tr) {  		/* @@ -3242,27 +3681,20 @@ static int tracing_set_tracer(const char *buf)  		 * so a synchronized_sched() is sufficient.  		 */  		synchronize_sched(); -		/* -		 * We don't free the ring buffer. instead, resize it because -		 * The max_tr ring buffer has some state (e.g. ring->clock) and -		 * we want preserve it. 
-		 */ -		ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); -		set_buffer_entries(&max_tr, 1); -		tracing_reset_online_cpus(&max_tr); -		current_trace->allocated_snapshot = false; +		free_snapshot(tr);  	} +#endif  	destroy_trace_option_files(topts); -	topts = create_trace_option_files(t); +	topts = create_trace_option_files(tr, t); + +#ifdef CONFIG_TRACER_MAX_TRACE  	if (t->use_max_tr && !had_max_tr) { -		/* we need to make per cpu buffer sizes equivalent */ -		ret = resize_buffer_duplicate_size(&max_tr, &global_trace, -						   RING_BUFFER_ALL_CPUS); +		ret = alloc_snapshot(tr);  		if (ret < 0)  			goto out; -		t->allocated_snapshot = true;  	} +#endif  	if (t->init) {  		ret = tracer_init(t, tr); @@ -3270,7 +3702,8 @@ static int tracing_set_tracer(const char *buf)  			goto out;  	} -	current_trace = t; +	tr->current_trace = t; +	tr->current_trace->enabled = true;  	trace_branch_enable(tr);   out:  	mutex_unlock(&trace_types_lock); @@ -3344,7 +3777,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,  static int tracing_open_pipe(struct inode *inode, struct file *filp)  { -	long cpu_file = (long) inode->i_private; +	struct trace_cpu *tc = inode->i_private; +	struct trace_array *tr = tc->tr;  	struct trace_iterator *iter;  	int ret = 0; @@ -3369,7 +3803,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)  		ret = -ENOMEM;  		goto fail;  	} -	*iter->trace = *current_trace; +	*iter->trace = *tr->current_trace;  	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {  		ret = -ENOMEM; @@ -3386,8 +3820,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)  	if (trace_clocks[trace_clock_id].in_ns)  		iter->iter_flags |= TRACE_FILE_TIME_IN_NS; -	iter->cpu_file = cpu_file; -	iter->tr = &global_trace; +	iter->cpu_file = tc->cpu; +	iter->tr = tc->tr; +	iter->trace_buffer = &tc->tr->trace_buffer;  	mutex_init(&iter->mutex);  	filp->private_data = iter; @@ -3426,24 +3861,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)  }  static unsigned int -tracing_poll_pipe(struct file *filp, poll_table *poll_table) +trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)  { -	struct trace_iterator *iter = filp->private_data; +	/* Iterators are static, they should be filled or empty */ +	if (trace_buffer_iter(iter, iter->cpu_file)) +		return POLLIN | POLLRDNORM; -	if (trace_flags & TRACE_ITER_BLOCK) { +	if (trace_flags & TRACE_ITER_BLOCK)  		/*  		 * Always select as readable when in blocking mode  		 */  		return POLLIN | POLLRDNORM; -	} else { -		if (!trace_empty(iter)) -			return POLLIN | POLLRDNORM; -		poll_wait(filp, &trace_wait, poll_table); -		if (!trace_empty(iter)) -			return POLLIN | POLLRDNORM; +	else +		return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, +					     filp, poll_table); +} -		return 0; -	} +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ +	struct trace_iterator *iter = filp->private_data; + +	return trace_poll(iter, filp, poll_table);  }  /* @@ -3509,6 +3948,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  		  size_t cnt, loff_t *ppos)  {  	struct trace_iterator *iter = filp->private_data; +	struct trace_array *tr = iter->tr;  	ssize_t sret;  	/* return any leftover data */ @@ -3520,8 +3960,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  	/* copy the tracer to avoid using a global lock all around */  	mutex_lock(&trace_types_lock); -	if (unlikely(iter->trace->name != 
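trace_poll() now defers to ring_buffer_poll_wait(), which is what lets a user-space reader block on trace_pipe with poll(2). A small stand-alone example, assuming debugfs is mounted at /sys/kernel/debug:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct pollfd pfd;

	pfd.fd = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY);
	if (pfd.fd < 0) {
		perror("trace_pipe");
		return 1;
	}
	pfd.events = POLLIN;

	while (poll(&pfd, 1, -1) > 0) {		/* block until the ring buffer has data */
		ssize_t n = read(pfd.fd, buf, sizeof(buf));
		if (n <= 0)
			break;
		fwrite(buf, 1, n, stdout);
	}
	close(pfd.fd);
	return 0;
}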
current_trace->name)) -		*iter->trace = *current_trace; +	if (unlikely(iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock);  	/* @@ -3677,6 +4117,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  		.ops		= &tracing_pipe_buf_ops,  		.spd_release	= tracing_spd_release_pipe,  	}; +	struct trace_array *tr = iter->tr;  	ssize_t ret;  	size_t rem;  	unsigned int i; @@ -3686,8 +4127,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  	/* copy the tracer to avoid using a global lock all around */  	mutex_lock(&trace_types_lock); -	if (unlikely(iter->trace->name != current_trace->name)) -		*iter->trace = *current_trace; +	if (unlikely(iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock);  	mutex_lock(&iter->mutex); @@ -3749,43 +4190,19 @@ out_err:  	goto out;  } -struct ftrace_entries_info { -	struct trace_array	*tr; -	int			cpu; -}; - -static int tracing_entries_open(struct inode *inode, struct file *filp) -{ -	struct ftrace_entries_info *info; - -	if (tracing_disabled) -		return -ENODEV; - -	info = kzalloc(sizeof(*info), GFP_KERNEL); -	if (!info) -		return -ENOMEM; - -	info->tr = &global_trace; -	info->cpu = (unsigned long)inode->i_private; - -	filp->private_data = info; - -	return 0; -} -  static ssize_t  tracing_entries_read(struct file *filp, char __user *ubuf,  		     size_t cnt, loff_t *ppos)  { -	struct ftrace_entries_info *info = filp->private_data; -	struct trace_array *tr = info->tr; +	struct trace_cpu *tc = filp->private_data; +	struct trace_array *tr = tc->tr;  	char buf[64];  	int r = 0;  	ssize_t ret;  	mutex_lock(&trace_types_lock); -	if (info->cpu == RING_BUFFER_ALL_CPUS) { +	if (tc->cpu == RING_BUFFER_ALL_CPUS) {  		int cpu, buf_size_same;  		unsigned long size; @@ -3795,8 +4212,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,  		for_each_tracing_cpu(cpu) {  			/* fill in the size from first enabled cpu */  			if (size == 0) -				size = tr->data[cpu]->entries; -			if (size != tr->data[cpu]->entries) { +				size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; +			if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {  				buf_size_same = 0;  				break;  			} @@ -3812,7 +4229,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,  		} else  			r = sprintf(buf, "X\n");  	} else -		r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); +		r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10);  	mutex_unlock(&trace_types_lock); @@ -3824,7 +4241,7 @@ static ssize_t  tracing_entries_write(struct file *filp, const char __user *ubuf,  		      size_t cnt, loff_t *ppos)  { -	struct ftrace_entries_info *info = filp->private_data; +	struct trace_cpu *tc = filp->private_data;  	unsigned long val;  	int ret; @@ -3839,7 +4256,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  	/* value is in KB */  	val <<= 10; -	ret = tracing_resize_ring_buffer(val, info->cpu); +	ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);  	if (ret < 0)  		return ret; @@ -3848,16 +4265,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  	return cnt;  } -static int -tracing_entries_release(struct inode *inode, struct file *filp) -{ -	struct ftrace_entries_info *info = filp->private_data; - -	kfree(info); - -	return 0; -} -  static ssize_t  tracing_total_entries_read(struct file *filp, char __user *ubuf,  				size_t cnt, loff_t *ppos) @@ 
-3869,7 +4276,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,  	mutex_lock(&trace_types_lock);  	for_each_tracing_cpu(cpu) { -		size += tr->data[cpu]->entries >> 10; +		size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;  		if (!ring_buffer_expanded)  			expanded_size += trace_buf_size >> 10;  	} @@ -3899,11 +4306,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf,  static int  tracing_free_buffer_release(struct inode *inode, struct file *filp)  { +	struct trace_array *tr = inode->i_private; +  	/* disable tracing ? */  	if (trace_flags & TRACE_ITER_STOP_ON_FREE)  		tracing_off();  	/* resize the ring buffer to 0 */ -	tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); +	tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);  	return 0;  } @@ -3972,7 +4381,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  	local_save_flags(irq_flags);  	size = sizeof(*entry) + cnt + 2; /* possible \n added */ -	buffer = global_trace.buffer; +	buffer = global_trace.trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,  					  irq_flags, preempt_count());  	if (!event) { @@ -4014,13 +4423,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  static int tracing_clock_show(struct seq_file *m, void *v)  { +	struct trace_array *tr = m->private;  	int i;  	for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)  		seq_printf(m,  			"%s%s%s%s", i ? " " : "", -			i == trace_clock_id ? "[" : "", trace_clocks[i].name, -			i == trace_clock_id ? "]" : ""); +			i == tr->clock_id ? "[" : "", trace_clocks[i].name, +			i == tr->clock_id ? "]" : "");  	seq_putc(m, '\n');  	return 0; @@ -4029,6 +4439,8 @@ static int tracing_clock_show(struct seq_file *m, void *v)  static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,  				   size_t cnt, loff_t *fpos)  { +	struct seq_file *m = filp->private_data; +	struct trace_array *tr = m->private;  	char buf[64];  	const char *clockstr;  	int i; @@ -4050,20 +4462,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,  	if (i == ARRAY_SIZE(trace_clocks))  		return -EINVAL; -	trace_clock_id = i; -  	mutex_lock(&trace_types_lock); -	ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); -	if (max_tr.buffer) -		ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); +	tr->clock_id = i; + +	ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);  	/*  	 * New clock may not be consistent with the previous clock.  	 * Reset the buffer so that it doesn't have incomparable timestamps.  	 
*/ -	tracing_reset_online_cpus(&global_trace); -	tracing_reset_online_cpus(&max_tr); +	tracing_reset_online_cpus(&global_trace.trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) +		ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); +	tracing_reset_online_cpus(&global_trace.max_buffer); +#endif  	mutex_unlock(&trace_types_lock); @@ -4076,20 +4491,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file)  {  	if (tracing_disabled)  		return -ENODEV; -	return single_open(file, tracing_clock_show, NULL); + +	return single_open(file, tracing_clock_show, inode->i_private);  } +struct ftrace_buffer_info { +	struct trace_iterator	iter; +	void			*spare; +	unsigned int		read; +}; +  #ifdef CONFIG_TRACER_SNAPSHOT  static int tracing_snapshot_open(struct inode *inode, struct file *file)  { +	struct trace_cpu *tc = inode->i_private;  	struct trace_iterator *iter; +	struct seq_file *m;  	int ret = 0;  	if (file->f_mode & FMODE_READ) {  		iter = __tracing_open(inode, file, true);  		if (IS_ERR(iter))  			ret = PTR_ERR(iter); +	} else { +		/* Writes still need the seq_file to hold the private data */ +		m = kzalloc(sizeof(*m), GFP_KERNEL); +		if (!m) +			return -ENOMEM; +		iter = kzalloc(sizeof(*iter), GFP_KERNEL); +		if (!iter) { +			kfree(m); +			return -ENOMEM; +		} +		iter->tr = tc->tr; +		iter->trace_buffer = &tc->tr->max_buffer; +		iter->cpu_file = tc->cpu; +		m->private = iter; +		file->private_data = m;  	} +  	return ret;  } @@ -4097,6 +4537,9 @@ static ssize_t  tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,  		       loff_t *ppos)  { +	struct seq_file *m = filp->private_data; +	struct trace_iterator *iter = m->private; +	struct trace_array *tr = iter->tr;  	unsigned long val;  	int ret; @@ -4110,42 +4553,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,  	mutex_lock(&trace_types_lock); -	if (current_trace->use_max_tr) { +	if (tr->current_trace->use_max_tr) {  		ret = -EBUSY;  		goto out;  	}  	switch (val) {  	case 0: -		if (current_trace->allocated_snapshot) { -			/* free spare buffer */ -			ring_buffer_resize(max_tr.buffer, 1, -					   RING_BUFFER_ALL_CPUS); -			set_buffer_entries(&max_tr, 1); -			tracing_reset_online_cpus(&max_tr); -			current_trace->allocated_snapshot = false; +		if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { +			ret = -EINVAL; +			break;  		} +		if (tr->allocated_snapshot) +			free_snapshot(tr);  		break;  	case 1: -		if (!current_trace->allocated_snapshot) { -			/* allocate spare buffer */ -			ret = resize_buffer_duplicate_size(&max_tr, -					&global_trace, RING_BUFFER_ALL_CPUS); +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP +		if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { +			ret = -EINVAL; +			break; +		} +#endif +		if (!tr->allocated_snapshot) { +			ret = alloc_snapshot(tr);  			if (ret < 0)  				break; -			current_trace->allocated_snapshot = true;  		} -  		local_irq_disable();  		/* Now, we're going to swap */ -		update_max_tr(&global_trace, current, smp_processor_id()); +		if (iter->cpu_file == RING_BUFFER_ALL_CPUS) +			update_max_tr(tr, current, smp_processor_id()); +		else +			update_max_tr_single(tr, current, iter->cpu_file);  		local_irq_enable();  		break;  	default: -		if (current_trace->allocated_snapshot) -			tracing_reset_online_cpus(&max_tr); -		else -			ret = -EINVAL; +		if (tr->allocated_snapshot) { +			if (iter->cpu_file == 
RING_BUFFER_ALL_CPUS) +				tracing_reset_online_cpus(&tr->max_buffer); +			else +				tracing_reset(&tr->max_buffer, iter->cpu_file); +		}  		break;  	} @@ -4157,6 +4606,51 @@ out:  	mutex_unlock(&trace_types_lock);  	return ret;  } + +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ +	struct seq_file *m = file->private_data; + +	if (file->f_mode & FMODE_READ) +		return tracing_release(inode, file); + +	/* If write only, the seq_file is just a stub */ +	if (m) +		kfree(m->private); +	kfree(m); + +	return 0; +} + +static int tracing_buffers_open(struct inode *inode, struct file *filp); +static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, +				    size_t count, loff_t *ppos); +static int tracing_buffers_release(struct inode *inode, struct file *file); +static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, +		   struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +static int snapshot_raw_open(struct inode *inode, struct file *filp) +{ +	struct ftrace_buffer_info *info; +	int ret; + +	ret = tracing_buffers_open(inode, filp); +	if (ret < 0) +		return ret; + +	info = filp->private_data; + +	if (info->iter.trace->use_max_tr) { +		tracing_buffers_release(inode, filp); +		return -EBUSY; +	} + +	info->iter.snapshot = true; +	info->iter.trace_buffer = &info->iter.tr->max_buffer; + +	return ret; +} +  #endif /* CONFIG_TRACER_SNAPSHOT */ @@ -4184,10 +4678,9 @@ static const struct file_operations tracing_pipe_fops = {  };  static const struct file_operations tracing_entries_fops = { -	.open		= tracing_entries_open, +	.open		= tracing_open_generic,  	.read		= tracing_entries_read,  	.write		= tracing_entries_write, -	.release	= tracing_entries_release,  	.llseek		= generic_file_llseek,  }; @@ -4222,20 +4715,23 @@ static const struct file_operations snapshot_fops = {  	.read		= seq_read,  	.write		= tracing_snapshot_write,  	.llseek		= tracing_seek, -	.release	= tracing_release, +	.release	= tracing_snapshot_release,  }; -#endif /* CONFIG_TRACER_SNAPSHOT */ -struct ftrace_buffer_info { -	struct trace_array	*tr; -	void			*spare; -	int			cpu; -	unsigned int		read; +static const struct file_operations snapshot_raw_fops = { +	.open		= snapshot_raw_open, +	.read		= tracing_buffers_read, +	.release	= tracing_buffers_release, +	.splice_read	= tracing_buffers_splice_read, +	.llseek		= no_llseek,  }; +#endif /* CONFIG_TRACER_SNAPSHOT */ +  static int tracing_buffers_open(struct inode *inode, struct file *filp)  { -	int cpu = (int)(long)inode->i_private; +	struct trace_cpu *tc = inode->i_private; +	struct trace_array *tr = tc->tr;  	struct ftrace_buffer_info *info;  	if (tracing_disabled) @@ -4245,72 +4741,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)  	if (!info)  		return -ENOMEM; -	info->tr	= &global_trace; -	info->cpu	= cpu; -	info->spare	= NULL; +	mutex_lock(&trace_types_lock); + +	tr->ref++; + +	info->iter.tr		= tr; +	info->iter.cpu_file	= tc->cpu; +	info->iter.trace	= tr->current_trace; +	info->iter.trace_buffer = &tr->trace_buffer; +	info->spare		= NULL;  	/* Force reading ring buffer for first read */ -	info->read	= (unsigned int)-1; +	info->read		= (unsigned int)-1;  	filp->private_data = info; +	mutex_unlock(&trace_types_lock); +  	return nonseekable_open(inode, filp);  } +static unsigned int +tracing_buffers_poll(struct file *filp, poll_table *poll_table) +{ +	struct ftrace_buffer_info *info = filp->private_data; +	struct trace_iterator *iter = &info->iter; + +	return trace_poll(iter, 
filp, poll_table); +} +  static ssize_t  tracing_buffers_read(struct file *filp, char __user *ubuf,  		     size_t count, loff_t *ppos)  {  	struct ftrace_buffer_info *info = filp->private_data; +	struct trace_iterator *iter = &info->iter;  	ssize_t ret; -	size_t size; +	ssize_t size;  	if (!count)  		return 0; +	mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (iter->snapshot && iter->tr->current_trace->use_max_tr) { +		size = -EBUSY; +		goto out_unlock; +	} +#endif +  	if (!info->spare) -		info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); +		info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, +							  iter->cpu_file); +	size = -ENOMEM;  	if (!info->spare) -		return -ENOMEM; +		goto out_unlock;  	/* Do we have previous read data to read? */  	if (info->read < PAGE_SIZE)  		goto read; -	trace_access_lock(info->cpu); -	ret = ring_buffer_read_page(info->tr->buffer, + again: +	trace_access_lock(iter->cpu_file); +	ret = ring_buffer_read_page(iter->trace_buffer->buffer,  				    &info->spare,  				    count, -				    info->cpu, 0); -	trace_access_unlock(info->cpu); -	if (ret < 0) -		return 0; +				    iter->cpu_file, 0); +	trace_access_unlock(iter->cpu_file); -	info->read = 0; +	if (ret < 0) { +		if (trace_empty(iter)) { +			if ((filp->f_flags & O_NONBLOCK)) { +				size = -EAGAIN; +				goto out_unlock; +			} +			mutex_unlock(&trace_types_lock); +			iter->trace->wait_pipe(iter); +			mutex_lock(&trace_types_lock); +			if (signal_pending(current)) { +				size = -EINTR; +				goto out_unlock; +			} +			goto again; +		} +		size = 0; +		goto out_unlock; +	} -read: +	info->read = 0; + read:  	size = PAGE_SIZE - info->read;  	if (size > count)  		size = count;  	ret = copy_to_user(ubuf, info->spare + info->read, size); -	if (ret == size) -		return -EFAULT; +	if (ret == size) { +		size = -EFAULT; +		goto out_unlock; +	}  	size -= ret;  	*ppos += size;  	info->read += size; + out_unlock: +	mutex_unlock(&trace_types_lock); +  	return size;  }  static int tracing_buffers_release(struct inode *inode, struct file *file)  {  	struct ftrace_buffer_info *info = file->private_data; +	struct trace_iterator *iter = &info->iter; + +	mutex_lock(&trace_types_lock); + +	WARN_ON(!iter->tr->ref); +	iter->tr->ref--;  	if (info->spare) -		ring_buffer_free_read_page(info->tr->buffer, info->spare); +		ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);  	kfree(info); +	mutex_unlock(&trace_types_lock); +  	return 0;  } @@ -4375,6 +4930,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  			    unsigned int flags)  {  	struct ftrace_buffer_info *info = file->private_data; +	struct trace_iterator *iter = &info->iter;  	struct partial_page partial_def[PIPE_DEF_BUFFERS];  	struct page *pages_def[PIPE_DEF_BUFFERS];  	struct splice_pipe_desc spd = { @@ -4387,10 +4943,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  	};  	struct buffer_ref *ref;  	int entries, size, i; -	size_t ret; +	ssize_t ret; -	if (splice_grow_spd(pipe, &spd)) -		return -ENOMEM; +	mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (iter->snapshot && iter->tr->current_trace->use_max_tr) { +		ret = -EBUSY; +		goto out; +	} +#endif + +	if (splice_grow_spd(pipe, &spd)) { +		ret = -ENOMEM; +		goto out; +	}  	if (*ppos & (PAGE_SIZE - 1)) {  		ret = -EINVAL; @@ -4405,8 +4972,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		len &= PAGE_MASK;  	} -	trace_access_lock(info->cpu); -	entries = 
ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + again: +	trace_access_lock(iter->cpu_file); +	entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);  	for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {  		struct page *page; @@ -4417,15 +4985,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  			break;  		ref->ref = 1; -		ref->buffer = info->tr->buffer; -		ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); +		ref->buffer = iter->trace_buffer->buffer; +		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);  		if (!ref->page) {  			kfree(ref);  			break;  		}  		r = ring_buffer_read_page(ref->buffer, &ref->page, -					  len, info->cpu, 1); +					  len, iter->cpu_file, 1);  		if (r < 0) {  			ring_buffer_free_read_page(ref->buffer, ref->page);  			kfree(ref); @@ -4449,31 +5017,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		spd.nr_pages++;  		*ppos += PAGE_SIZE; -		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); +		entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);  	} -	trace_access_unlock(info->cpu); +	trace_access_unlock(iter->cpu_file);  	spd.nr_pages = i;  	/* did we read anything? */  	if (!spd.nr_pages) { -		if (flags & SPLICE_F_NONBLOCK) +		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {  			ret = -EAGAIN; -		else -			ret = 0; -		/* TODO: block */ -		goto out; +			goto out; +		} +		mutex_unlock(&trace_types_lock); +		iter->trace->wait_pipe(iter); +		mutex_lock(&trace_types_lock); +		if (signal_pending(current)) { +			ret = -EINTR; +			goto out; +		} +		goto again;  	}  	ret = splice_to_pipe(pipe, &spd);  	splice_shrink_spd(&spd);  out: +	mutex_unlock(&trace_types_lock); +  	return ret;  }  static const struct file_operations tracing_buffers_fops = {  	.open		= tracing_buffers_open,  	.read		= tracing_buffers_read, +	.poll		= tracing_buffers_poll,  	.release	= tracing_buffers_release,  	.splice_read	= tracing_buffers_splice_read,  	.llseek		= no_llseek, @@ -4483,12 +5060,14 @@ static ssize_t  tracing_stats_read(struct file *filp, char __user *ubuf,  		   size_t count, loff_t *ppos)  { -	unsigned long cpu = (unsigned long)filp->private_data; -	struct trace_array *tr = &global_trace; +	struct trace_cpu *tc = filp->private_data; +	struct trace_array *tr = tc->tr; +	struct trace_buffer *trace_buf = &tr->trace_buffer;  	struct trace_seq *s;  	unsigned long cnt;  	unsigned long long t;  	unsigned long usec_rem; +	int cpu = tc->cpu;  	s = kmalloc(sizeof(*s), GFP_KERNEL);  	if (!s) @@ -4496,41 +5075,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf,  	trace_seq_init(s); -	cnt = ring_buffer_entries_cpu(tr->buffer, cpu); +	cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "entries: %ld\n", cnt); -	cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); +	cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "overrun: %ld\n", cnt); -	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); +	cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "commit overrun: %ld\n", cnt); -	cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); +	cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "bytes: %ld\n", cnt);  	if (trace_clocks[trace_clock_id].in_ns) {  		/* local or global for trace_clock */ -		t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); +		t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); 
 		usec_rem = do_div(t, USEC_PER_SEC);  		trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",  								t, usec_rem); -		t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); +		t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));  		usec_rem = do_div(t, USEC_PER_SEC);  		trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);  	} else {  		/* counter or tsc mode for trace_clock */  		trace_seq_printf(s, "oldest event ts: %llu\n", -				ring_buffer_oldest_event_ts(tr->buffer, cpu)); +				ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));  		trace_seq_printf(s, "now ts: %llu\n", -				ring_buffer_time_stamp(tr->buffer, cpu)); +				ring_buffer_time_stamp(trace_buf->buffer, cpu));  	} -	cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); +	cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "dropped events: %ld\n", cnt); -	cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); +	cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "read events: %ld\n", cnt);  	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); @@ -4582,60 +5161,161 @@ static const struct file_operations tracing_dyn_info_fops = {  	.read		= tracing_read_dyn_info,  	.llseek		= generic_file_llseek,  }; -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ -static struct dentry *d_tracer; +#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ +	tracing_snapshot(); +} -struct dentry *tracing_init_dentry(void) +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ +	unsigned long *count = (long *)data; + +	if (!*count) +		return; + +	if (*count != -1) +		(*count)--; + +	tracing_snapshot(); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, +		      struct ftrace_probe_ops *ops, void *data) +{ +	long count = (long)data; + +	seq_printf(m, "%ps:", (void *)ip); + +	seq_printf(m, "snapshot"); + +	if (count == -1) +		seq_printf(m, ":unlimited\n"); +	else +		seq_printf(m, ":count=%ld\n", count); + +	return 0; +} + +static struct ftrace_probe_ops snapshot_probe_ops = { +	.func			= ftrace_snapshot, +	.print			= ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { +	.func			= ftrace_count_snapshot, +	.print			= ftrace_snapshot_print, +}; + +static int +ftrace_trace_snapshot_callback(struct ftrace_hash *hash, +			       char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; +	void *count = (void *)-1; +	char *number; +	int ret; + +	/* hash funcs only work with set_ftrace_filter */ +	if (!enable) +		return -EINVAL; + +	ops = param ? &snapshot_count_probe_ops :  &snapshot_probe_ops; + +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		return 0; +	} + +	if (!param) +		goto out_reg; + +	number = strsep(&param, ":"); + +	if (!strlen(number)) +		goto out_reg; + +	/* +	 * We use the callback data field (which is a pointer) +	 * as our counter. +	 */ +	ret = kstrtoul(number, 0, (unsigned long *)&count); +	if (ret) +		return ret; + + out_reg: +	ret = register_ftrace_function_probe(glob, ops, count); + +	if (ret >= 0) +		alloc_snapshot(&global_trace); + +	return ret < 0 ? 
ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { +	.name			= "snapshot", +	.func			= ftrace_trace_snapshot_callback, +}; + +static int register_snapshot_cmd(void)  { -	static int once; +	return register_ftrace_command(&ftrace_snapshot_cmd); +} +#else +static inline int register_snapshot_cmd(void) { return 0; } +#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ -	if (d_tracer) -		return d_tracer; +struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +{ +	if (tr->dir) +		return tr->dir;  	if (!debugfs_initialized())  		return NULL; -	d_tracer = debugfs_create_dir("tracing", NULL); +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		tr->dir = debugfs_create_dir("tracing", NULL); -	if (!d_tracer && !once) { -		once = 1; -		pr_warning("Could not create debugfs directory 'tracing'\n"); -		return NULL; -	} +	if (!tr->dir) +		pr_warn_once("Could not create debugfs directory 'tracing'\n"); -	return d_tracer; +	return tr->dir;  } -static struct dentry *d_percpu; +struct dentry *tracing_init_dentry(void) +{ +	return tracing_init_dentry_tr(&global_trace); +} -static struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)  { -	static int once;  	struct dentry *d_tracer; -	if (d_percpu) -		return d_percpu; - -	d_tracer = tracing_init_dentry(); +	if (tr->percpu_dir) +		return tr->percpu_dir; +	d_tracer = tracing_init_dentry_tr(tr);  	if (!d_tracer)  		return NULL; -	d_percpu = debugfs_create_dir("per_cpu", d_tracer); +	tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); -	if (!d_percpu && !once) { -		once = 1; -		pr_warning("Could not create debugfs directory 'per_cpu'\n"); -		return NULL; -	} +	WARN_ONCE(!tr->percpu_dir, +		  "Could not create debugfs directory 'per_cpu/%d'\n", cpu); -	return d_percpu; +	return tr->percpu_dir;  } -static void tracing_init_debugfs_percpu(long cpu) +static void +tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)  { -	struct dentry *d_percpu = tracing_dentry_percpu(); +	struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); +	struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);  	struct dentry *d_cpu;  	char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -4651,20 +5331,28 @@ static void tracing_init_debugfs_percpu(long cpu)  	/* per cpu trace_pipe */  	trace_create_file("trace_pipe", 0444, d_cpu, -			(void *) cpu, &tracing_pipe_fops); +			(void *)&data->trace_cpu, &tracing_pipe_fops);  	/* per cpu trace */  	trace_create_file("trace", 0644, d_cpu, -			(void *) cpu, &tracing_fops); +			(void *)&data->trace_cpu, &tracing_fops);  	trace_create_file("trace_pipe_raw", 0444, d_cpu, -			(void *) cpu, &tracing_buffers_fops); +			(void *)&data->trace_cpu, &tracing_buffers_fops);  	trace_create_file("stats", 0444, d_cpu, -			(void *) cpu, &tracing_stats_fops); +			(void *)&data->trace_cpu, &tracing_stats_fops);  	trace_create_file("buffer_size_kb", 0444, d_cpu, -			(void *) cpu, &tracing_entries_fops); +			(void *)&data->trace_cpu, &tracing_entries_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT +	trace_create_file("snapshot", 0644, d_cpu, +			  (void *)&data->trace_cpu, &snapshot_fops); + +	trace_create_file("snapshot_raw", 0444, d_cpu, +			(void *)&data->trace_cpu, &snapshot_raw_fops); +#endif  }  #ifdef CONFIG_FTRACE_SELFTEST @@ -4675,6 +5363,7 @@ static void tracing_init_debugfs_percpu(long cpu)  struct trace_option_dentry {  	struct tracer_opt		*opt;  	struct tracer_flags		*flags; +	struct trace_array		*tr;  	
struct dentry			*entry;  }; @@ -4710,7 +5399,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,  	if (!!(topt->flags->val & topt->opt->bit) != val) {  		mutex_lock(&trace_types_lock); -		ret = __set_tracer_option(current_trace, topt->flags, +		ret = __set_tracer_option(topt->tr->current_trace, topt->flags,  					  topt->opt, !val);  		mutex_unlock(&trace_types_lock);  		if (ret) @@ -4749,6 +5438,7 @@ static ssize_t  trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,  			 loff_t *ppos)  { +	struct trace_array *tr = &global_trace;  	long index = (long)filp->private_data;  	unsigned long val;  	int ret; @@ -4759,7 +5449,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,  	if (val != 0 && val != 1)  		return -EINVAL; -	set_tracer_flags(1 << index, val); + +	mutex_lock(&trace_types_lock); +	ret = set_tracer_flag(tr, 1 << index, val); +	mutex_unlock(&trace_types_lock); + +	if (ret < 0) +		return ret;  	*ppos += cnt; @@ -4789,40 +5485,41 @@ struct dentry *trace_create_file(const char *name,  } -static struct dentry *trace_options_init_dentry(void) +static struct dentry *trace_options_init_dentry(struct trace_array *tr)  {  	struct dentry *d_tracer; -	static struct dentry *t_options; -	if (t_options) -		return t_options; +	if (tr->options) +		return tr->options; -	d_tracer = tracing_init_dentry(); +	d_tracer = tracing_init_dentry_tr(tr);  	if (!d_tracer)  		return NULL; -	t_options = debugfs_create_dir("options", d_tracer); -	if (!t_options) { +	tr->options = debugfs_create_dir("options", d_tracer); +	if (!tr->options) {  		pr_warning("Could not create debugfs directory 'options'\n");  		return NULL;  	} -	return t_options; +	return tr->options;  }  static void -create_trace_option_file(struct trace_option_dentry *topt, +create_trace_option_file(struct trace_array *tr, +			 struct trace_option_dentry *topt,  			 struct tracer_flags *flags,  			 struct tracer_opt *opt)  {  	struct dentry *t_options; -	t_options = trace_options_init_dentry(); +	t_options = trace_options_init_dentry(tr);  	if (!t_options)  		return;  	topt->flags = flags;  	topt->opt = opt; +	topt->tr = tr;  	topt->entry = trace_create_file(opt->name, 0644, t_options, topt,  				    &trace_options_fops); @@ -4830,7 +5527,7 @@ create_trace_option_file(struct trace_option_dentry *topt,  }  static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer) +create_trace_option_files(struct trace_array *tr, struct tracer *tracer)  {  	struct trace_option_dentry *topts;  	struct tracer_flags *flags; @@ -4855,7 +5552,7 @@ create_trace_option_files(struct tracer *tracer)  		return NULL;  	for (cnt = 0; opts[cnt].name; cnt++) -		create_trace_option_file(&topts[cnt], flags, +		create_trace_option_file(tr, &topts[cnt], flags,  					 &opts[cnt]);  	return topts; @@ -4878,11 +5575,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)  }  static struct dentry * -create_trace_option_core_file(const char *option, long index) +create_trace_option_core_file(struct trace_array *tr, +			      const char *option, long index)  {  	struct dentry *t_options; -	t_options = trace_options_init_dentry(); +	t_options = trace_options_init_dentry(tr);  	if (!t_options)  		return NULL; @@ -4890,17 +5588,17 @@ create_trace_option_core_file(const char *option, long index)  				    &trace_options_core_fops);  } -static __init void create_trace_options_dir(void) +static __init void create_trace_options_dir(struct trace_array *tr)  {  	struct 
dentry *t_options;  	int i; -	t_options = trace_options_init_dentry(); +	t_options = trace_options_init_dentry(tr);  	if (!t_options)  		return;  	for (i = 0; trace_options[i]; i++) -		create_trace_option_core_file(trace_options[i], i); +		create_trace_option_core_file(tr, trace_options[i], i);  }  static ssize_t @@ -4908,7 +5606,7 @@ rb_simple_read(struct file *filp, char __user *ubuf,  	       size_t cnt, loff_t *ppos)  {  	struct trace_array *tr = filp->private_data; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	char buf[64];  	int r; @@ -4927,7 +5625,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,  		size_t cnt, loff_t *ppos)  {  	struct trace_array *tr = filp->private_data; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	unsigned long val;  	int ret; @@ -4939,12 +5637,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf,  		mutex_lock(&trace_types_lock);  		if (val) {  			ring_buffer_record_on(buffer); -			if (current_trace->start) -				current_trace->start(tr); +			if (tr->current_trace->start) +				tr->current_trace->start(tr);  		} else {  			ring_buffer_record_off(buffer); -			if (current_trace->stop) -				current_trace->stop(tr); +			if (tr->current_trace->stop) +				tr->current_trace->stop(tr);  		}  		mutex_unlock(&trace_types_lock);  	} @@ -4961,23 +5659,310 @@ static const struct file_operations rb_simple_fops = {  	.llseek		= default_llseek,  }; +struct dentry *trace_instance_dir; + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); + +static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) +{ +	int cpu; + +	for_each_tracing_cpu(cpu) { +		memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); +		per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; +		per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; +	} +} + +static int +allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) +{ +	enum ring_buffer_flags rb_flags; + +	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + +	buf->buffer = ring_buffer_alloc(size, rb_flags); +	if (!buf->buffer) +		return -ENOMEM; + +	buf->data = alloc_percpu(struct trace_array_cpu); +	if (!buf->data) { +		ring_buffer_free(buf->buffer); +		return -ENOMEM; +	} + +	init_trace_buffers(tr, buf); + +	/* Allocate the first page for all buffers */ +	set_buffer_entries(&tr->trace_buffer, +			   ring_buffer_size(tr->trace_buffer.buffer, 0)); + +	return 0; +} + +static int allocate_trace_buffers(struct trace_array *tr, int size) +{ +	int ret; + +	ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); +	if (ret) +		return ret; + +#ifdef CONFIG_TRACER_MAX_TRACE +	ret = allocate_trace_buffer(tr, &tr->max_buffer, +				    allocate_snapshot ? size : 1); +	if (WARN_ON(ret)) { +		ring_buffer_free(tr->trace_buffer.buffer); +		free_percpu(tr->trace_buffer.data); +		return -ENOMEM; +	} +	tr->allocated_snapshot = allocate_snapshot; + +	/* +	 * Only the top level trace array gets its snapshot allocated +	 * from the kernel command line. 
+	 */ +	allocate_snapshot = false; +#endif +	return 0; +} + +static int new_instance_create(const char *name) +{ +	struct trace_array *tr; +	int ret; + +	mutex_lock(&trace_types_lock); + +	ret = -EEXIST; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr->name && strcmp(tr->name, name) == 0) +			goto out_unlock; +	} + +	ret = -ENOMEM; +	tr = kzalloc(sizeof(*tr), GFP_KERNEL); +	if (!tr) +		goto out_unlock; + +	tr->name = kstrdup(name, GFP_KERNEL); +	if (!tr->name) +		goto out_free_tr; + +	raw_spin_lock_init(&tr->start_lock); + +	tr->current_trace = &nop_trace; + +	INIT_LIST_HEAD(&tr->systems); +	INIT_LIST_HEAD(&tr->events); + +	if (allocate_trace_buffers(tr, trace_buf_size) < 0) +		goto out_free_tr; + +	/* Holder for file callbacks */ +	tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; +	tr->trace_cpu.tr = tr; + +	tr->dir = debugfs_create_dir(name, trace_instance_dir); +	if (!tr->dir) +		goto out_free_tr; + +	ret = event_trace_add_tracer(tr->dir, tr); +	if (ret) +		goto out_free_tr; + +	init_tracer_debugfs(tr, tr->dir); + +	list_add(&tr->list, &ftrace_trace_arrays); + +	mutex_unlock(&trace_types_lock); + +	return 0; + + out_free_tr: +	if (tr->trace_buffer.buffer) +		ring_buffer_free(tr->trace_buffer.buffer); +	kfree(tr->name); +	kfree(tr); + + out_unlock: +	mutex_unlock(&trace_types_lock); + +	return ret; + +} + +static int instance_delete(const char *name) +{ +	struct trace_array *tr; +	int found = 0; +	int ret; + +	mutex_lock(&trace_types_lock); + +	ret = -ENODEV; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr->name && strcmp(tr->name, name) == 0) { +			found = 1; +			break; +		} +	} +	if (!found) +		goto out_unlock; + +	ret = -EBUSY; +	if (tr->ref) +		goto out_unlock; + +	list_del(&tr->list); + +	event_trace_del_tracer(tr); +	debugfs_remove_recursive(tr->dir); +	free_percpu(tr->trace_buffer.data); +	ring_buffer_free(tr->trace_buffer.buffer); + +	kfree(tr->name); +	kfree(tr); + +	ret = 0; + + out_unlock: +	mutex_unlock(&trace_types_lock); + +	return ret; +} + +static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) +{ +	struct dentry *parent; +	int ret; + +	/* Paranoid: Make sure the parent is the "instances" directory */ +	parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); +	if (WARN_ON_ONCE(parent != trace_instance_dir)) +		return -ENOENT; + +	/* +	 * The inode mutex is locked, but debugfs_create_dir() will also +	 * take the mutex. As the instances directory can not be destroyed +	 * or changed in any other way, it is safe to unlock it, and +	 * let the dentry try. If two users try to make the same dir at +	 * the same time, then the new_instance_create() will determine the +	 * winner. +	 */ +	mutex_unlock(&inode->i_mutex); + +	ret = new_instance_create(dentry->d_iname); + +	mutex_lock(&inode->i_mutex); + +	return ret; +} + +static int instance_rmdir(struct inode *inode, struct dentry *dentry) +{ +	struct dentry *parent; +	int ret; + +	/* Paranoid: Make sure the parent is the "instances" directory */ +	parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); +	if (WARN_ON_ONCE(parent != trace_instance_dir)) +		return -ENOENT; + +	/* The caller did a dget() on dentry */ +	mutex_unlock(&dentry->d_inode->i_mutex); + +	/* +	 * The inode mutex is locked, but debugfs_create_dir() will also +	 * take the mutex. As the instances directory can not be destroyed +	 * or changed in any other way, it is safe to unlock it, and +	 * let the dentry try. 
If two users try to make the same dir at +	 * the same time, then the instance_delete() will determine the +	 * winner. +	 */ +	mutex_unlock(&inode->i_mutex); + +	ret = instance_delete(dentry->d_iname); + +	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); +	mutex_lock(&dentry->d_inode->i_mutex); + +	return ret; +} + +static const struct inode_operations instance_dir_inode_operations = { +	.lookup		= simple_lookup, +	.mkdir		= instance_mkdir, +	.rmdir		= instance_rmdir, +}; + +static __init void create_trace_instances(struct dentry *d_tracer) +{ +	trace_instance_dir = debugfs_create_dir("instances", d_tracer); +	if (WARN_ON(!trace_instance_dir)) +		return; + +	/* Hijack the dir inode operations, to allow mkdir */ +	trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; +} + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +{ +	int cpu; + +	trace_create_file("trace_options", 0644, d_tracer, +			  tr, &tracing_iter_fops); + +	trace_create_file("trace", 0644, d_tracer, +			(void *)&tr->trace_cpu, &tracing_fops); + +	trace_create_file("trace_pipe", 0444, d_tracer, +			(void *)&tr->trace_cpu, &tracing_pipe_fops); + +	trace_create_file("buffer_size_kb", 0644, d_tracer, +			(void *)&tr->trace_cpu, &tracing_entries_fops); + +	trace_create_file("buffer_total_size_kb", 0444, d_tracer, +			  tr, &tracing_total_entries_fops); + +	trace_create_file("free_buffer", 0644, d_tracer, +			  tr, &tracing_free_buffer_fops); + +	trace_create_file("trace_marker", 0220, d_tracer, +			  tr, &tracing_mark_fops); + +	trace_create_file("trace_clock", 0644, d_tracer, tr, +			  &trace_clock_fops); + +	trace_create_file("tracing_on", 0644, d_tracer, +			    tr, &rb_simple_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT +	trace_create_file("snapshot", 0644, d_tracer, +			  (void *)&tr->trace_cpu, &snapshot_fops); +#endif + +	for_each_tracing_cpu(cpu) +		tracing_init_debugfs_percpu(tr, cpu); + +} +  static __init int tracer_init_debugfs(void)  {  	struct dentry *d_tracer; -	int cpu;  	trace_access_lock_init();  	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0; -	trace_create_file("trace_options", 0644, d_tracer, -			NULL, &tracing_iter_fops); +	init_tracer_debugfs(&global_trace, d_tracer);  	trace_create_file("tracing_cpumask", 0644, d_tracer, -			NULL, &tracing_cpumask_fops); - -	trace_create_file("trace", 0644, d_tracer, -			(void *) TRACE_PIPE_ALL_CPU, &tracing_fops); +			&global_trace, &tracing_cpumask_fops);  	trace_create_file("available_tracers", 0444, d_tracer,  			&global_trace, &show_traces_fops); @@ -4996,44 +5981,17 @@ static __init int tracer_init_debugfs(void)  	trace_create_file("README", 0444, d_tracer,  			NULL, &tracing_readme_fops); -	trace_create_file("trace_pipe", 0444, d_tracer, -			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - -	trace_create_file("buffer_size_kb", 0644, d_tracer, -			(void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); - -	trace_create_file("buffer_total_size_kb", 0444, d_tracer, -			&global_trace, &tracing_total_entries_fops); - -	trace_create_file("free_buffer", 0644, d_tracer, -			&global_trace, &tracing_free_buffer_fops); - -	trace_create_file("trace_marker", 0220, d_tracer, -			NULL, &tracing_mark_fops); -  	trace_create_file("saved_cmdlines", 0444, d_tracer,  			NULL, &tracing_saved_cmdlines_fops); -	trace_create_file("trace_clock", 0644, d_tracer, NULL, -			  &trace_clock_fops); - -	trace_create_file("tracing_on", 0644, d_tracer, -			    &global_trace, &rb_simple_fops); -  #ifdef CONFIG_DYNAMIC_FTRACE  	
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,  			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);  #endif -#ifdef CONFIG_TRACER_SNAPSHOT -	trace_create_file("snapshot", 0644, d_tracer, -			  (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); -#endif - -	create_trace_options_dir(); +	create_trace_instances(d_tracer); -	for_each_tracing_cpu(cpu) -		tracing_init_debugfs_percpu(cpu); +	create_trace_options_dir(&global_trace);  	return 0;  } @@ -5089,8 +6047,8 @@ void  trace_printk_seq(struct trace_seq *s)  {  	/* Probably should print a warning here. */ -	if (s->len >= 1000) -		s->len = 1000; +	if (s->len >= TRACE_MAX_PRINT) +		s->len = TRACE_MAX_PRINT;  	/* should be zero ended, but we are paranoid. */  	s->buffer[s->len] = 0; @@ -5103,46 +6061,43 @@ trace_printk_seq(struct trace_seq *s)  void trace_init_global_iter(struct trace_iterator *iter)  {  	iter->tr = &global_trace; -	iter->trace = current_trace; -	iter->cpu_file = TRACE_PIPE_ALL_CPU; +	iter->trace = iter->tr->current_trace; +	iter->cpu_file = RING_BUFFER_ALL_CPUS; +	iter->trace_buffer = &global_trace.trace_buffer;  } -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)  { -	static arch_spinlock_t ftrace_dump_lock = -		(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;  	/* use static because iter can be a bit big for the stack */  	static struct trace_iterator iter; +	static atomic_t dump_running;  	unsigned int old_userobj; -	static int dump_ran;  	unsigned long flags;  	int cnt = 0, cpu; -	/* only one dump */ -	local_irq_save(flags); -	arch_spin_lock(&ftrace_dump_lock); -	if (dump_ran) -		goto out; - -	dump_ran = 1; +	/* Only allow one dump user at a time. */ +	if (atomic_inc_return(&dump_running) != 1) { +		atomic_dec(&dump_running); +		return; +	} +	/* +	 * Always turn off tracing when we dump. +	 * We don't need to show trace output of what happens +	 * between multiple crashes. +	 * +	 * If the user does a sysrq-z, then they can re-enable +	 * tracing with echo 1 > tracing_on. +	 */  	tracing_off(); -	/* Did function tracer already get disabled? */ -	if (ftrace_is_dead()) { -		printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); -		printk("#          MAY BE MISSING FUNCTION EVENTS\n"); -	} - -	if (disable_tracing) -		ftrace_kill(); +	local_irq_save(flags);  	/* Simulate the iterator */  	trace_init_global_iter(&iter);  	for_each_tracing_cpu(cpu) { -		atomic_inc(&iter.tr->data[cpu]->disabled); +		atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);  	}  	old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -5152,7 +6107,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  	switch (oops_dump_mode) {  	case DUMP_ALL: -		iter.cpu_file = TRACE_PIPE_ALL_CPU; +		iter.cpu_file = RING_BUFFER_ALL_CPUS;  		break;  	case DUMP_ORIG:  		iter.cpu_file = raw_smp_processor_id(); @@ -5161,11 +6116,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  		goto out_enable;  	default:  		printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); -		iter.cpu_file = TRACE_PIPE_ALL_CPU; +		iter.cpu_file = RING_BUFFER_ALL_CPUS;  	}  	printk(KERN_TRACE "Dumping ftrace buffer:\n"); +	/* Did function tracer already get disabled? */ +	if (ftrace_is_dead()) { +		printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); +		printk("#          MAY BE MISSING FUNCTION EVENTS\n"); +	} +  	/*  	 * We need to stop all tracing on all CPUS to read the  	 * the next buffer. 
This is a bit expensive, but is @@ -5205,33 +6166,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  		printk(KERN_TRACE "---------------------------------\n");   out_enable: -	/* Re-enable tracing if requested */ -	if (!disable_tracing) { -		trace_flags |= old_userobj; +	trace_flags |= old_userobj; -		for_each_tracing_cpu(cpu) { -			atomic_dec(&iter.tr->data[cpu]->disabled); -		} -		tracing_on(); +	for_each_tracing_cpu(cpu) { +		atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	} - - out: -	arch_spin_unlock(&ftrace_dump_lock); + 	atomic_dec(&dump_running);  	local_irq_restore(flags);  } - -/* By default: disable tracing after the dump */ -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) -{ -	__ftrace_dump(true, oops_dump_mode); -}  EXPORT_SYMBOL_GPL(ftrace_dump);  __init static int tracer_alloc_buffers(void)  {  	int ring_buf_size; -	enum ring_buffer_flags rb_flags; -	int i;  	int ret = -ENOMEM; @@ -5252,49 +6199,27 @@ __init static int tracer_alloc_buffers(void)  	else  		ring_buf_size = 1; -	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; -  	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);  	cpumask_copy(tracing_cpumask, cpu_all_mask); +	raw_spin_lock_init(&global_trace.start_lock); +  	/* TODO: make the number of buffers hot pluggable with CPUS */ -	global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); -	if (!global_trace.buffer) { +	if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {  		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");  		WARN_ON(1);  		goto out_free_cpumask;  	} +  	if (global_trace.buffer_disabled)  		tracing_off(); - -#ifdef CONFIG_TRACER_MAX_TRACE -	max_tr.buffer = ring_buffer_alloc(1, rb_flags); -	if (!max_tr.buffer) { -		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); -		WARN_ON(1); -		ring_buffer_free(global_trace.buffer); -		goto out_free_cpumask; -	} -#endif - -	/* Allocate the first page for all buffers */ -	for_each_tracing_cpu(i) { -		global_trace.data[i] = &per_cpu(global_trace_cpu, i); -		max_tr.data[i] = &per_cpu(max_tr_data, i); -	} - -	set_buffer_entries(&global_trace, -			   ring_buffer_size(global_trace.buffer, 0)); -#ifdef CONFIG_TRACER_MAX_TRACE -	set_buffer_entries(&max_tr, 1); -#endif -  	trace_init_cmdlines(); -	init_irq_work(&trace_work_wakeup, trace_wake_up);  	register_tracer(&nop_trace); +	global_trace.current_trace = &nop_trace; +  	/* All seems OK, enable tracing */  	tracing_disabled = 0; @@ -5303,16 +6228,32 @@ __init static int tracer_alloc_buffers(void)  	register_die_notifier(&trace_die_notifier); +	global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + +	/* Holder for file callbacks */ +	global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; +	global_trace.trace_cpu.tr = &global_trace; + +	INIT_LIST_HEAD(&global_trace.systems); +	INIT_LIST_HEAD(&global_trace.events); +	list_add(&global_trace.list, &ftrace_trace_arrays); +  	while (trace_boot_options) {  		char *option;  		option = strsep(&trace_boot_options, ","); -		trace_set_options(option); +		trace_set_options(&global_trace, option);  	} +	register_snapshot_cmd(); +  	return 0;  out_free_cpumask: +	free_percpu(global_trace.trace_buffer.data); +#ifdef CONFIG_TRACER_MAX_TRACE +	free_percpu(global_trace.max_buffer.data); +#endif  	free_cpumask_var(tracing_cpumask);  out_free_buffer_mask:  	free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 57d7e5397d5..711ca7d3e7f 100644 --- a/kernel/trace/trace.h +++ 
b/kernel/trace/trace.h @@ -13,6 +13,11 @@  #include <linux/trace_seq.h>  #include <linux/ftrace_event.h> +#ifdef CONFIG_FTRACE_SYSCALLS +#include <asm/unistd.h>		/* For NR_SYSCALLS	     */ +#include <asm/syscall.h>	/* some archs define it here */ +#endif +  enum trace_type {  	__TRACE_FIRST_TYPE = 0, @@ -29,6 +34,7 @@ enum trace_type {  	TRACE_GRAPH_ENT,  	TRACE_USER_STACK,  	TRACE_BLK, +	TRACE_BPUTS,  	__TRACE_LAST_TYPE,  }; @@ -103,11 +109,6 @@ struct kretprobe_trace_entry_head {  	unsigned long		ret_ip;  }; -struct uprobe_trace_entry_head { -	struct trace_entry	ent; -	unsigned long		ip; -}; -  /*   * trace_flag_type is an enumeration that holds different   * states when a trace occurs. These are: @@ -127,12 +128,21 @@ enum trace_flag_type {  #define TRACE_BUF_SIZE		1024 +struct trace_array; + +struct trace_cpu { +	struct trace_array	*tr; +	struct dentry		*dir; +	int			cpu; +}; +  /*   * The CPU trace array - it consists of thousands of trace entries   * plus some other descriptor data: (for example which task started   * the trace, etc.)   */  struct trace_array_cpu { +	struct trace_cpu	trace_cpu;  	atomic_t		disabled;  	void			*buffer_page;	/* ring buffer spare */ @@ -151,20 +161,83 @@ struct trace_array_cpu {  	char			comm[TASK_COMM_LEN];  }; +struct tracer; + +struct trace_buffer { +	struct trace_array		*tr; +	struct ring_buffer		*buffer; +	struct trace_array_cpu __percpu	*data; +	cycle_t				time_start; +	int				cpu; +}; +  /*   * The trace array - an array of per-CPU trace arrays. This is the   * highest level data structure that individual tracers deal with.   * They have on/off state as well:   */  struct trace_array { -	struct ring_buffer	*buffer; -	int			cpu; +	struct list_head	list; +	char			*name; +	struct trace_buffer	trace_buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	/* +	 * The max_buffer is used to snapshot the trace when a maximum +	 * latency is reached, or when the user initiates a snapshot. +	 * Some tracers will use this to store a maximum trace while +	 * it continues examining live traces. +	 * +	 * The buffers for the max_buffer are set up the same as the trace_buffer +	 * When a snapshot is taken, the buffer of the max_buffer is swapped +	 * with the buffer of the trace_buffer and the buffers are reset for +	 * the trace_buffer so the tracing can continue. +	 */ +	struct trace_buffer	max_buffer; +	bool			allocated_snapshot; +#endif  	int			buffer_disabled; -	cycle_t			time_start; +	struct trace_cpu	trace_cpu;	/* place holder */ +#ifdef CONFIG_FTRACE_SYSCALLS +	int			sys_refcount_enter; +	int			sys_refcount_exit; +	DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); +	DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +#endif +	int			stop_count; +	int			clock_id; +	struct tracer		*current_trace; +	unsigned int		flags; +	raw_spinlock_t		start_lock; +	struct dentry		*dir; +	struct dentry		*options; +	struct dentry		*percpu_dir; +	struct dentry		*event_dir; +	struct list_head	systems; +	struct list_head	events;  	struct task_struct	*waiter; -	struct trace_array_cpu	*data[NR_CPUS]; +	int			ref; +}; + +enum { +	TRACE_ARRAY_FL_GLOBAL	= (1 << 0)  }; +extern struct list_head ftrace_trace_arrays; + +/* + * The global tracer (top) should be the first trace array added, + * but we check the flag anyway. 
+ */ +static inline struct trace_array *top_trace_array(void) +{ +	struct trace_array *tr; + +	tr = list_entry(ftrace_trace_arrays.prev, +			typeof(*tr), list); +	WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); +	return tr; +} +  #define FTRACE_CMP_TYPE(var, type) \  	__builtin_types_compatible_p(typeof(var), type *) @@ -200,6 +273,7 @@ extern void __ftrace_bad_type(void);  		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\  		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\  		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\ +		IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);	\  		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\  			  TRACE_MMIO_RW);				\  		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\ @@ -283,11 +357,16 @@ struct tracer {  	enum print_line_t	(*print_line)(struct trace_iterator *iter);  	/* If you handled the flag setting, return 0 */  	int			(*set_flag)(u32 old_flags, u32 bit, int set); +	/* Return 0 if OK with change, else return non-zero */ +	int			(*flag_changed)(struct tracer *tracer, +						u32 mask, int set);  	struct tracer		*next;  	struct tracer_flags	*flags;  	bool			print_max; +	bool			enabled; +#ifdef CONFIG_TRACER_MAX_TRACE  	bool			use_max_tr; -	bool			allocated_snapshot; +#endif  }; @@ -423,8 +502,6 @@ static __always_inline void trace_clear_recursion(int bit)  	current->trace_recursion = val;  } -#define TRACE_PIPE_ALL_CPU	-1 -  static inline struct ring_buffer_iter *  trace_buffer_iter(struct trace_iterator *iter, int cpu)  { @@ -435,10 +512,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)  int tracer_init(struct tracer *t, struct trace_array *tr);  int tracing_is_enabled(void); -void tracing_reset(struct trace_array *tr, int cpu); -void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset(struct trace_buffer *buf, int cpu); +void tracing_reset_online_cpus(struct trace_buffer *buf);  void tracing_reset_current(int cpu); -void tracing_reset_current_online_cpus(void); +void tracing_reset_all_online_cpus(void);  int tracing_open_generic(struct inode *inode, struct file *filp);  struct dentry *trace_create_file(const char *name,  				 umode_t mode, @@ -446,6 +523,7 @@ struct dentry *trace_create_file(const char *name,  				 void *data,  				 const struct file_operations *fops); +struct dentry *tracing_init_dentry_tr(struct trace_array *tr);  struct dentry *tracing_init_dentry(void);  struct ring_buffer_event; @@ -579,7 +657,7 @@ extern int DYN_FTRACE_TEST_NAME(void);  #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2  extern int DYN_FTRACE_TEST_NAME2(void); -extern int ring_buffer_expanded; +extern bool ring_buffer_expanded;  extern bool tracing_selftest_disabled;  DECLARE_PER_CPU(int, ftrace_cpu_disabled); @@ -615,6 +693,8 @@ trace_array_vprintk(struct trace_array *tr,  		    unsigned long ip, const char *fmt, va_list args);  int trace_array_printk(struct trace_array *tr,  		       unsigned long ip, const char *fmt, ...); +int trace_array_printk_buf(struct ring_buffer *buffer, +			   unsigned long ip, const char *fmt, ...);  void trace_printk_seq(struct trace_seq *s);  enum print_line_t print_trace_line(struct trace_iterator *iter); @@ -782,6 +862,7 @@ enum trace_iterator_flags {  	TRACE_ITER_STOP_ON_FREE		= 0x400000,  	TRACE_ITER_IRQ_INFO		= 0x800000,  	TRACE_ITER_MARKERS		= 0x1000000, +	TRACE_ITER_FUNCTION		= 0x2000000,  };  /* @@ -828,8 +909,8 @@ enum {  struct ftrace_event_field {  	struct list_head	link; -	char			*name; -	char			*type; +	const char		*name; +	const 
char		*type;  	int			filter_type;  	int			offset;  	int			size; @@ -847,12 +928,19 @@ struct event_filter {  struct event_subsystem {  	struct list_head	list;  	const char		*name; -	struct dentry		*entry;  	struct event_filter	*filter; -	int			nr_events;  	int			ref_count;  }; +struct ftrace_subsystem_dir { +	struct list_head		list; +	struct event_subsystem		*subsystem; +	struct trace_array		*tr; +	struct dentry			*entry; +	int				ref_count; +	int				nr_events; +}; +  #define FILTER_PRED_INVALID	((unsigned short)-1)  #define FILTER_PRED_IS_RIGHT	(1 << 15)  #define FILTER_PRED_FOLD	(1 << 15) @@ -902,22 +990,20 @@ struct filter_pred {  	unsigned short		right;  }; -extern struct list_head ftrace_common_fields; -  extern enum regex_type  filter_parse_regex(char *buff, int len, char **search, int *not);  extern void print_event_filter(struct ftrace_event_call *call,  			       struct trace_seq *s);  extern int apply_event_filter(struct ftrace_event_call *call,  			      char *filter_string); -extern int apply_subsystem_event_filter(struct event_subsystem *system, +extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  					char *filter_string);  extern void print_subsystem_event_filter(struct event_subsystem *system,  					 struct trace_seq *s);  extern int filter_assign_type(const char *type); -struct list_head * -trace_get_fields(struct ftrace_event_call *event_call); +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name);  static inline int  filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -934,6 +1020,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,  }  extern void trace_event_enable_cmd_record(bool enable); +extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); +extern int event_trace_del_tracer(struct trace_array *tr);  extern struct mutex event_mutex;  extern struct list_head ftrace_events; @@ -943,6 +1031,19 @@ extern const char *__stop___trace_bprintk_fmt[];  void trace_printk_init_buffers(void);  void trace_printk_start_comm(void); +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); + +/* + * Normal trace_printk() and friends allocates special buffers + * to do the manipulation, as well as saves the print formats + * into sections to display. But the trace infrastructure wants + * to use these without the added overhead at the price of being + * a bit slower (used mainly for warnings, where we don't care + * about performance). The internal_trace_puts() is for such + * a purpose. 
+ */ +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))  #undef FTRACE_ENTRY  #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter)	\ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 95e96842ed2..d594da0dc03 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  {  	struct ftrace_event_call *call = &event_branch;  	struct trace_array *tr = branch_tracer; +	struct trace_array_cpu *data;  	struct ring_buffer_event *event;  	struct trace_branch *entry;  	struct ring_buffer *buffer; @@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) +	data = per_cpu_ptr(tr->trace_buffer.data, cpu); +	if (atomic_inc_return(&data->disabled) != 1)  		goto out;  	pc = preempt_count(); -	buffer = tr->buffer; +	buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,  					  sizeof(*entry), flags, pc);  	if (!event) @@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  		__buffer_unlock_commit(buffer, event);   out: -	atomic_dec(&tr->data[cpu]->disabled); +	atomic_dec(&data->disabled);  	local_irq_restore(flags);  } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aa8f5f48dae..26dc348332b 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -57,6 +57,16 @@ u64 notrace trace_clock(void)  	return local_clock();  } +/* + * trace_jiffy_clock(): Simply use jiffies as a clock counter. + */ +u64 notrace trace_clock_jiffies(void) +{ +	u64 jiffy = jiffies - INITIAL_JIFFIES; + +	/* Return nsecs */ +	return (u64)jiffies_to_usecs(jiffy) * 1000ULL; +}  /*   * trace_clock_global(): special globally coherent trace clock diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4108e1250ca..e2d027ac66a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry,  		__dynamic_array(	u32,	buf	)  	), -	F_printk("%08lx fmt:%p", -		 __entry->ip, __entry->fmt), +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->fmt),  	FILTER_OTHER  ); @@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry,  		__dynamic_array(	char,	buf	)  	), -	F_printk("%08lx %s", -		 __entry->ip, __entry->buf), +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->buf), + +	FILTER_OTHER +); + +FTRACE_ENTRY(bputs, bputs_entry, + +	TRACE_BPUTS, + +	F_STRUCT( +		__field(	unsigned long,	ip	) +		__field(	const char *,	str	) +	), + +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->str),  	FILTER_OTHER  ); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 57e9b284250..53582e982e5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE];  EXPORT_SYMBOL_GPL(event_storage);  LIST_HEAD(ftrace_events); -LIST_HEAD(ftrace_common_fields); +static LIST_HEAD(ftrace_common_fields); -struct list_head * +#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) + +static struct kmem_cache *field_cachep; +static struct kmem_cache *file_cachep; + +/* Double loops, do not use break, only goto's work */ +#define do_for_each_event_file(tr, file)			\ +	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\ +		list_for_each_entry(file, &tr->events, list) 
+ +#define do_for_each_event_file_safe(tr, file)			\ +	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\ +		struct ftrace_event_file *___n;				\ +		list_for_each_entry_safe(file, ___n, &tr->events, list) + +#define while_for_each_event_file()		\ +	} + +static struct list_head *  trace_get_fields(struct ftrace_event_call *event_call)  {  	if (!event_call->class->get_fields) @@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call)  	return event_call->class->get_fields(event_call);  } +static struct ftrace_event_field * +__find_event_field(struct list_head *head, char *name) +{ +	struct ftrace_event_field *field; + +	list_for_each_entry(field, head, link) { +		if (!strcmp(field->name, name)) +			return field; +	} + +	return NULL; +} + +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name) +{ +	struct ftrace_event_field *field; +	struct list_head *head; + +	field = __find_event_field(&ftrace_common_fields, name); +	if (field) +		return field; + +	head = trace_get_fields(call); +	return __find_event_field(head, name); +} +  static int __trace_define_field(struct list_head *head, const char *type,  				const char *name, int offset, int size,  				int is_signed, int filter_type)  {  	struct ftrace_event_field *field; -	field = kzalloc(sizeof(*field), GFP_KERNEL); +	field = kmem_cache_alloc(field_cachep, GFP_TRACE);  	if (!field)  		goto err; -	field->name = kstrdup(name, GFP_KERNEL); -	if (!field->name) -		goto err; - -	field->type = kstrdup(type, GFP_KERNEL); -	if (!field->type) -		goto err; +	field->name = name; +	field->type = type;  	if (filter_type == FILTER_OTHER)  		field->filter_type = filter_assign_type(type); @@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type,  	return 0;  err: -	if (field) -		kfree(field->name); -	kfree(field); +	kmem_cache_free(field_cachep, field);  	return -ENOMEM;  } @@ -120,7 +158,7 @@ static int trace_define_common_fields(void)  	return ret;  } -void trace_destroy_fields(struct ftrace_event_call *call) +static void trace_destroy_fields(struct ftrace_event_call *call)  {  	struct ftrace_event_field *field, *next;  	struct list_head *head; @@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)  	head = trace_get_fields(call);  	list_for_each_entry_safe(field, next, head, link) {  		list_del(&field->link); -		kfree(field->type); -		kfree(field->name); -		kfree(field); +		kmem_cache_free(field_cachep, field);  	}  } @@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);  int ftrace_event_reg(struct ftrace_event_call *call,  		     enum trace_reg type, void *data)  { +	struct ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER:  		return tracepoint_probe_register(call->name,  						 call->class->probe, -						 call); +						 file);  	case TRACE_REG_UNREGISTER:  		tracepoint_probe_unregister(call->name,  					    call->class->probe, -					    call); +					    file);  		return 0;  #ifdef CONFIG_PERF_EVENTS @@ -183,54 +221,100 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);  void trace_event_enable_cmd_record(bool enable)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { -		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) +	do_for_each_event_file(tr, file) { + +		if (!(file->flags & FTRACE_EVENT_FL_ENABLED))  			continue;  		if (enable) {  			tracing_start_cmdline_record(); -			call->flags |= 
TRACE_EVENT_FL_RECORDED_CMD; +			set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  		} else {  			tracing_stop_cmdline_record(); -			call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; +			clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  		} -	} +	} while_for_each_event_file();  	mutex_unlock(&event_mutex);  } -static int ftrace_event_enable_disable(struct ftrace_event_call *call, -					int enable) +static int __ftrace_event_enable_disable(struct ftrace_event_file *file, +					 int enable, int soft_disable)  { +	struct ftrace_event_call *call = file->event_call;  	int ret = 0; +	int disable;  	switch (enable) {  	case 0: -		if (call->flags & TRACE_EVENT_FL_ENABLED) { -			call->flags &= ~TRACE_EVENT_FL_ENABLED; -			if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { +		/* +		 * When soft_disable is set and enable is cleared, we want +		 * to clear the SOFT_DISABLED flag but leave the event in the +		 * state that it was. That is, if the event was enabled and +		 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED +		 * is set we do not want the event to be enabled before we +		 * clear the bit. +		 * +		 * When soft_disable is not set but the SOFT_MODE flag is, +		 * we do nothing. Do not disable the tracepoint, otherwise +		 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. +		 */ +		if (soft_disable) { +			disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; +			clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); +		} else +			disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + +		if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { +			clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); +			if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {  				tracing_stop_cmdline_record(); -				call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; +				clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  			} -			call->class->reg(call, TRACE_REG_UNREGISTER, NULL); +			call->class->reg(call, TRACE_REG_UNREGISTER, file);  		} +		/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ +		if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) +			set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);  		break;  	case 1: -		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { +		/* +		 * When soft_disable is set and enable is set, we want to +		 * register the tracepoint for the event, but leave the event +		 * as is. That means, if the event was already enabled, we do +		 * nothing (but set SOFT_MODE). If the event is disabled, we +		 * set SOFT_DISABLED before enabling the event tracepoint, so +		 * it still seems to be disabled. +		 */ +		if (!soft_disable) +			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); +		else +			set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + +		if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + +			/* Keep the event disabled, when going to SOFT_MODE. 
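A userspace model may help here (editor's sketch, not part of the patch): it reproduces only the "disable" decision that __ftrace_event_enable_disable() derives from the soft_disable argument and the new SOFT_MODE/SOFT_DISABLED bits. The FL_* names below are illustrative stand-ins for the FTRACE_EVENT_FL_* flags in the patch, and the kernel additionally checks the ENABLED bit before actually unregistering the tracepoint.

	#include <stdbool.h>
	#include <stdio.h>

	enum {
		FL_ENABLED       = 1 << 0,	/* stand-ins for the FTRACE_EVENT_FL_* bits */
		FL_SOFT_MODE     = 1 << 1,
		FL_SOFT_DISABLED = 1 << 2,
	};

	/* Disable path (enable == 0): compute the "disable" decision only. */
	static bool want_teardown(unsigned int flags, bool soft_disable)
	{
		if (soft_disable)
			/* A trigger goes away: tear down only if it was soft-disabled. */
			return flags & FL_SOFT_DISABLED;
		/* A hard disable leaves a soft-mode (trigger-held) event registered. */
		return !(flags & FL_SOFT_MODE);
	}

	int main(void)
	{
		printf("%d\n", want_teardown(FL_ENABLED, false));			/* 1 */
		printf("%d\n", want_teardown(FL_ENABLED | FL_SOFT_MODE, false));	/* 0 */
		printf("%d\n", want_teardown(FL_ENABLED | FL_SOFT_MODE |
					     FL_SOFT_DISABLED, true));			/* 1 */
		return 0;
	}
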
*/ +			if (soft_disable) +				set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); +  			if (trace_flags & TRACE_ITER_RECORD_CMD) {  				tracing_start_cmdline_record(); -				call->flags |= TRACE_EVENT_FL_RECORDED_CMD; +				set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  			} -			ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); +			ret = call->class->reg(call, TRACE_REG_REGISTER, file);  			if (ret) {  				tracing_stop_cmdline_record();  				pr_info("event trace: Could not enable event "  					"%s\n", call->name);  				break;  			} -			call->flags |= TRACE_EVENT_FL_ENABLED; +			set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + +			/* WAS_ENABLED gets set but never cleared. */ +			call->flags |= TRACE_EVENT_FL_WAS_ENABLED;  		}  		break;  	} @@ -238,13 +322,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,  	return ret;  } -static void ftrace_clear_events(void) +static int ftrace_event_enable_disable(struct ftrace_event_file *file, +				       int enable)  { -	struct ftrace_event_call *call; +	return __ftrace_event_enable_disable(file, enable, 0); +} + +static void ftrace_clear_events(struct trace_array *tr) +{ +	struct ftrace_event_file *file;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { -		ftrace_event_enable_disable(call, 0); +	list_for_each_entry(file, &tr->events, list) { +		ftrace_event_enable_disable(file, 0);  	}  	mutex_unlock(&event_mutex);  } @@ -257,11 +347,12 @@ static void __put_system(struct event_subsystem *system)  	if (--system->ref_count)  		return; +	list_del(&system->list); +  	if (filter) {  		kfree(filter->filter_string);  		kfree(filter);  	} -	kfree(system->name);  	kfree(system);  } @@ -271,24 +362,45 @@ static void __get_system(struct event_subsystem *system)  	system->ref_count++;  } -static void put_system(struct event_subsystem *system) +static void __get_system_dir(struct ftrace_subsystem_dir *dir) +{ +	WARN_ON_ONCE(dir->ref_count == 0); +	dir->ref_count++; +	__get_system(dir->subsystem); +} + +static void __put_system_dir(struct ftrace_subsystem_dir *dir) +{ +	WARN_ON_ONCE(dir->ref_count == 0); +	/* If the subsystem is about to be freed, the dir must be too */ +	WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); + +	__put_system(dir->subsystem); +	if (!--dir->ref_count) +		kfree(dir); +} + +static void put_system(struct ftrace_subsystem_dir *dir)  {  	mutex_lock(&event_mutex); -	__put_system(system); +	__put_system_dir(dir);  	mutex_unlock(&event_mutex);  }  /*   * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.   
*/ -static int __ftrace_set_clr_event(const char *match, const char *sub, -				  const char *event, int set) +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, +				  const char *sub, const char *event, int set)  { +	struct ftrace_event_file *file;  	struct ftrace_event_call *call;  	int ret = -EINVAL;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call;  		if (!call->name || !call->class || !call->class->reg)  			continue; @@ -307,7 +419,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,  		if (event && strcmp(event, call->name) != 0)  			continue; -		ftrace_event_enable_disable(call, set); +		ftrace_event_enable_disable(file, set);  		ret = 0;  	} @@ -316,7 +428,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,  	return ret;  } -static int ftrace_set_clr_event(char *buf, int set) +static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)  {  	char *event = NULL, *sub = NULL, *match; @@ -344,7 +456,7 @@ static int ftrace_set_clr_event(char *buf, int set)  			event = NULL;  	} -	return __ftrace_set_clr_event(match, sub, event, set); +	return __ftrace_set_clr_event(tr, match, sub, event, set);  }  /** @@ -361,7 +473,9 @@ static int ftrace_set_clr_event(char *buf, int set)   */  int trace_set_clr_event(const char *system, const char *event, int set)  { -	return __ftrace_set_clr_event(NULL, system, event, set); +	struct trace_array *tr = top_trace_array(); + +	return __ftrace_set_clr_event(tr, NULL, system, event, set);  }  EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -373,6 +487,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  		   size_t cnt, loff_t *ppos)  {  	struct trace_parser parser; +	struct seq_file *m = file->private_data; +	struct trace_array *tr = m->private;  	ssize_t read, ret;  	if (!cnt) @@ -395,7 +511,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  		parser.buffer[parser.idx] = 0; -		ret = ftrace_set_clr_event(parser.buffer + !set, set); +		ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);  		if (ret)  			goto out_put;  	} @@ -411,17 +527,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  static void *  t_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct ftrace_event_call *call; +	struct trace_array *tr = m->private;  	(*pos)++; -	list_for_each_entry_continue(call, &ftrace_events, list) { +	list_for_each_entry_continue(file, &tr->events, list) { +		call = file->event_call;  		/*  		 * The ftrace subsystem is for showing formats only.  		 * They can not be enabled or disabled via the event files.  		 
*/  		if (call->class && call->class->reg) -			return call; +			return file;  	}  	return NULL; @@ -429,30 +548,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)  static void *t_start(struct seq_file *m, loff_t *pos)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = m->private;  	loff_t l;  	mutex_lock(&event_mutex); -	call = list_entry(&ftrace_events, struct ftrace_event_call, list); +	file = list_entry(&tr->events, struct ftrace_event_file, list);  	for (l = 0; l <= *pos; ) { -		call = t_next(m, call, &l); -		if (!call) +		file = t_next(m, file, &l); +		if (!file)  			break;  	} -	return call; +	return file;  }  static void *  s_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct trace_array *tr = m->private;  	(*pos)++; -	list_for_each_entry_continue(call, &ftrace_events, list) { -		if (call->flags & TRACE_EVENT_FL_ENABLED) -			return call; +	list_for_each_entry_continue(file, &tr->events, list) { +		if (file->flags & FTRACE_EVENT_FL_ENABLED) +			return file;  	}  	return NULL; @@ -460,23 +581,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos)  static void *s_start(struct seq_file *m, loff_t *pos)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = m->private;  	loff_t l;  	mutex_lock(&event_mutex); -	call = list_entry(&ftrace_events, struct ftrace_event_call, list); +	file = list_entry(&tr->events, struct ftrace_event_file, list);  	for (l = 0; l <= *pos; ) { -		call = s_next(m, call, &l); -		if (!call) +		file = s_next(m, file, &l); +		if (!file)  			break;  	} -	return call; +	return file;  }  static int t_show(struct seq_file *m, void *v)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct ftrace_event_call *call = file->event_call;  	if (strcmp(call->class->system, TRACE_SYSTEM) != 0)  		seq_printf(m, "%s:", call->class->system); @@ -494,25 +617,31 @@ static ssize_t  event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		  loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; +	struct ftrace_event_file *file = filp->private_data;  	char *buf; -	if (call->flags & TRACE_EVENT_FL_ENABLED) -		buf = "1\n"; -	else +	if (file->flags & FTRACE_EVENT_FL_ENABLED) { +		if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) +			buf = "0*\n"; +		else +			buf = "1\n"; +	} else  		buf = "0\n"; -	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));  }  static ssize_t  event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  		   loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; +	struct ftrace_event_file *file = filp->private_data;  	unsigned long val;  	int ret; +	if (!file) +		return -EINVAL; +  	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);  	if (ret)  		return ret; @@ -525,7 +654,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  	case 0:  	case 1:  		mutex_lock(&event_mutex); -		ret = ftrace_event_enable_disable(call, val); +		ret = ftrace_event_enable_disable(file, val);  		mutex_unlock(&event_mutex);  		break; @@ -543,14 +672,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		   loff_t *ppos)  {  	const char set_to_char[4] = { '?', '0', '1', 'X' }; -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct 
event_subsystem *system = dir->subsystem;  	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = dir->tr;  	char buf[2];  	int set = 0;  	int ret;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { +		call = file->event_call;  		if (!call->name || !call->class || !call->class->reg)  			continue; @@ -562,7 +695,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		 * or if all events or cleared, or if we have  		 * a mixture.  		 */ -		set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); +		set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));  		/*  		 * If we have a mixture, no need to look further. @@ -584,7 +717,8 @@ static ssize_t  system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  		    loff_t *ppos)  { -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct event_subsystem *system = dir->subsystem;  	const char *name = NULL;  	unsigned long val;  	ssize_t ret; @@ -607,7 +741,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  	if (system)  		name = system->name; -	ret = __ftrace_set_clr_event(NULL, name, NULL, val); +	ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);  	if (ret)  		goto out; @@ -845,43 +979,75 @@ static LIST_HEAD(event_subsystems);  static int subsystem_open(struct inode *inode, struct file *filp)  {  	struct event_subsystem *system = NULL; +	struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ +	struct trace_array *tr;  	int ret; -	if (!inode->i_private) -		goto skip_search; -  	/* Make sure the system still exists */  	mutex_lock(&event_mutex); -	list_for_each_entry(system, &event_subsystems, list) { -		if (system == inode->i_private) { -			/* Don't open systems with no events */ -			if (!system->nr_events) { -				system = NULL; -				break; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		list_for_each_entry(dir, &tr->systems, list) { +			if (dir == inode->i_private) { +				/* Don't open systems with no events */ +				if (dir->nr_events) { +					__get_system_dir(dir); +					system = dir->subsystem; +				} +				goto exit_loop;  			} -			__get_system(system); -			break;  		}  	} + exit_loop:  	mutex_unlock(&event_mutex); -	if (system != inode->i_private) +	if (!system)  		return -ENODEV; - skip_search: +	/* Some versions of gcc think dir can be uninitialized here */ +	WARN_ON(!dir); + +	ret = tracing_open_generic(inode, filp); +	if (ret < 0) +		put_system(dir); + +	return ret; +} + +static int system_tr_open(struct inode *inode, struct file *filp) +{ +	struct ftrace_subsystem_dir *dir; +	struct trace_array *tr = inode->i_private; +	int ret; + +	/* Make a temporary dir that has no system but points to tr */ +	dir = kzalloc(sizeof(*dir), GFP_KERNEL); +	if (!dir) +		return -ENOMEM; + +	dir->tr = tr; +  	ret = tracing_open_generic(inode, filp); -	if (ret < 0 && system) -		put_system(system); +	if (ret < 0) +		kfree(dir); + +	filp->private_data = dir;  	return ret;  }  static int subsystem_release(struct inode *inode, struct file *file)  { -	struct event_subsystem *system = inode->i_private; +	struct ftrace_subsystem_dir *dir = file->private_data; -	if (system) -		put_system(system); +	/* +	 * If dir->subsystem is NULL, then this is a temporary +	 * descriptor that was made for a trace_array to enable +	 * all subsystems. 
+	 */ +	if (dir->subsystem) +		put_system(dir); +	else +		kfree(dir);  	return 0;  } @@ -890,7 +1056,8 @@ static ssize_t  subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,  		      loff_t *ppos)  { -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct event_subsystem *system = dir->subsystem;  	struct trace_seq *s;  	int r; @@ -915,7 +1082,7 @@ static ssize_t  subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  		       loff_t *ppos)  { -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data;  	char *buf;  	int err; @@ -932,7 +1099,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  	}  	buf[cnt] = '\0'; -	err = apply_subsystem_event_filter(system, buf); +	err = apply_subsystem_event_filter(dir, buf);  	free_page((unsigned long) buf);  	if (err < 0)  		return err; @@ -1041,30 +1208,35 @@ static const struct file_operations ftrace_system_enable_fops = {  	.release = subsystem_release,  }; +static const struct file_operations ftrace_tr_enable_fops = { +	.open = system_tr_open, +	.read = system_enable_read, +	.write = system_enable_write, +	.llseek = default_llseek, +	.release = subsystem_release, +}; +  static const struct file_operations ftrace_show_header_fops = {  	.open = tracing_open_generic,  	.read = show_header,  	.llseek = default_llseek,  }; -static struct dentry *event_trace_events_dir(void) +static int +ftrace_event_open(struct inode *inode, struct file *file, +		  const struct seq_operations *seq_ops)  { -	static struct dentry *d_tracer; -	static struct dentry *d_events; - -	if (d_events) -		return d_events; - -	d_tracer = tracing_init_dentry(); -	if (!d_tracer) -		return NULL; +	struct seq_file *m; +	int ret; -	d_events = debugfs_create_dir("events", d_tracer); -	if (!d_events) -		pr_warning("Could not create debugfs " -			   "'events' directory\n"); +	ret = seq_open(file, seq_ops); +	if (ret < 0) +		return ret; +	m = file->private_data; +	/* copy tr over to seq ops */ +	m->private = inode->i_private; -	return d_events; +	return ret;  }  static int @@ -1072,117 +1244,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file)  {  	const struct seq_operations *seq_ops = &show_event_seq_ops; -	return seq_open(file, seq_ops); +	return ftrace_event_open(inode, file, seq_ops);  }  static int  ftrace_event_set_open(struct inode *inode, struct file *file)  {  	const struct seq_operations *seq_ops = &show_set_event_seq_ops; +	struct trace_array *tr = inode->i_private;  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC)) -		ftrace_clear_events(); +		ftrace_clear_events(tr); -	return seq_open(file, seq_ops); +	return ftrace_event_open(inode, file, seq_ops); +} + +static struct event_subsystem * +create_new_subsystem(const char *name) +{ +	struct event_subsystem *system; + +	/* need to create new entry */ +	system = kmalloc(sizeof(*system), GFP_KERNEL); +	if (!system) +		return NULL; + +	system->ref_count = 1; +	system->name = name; + +	system->filter = NULL; + +	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); +	if (!system->filter) +		goto out_free; + +	list_add(&system->list, &event_subsystems); + +	return system; + + out_free: +	kfree(system); +	return NULL;  }  static struct dentry * -event_subsystem_dir(const char *name, struct dentry *d_events) +event_subsystem_dir(struct trace_array *tr, const char *name, +		    struct 
ftrace_event_file *file, struct dentry *parent)  { +	struct ftrace_subsystem_dir *dir;  	struct event_subsystem *system;  	struct dentry *entry;  	/* First see if we did not already create this dir */ -	list_for_each_entry(system, &event_subsystems, list) { +	list_for_each_entry(dir, &tr->systems, list) { +		system = dir->subsystem;  		if (strcmp(system->name, name) == 0) { -			system->nr_events++; -			return system->entry; +			dir->nr_events++; +			file->system = dir; +			return dir->entry;  		}  	} -	/* need to create new entry */ -	system = kmalloc(sizeof(*system), GFP_KERNEL); -	if (!system) { -		pr_warning("No memory to create event subsystem %s\n", -			   name); -		return d_events; -	} - -	system->entry = debugfs_create_dir(name, d_events); -	if (!system->entry) { -		pr_warning("Could not create event subsystem %s\n", -			   name); -		kfree(system); -		return d_events; -	} - -	system->nr_events = 1; -	system->ref_count = 1; -	system->name = kstrdup(name, GFP_KERNEL); -	if (!system->name) { -		debugfs_remove(system->entry); -		kfree(system); -		return d_events; +	/* Now see if the system itself exists. */ +	list_for_each_entry(system, &event_subsystems, list) { +		if (strcmp(system->name, name) == 0) +			break;  	} +	/* Reset system variable when not found */ +	if (&system->list == &event_subsystems) +		system = NULL; -	list_add(&system->list, &event_subsystems); +	dir = kmalloc(sizeof(*dir), GFP_KERNEL); +	if (!dir) +		goto out_fail; -	system->filter = NULL; +	if (!system) { +		system = create_new_subsystem(name); +		if (!system) +			goto out_free; +	} else +		__get_system(system); -	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); -	if (!system->filter) { -		pr_warning("Could not allocate filter for subsystem " -			   "'%s'\n", name); -		return system->entry; +	dir->entry = debugfs_create_dir(name, parent); +	if (!dir->entry) { +		pr_warning("Failed to create system directory %s\n", name); +		__put_system(system); +		goto out_free;  	} -	entry = debugfs_create_file("filter", 0644, system->entry, system, +	dir->tr = tr; +	dir->ref_count = 1; +	dir->nr_events = 1; +	dir->subsystem = system; +	file->system = dir; + +	entry = debugfs_create_file("filter", 0644, dir->entry, dir,  				    &ftrace_subsystem_filter_fops);  	if (!entry) {  		kfree(system->filter);  		system->filter = NULL; -		pr_warning("Could not create debugfs " -			   "'%s/filter' entry\n", name); +		pr_warning("Could not create debugfs '%s/filter' entry\n", name);  	} -	trace_create_file("enable", 0644, system->entry, system, +	trace_create_file("enable", 0644, dir->entry, dir,  			  &ftrace_system_enable_fops); -	return system->entry; +	list_add(&dir->list, &tr->systems); + +	return dir->entry; + + out_free: +	kfree(dir); + out_fail: +	/* Only print this message if failed on memory allocation */ +	if (!dir || !system) +		pr_warning("No memory to create event subsystem %s\n", +			   name); +	return NULL;  }  static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, +event_create_dir(struct dentry *parent, +		 struct ftrace_event_file *file,  		 const struct file_operations *id,  		 const struct file_operations *enable,  		 const struct file_operations *filter,  		 const struct file_operations *format)  { +	struct ftrace_event_call *call = file->event_call; +	struct trace_array *tr = file->tr;  	struct list_head *head; +	struct dentry *d_events;  	int ret;  	/*  	 * If the trace point header did not define TRACE_SYSTEM  	 * then the system would be called "TRACE_SYSTEM".  	 
*/ -	if (strcmp(call->class->system, TRACE_SYSTEM) != 0) -		d_events = event_subsystem_dir(call->class->system, d_events); +	if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { +		d_events = event_subsystem_dir(tr, call->class->system, file, parent); +		if (!d_events) +			return -ENOMEM; +	} else +		d_events = parent; -	call->dir = debugfs_create_dir(call->name, d_events); -	if (!call->dir) { -		pr_warning("Could not create debugfs " -			   "'%s' directory\n", call->name); +	file->dir = debugfs_create_dir(call->name, d_events); +	if (!file->dir) { +		pr_warning("Could not create debugfs '%s' directory\n", +			   call->name);  		return -1;  	}  	if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) -		trace_create_file("enable", 0644, call->dir, call, +		trace_create_file("enable", 0644, file->dir, file,  				  enable);  #ifdef CONFIG_PERF_EVENTS  	if (call->event.type && call->class->reg) -		trace_create_file("id", 0444, call->dir, call, +		trace_create_file("id", 0444, file->dir, call,  		 		  id);  #endif @@ -1196,23 +1416,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,  		if (ret < 0) {  			pr_warning("Could not initialize trace point"  				   " events/%s\n", call->name); -			return ret; +			return -1;  		}  	} -	trace_create_file("filter", 0644, call->dir, call, +	trace_create_file("filter", 0644, file->dir, call,  			  filter); -	trace_create_file("format", 0444, call->dir, call, +	trace_create_file("format", 0444, file->dir, call,  			  format);  	return 0;  } +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ +	if (!dir) +		return; + +	if (!--dir->nr_events) { +		debugfs_remove_recursive(dir->entry); +		list_del(&dir->list); +		__put_system_dir(dir); +	} +} + +static void remove_event_from_tracers(struct ftrace_event_call *call) +{ +	struct ftrace_event_file *file; +	struct trace_array *tr; + +	do_for_each_event_file_safe(tr, file) { + +		if (file->event_call != call) +			continue; + +		list_del(&file->list); +		debugfs_remove_recursive(file->dir); +		remove_subsystem(file->system); +		kmem_cache_free(file_cachep, file); + +		/* +		 * The do_for_each_event_file_safe() is +		 * a double loop. After finding the call for this +		 * trace_array, we use break to jump to the next +		 * trace_array. +		 */ +		break; +	} while_for_each_event_file(); +} +  static void event_remove(struct ftrace_event_call *call)  { -	ftrace_event_enable_disable(call, 0); +	struct trace_array *tr; +	struct ftrace_event_file *file; + +	do_for_each_event_file(tr, file) { +		if (file->event_call != call) +			continue; +		ftrace_event_enable_disable(file, 0); +		/* +		 * The do_for_each_event_file() is +		 * a double loop. After finding the call for this +		 * trace_array, we use break to jump to the next +		 * trace_array. 
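The "break only leaves the inner loop" behaviour that this comment relies on is easy to miss. Below is a small self-contained model (editor's sketch, not from the patch: plain counters instead of list_for_each_entry(), and the macro names are invented for the example) of how the do_for_each_event_file()/while_for_each_event_file() pair is structured.

	#include <stdio.h>

	#define do_for_each_tracer_file(t, f, ntr, nf)		\
		for (t = 0; t < (ntr); t++) {			\
			for (f = 0; f < (nf); f++)

	#define while_for_each_tracer_file()	}

	int main(void)
	{
		int tr, file;

		do_for_each_tracer_file(tr, file, 2, 3) {
			printf("tracer %d, file %d\n", tr, file);
			if (file == 1)
				break;	/* leaves only the inner loop */
		} while_for_each_tracer_file();

		return 0;
	}
	/* Prints files 0 and 1 for tracer 0, then again for tracer 1; file 2 is skipped. */
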
+		 */ +		break; +	} while_for_each_event_file(); +  	if (call->event.funcs)  		__unregister_ftrace_event(&call->event); +	remove_event_from_tracers(call);  	list_del(&call->list);  } @@ -1234,82 +1507,99 @@ static int event_init(struct ftrace_event_call *call)  }  static int -__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, -		       const struct file_operations *id, -		       const struct file_operations *enable, -		       const struct file_operations *filter, -		       const struct file_operations *format) +__register_event(struct ftrace_event_call *call, struct module *mod)  { -	struct dentry *d_events;  	int ret;  	ret = event_init(call);  	if (ret < 0)  		return ret; -	d_events = event_trace_events_dir(); -	if (!d_events) -		return -ENOENT; - -	ret = event_create_dir(call, d_events, id, enable, filter, format); -	if (!ret) -		list_add(&call->list, &ftrace_events); +	list_add(&call->list, &ftrace_events);  	call->mod = mod; -	return ret; +	return 0; +} + +/* Add an event to a trace directory */ +static int +__trace_add_new_event(struct ftrace_event_call *call, +		      struct trace_array *tr, +		      const struct file_operations *id, +		      const struct file_operations *enable, +		      const struct file_operations *filter, +		      const struct file_operations *format) +{ +	struct ftrace_event_file *file; + +	file = kmem_cache_alloc(file_cachep, GFP_TRACE); +	if (!file) +		return -ENOMEM; + +	file->event_call = call; +	file->tr = tr; +	list_add(&file->list, &tr->events); + +	return event_create_dir(tr->event_dir, file, id, enable, filter, format); +} + +/* + * Just create a decriptor for early init. A descriptor is required + * for enabling events at boot. We want to enable events before + * the filesystem is initialized. + */ +static __init int +__trace_early_add_new_event(struct ftrace_event_call *call, +			    struct trace_array *tr) +{ +	struct ftrace_event_file *file; + +	file = kmem_cache_alloc(file_cachep, GFP_TRACE); +	if (!file) +		return -ENOMEM; + +	file->event_call = call; +	file->tr = tr; +	list_add(&file->list, &tr->events); + +	return 0;  } +struct ftrace_module_file_ops; +static void __add_event_to_tracers(struct ftrace_event_call *call, +				   struct ftrace_module_file_ops *file_ops); +  /* Add an additional event_call dynamically */  int trace_add_event_call(struct ftrace_event_call *call)  {  	int ret;  	mutex_lock(&event_mutex); -	ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, -				     &ftrace_enable_fops, -				     &ftrace_event_filter_fops, -				     &ftrace_event_format_fops); -	mutex_unlock(&event_mutex); -	return ret; -} -static void remove_subsystem_dir(const char *name) -{ -	struct event_subsystem *system; - -	if (strcmp(name, TRACE_SYSTEM) == 0) -		return; +	ret = __register_event(call, NULL); +	if (ret >= 0) +		__add_event_to_tracers(call, NULL); -	list_for_each_entry(system, &event_subsystems, list) { -		if (strcmp(system->name, name) == 0) { -			if (!--system->nr_events) { -				debugfs_remove_recursive(system->entry); -				list_del(&system->list); -				__put_system(system); -			} -			break; -		} -	} +	mutex_unlock(&event_mutex); +	return ret;  }  /* - * Must be called under locking both of event_mutex and trace_event_mutex. + * Must be called under locking both of event_mutex and trace_event_sem.   
*/  static void __trace_remove_event_call(struct ftrace_event_call *call)  {  	event_remove(call);  	trace_destroy_fields(call);  	destroy_preds(call); -	debugfs_remove_recursive(call->dir); -	remove_subsystem_dir(call->class->system);  }  /* Remove an event_call */  void trace_remove_event_call(struct ftrace_event_call *call)  {  	mutex_lock(&event_mutex); -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	__trace_remove_event_call(call); -	up_write(&trace_event_mutex); +	up_write(&trace_event_sem);  	mutex_unlock(&event_mutex);  } @@ -1336,6 +1626,26 @@ struct ftrace_module_file_ops {  };  static struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) +{ +	/* +	 * As event_calls are added in groups by module, +	 * when we find one file_ops, we don't need to search for +	 * each call in that module, as the rest should be the +	 * same. Only search for a new one if the last one did +	 * not match. +	 */ +	if (file_ops && mod == file_ops->mod) +		return file_ops; + +	list_for_each_entry(file_ops, &ftrace_module_file_list, list) { +		if (file_ops->mod == mod) +			return file_ops; +	} +	return NULL; +} + +static struct ftrace_module_file_ops *  trace_create_file_ops(struct module *mod)  {  	struct ftrace_module_file_ops *file_ops; @@ -1386,9 +1696,8 @@ static void trace_module_add_events(struct module *mod)  		return;  	for_each_event(call, start, end) { -		__trace_add_event_call(*call, mod, -				       &file_ops->id, &file_ops->enable, -				       &file_ops->filter, &file_ops->format); +		__register_event(*call, mod); +		__add_event_to_tracers(*call, file_ops);  	}  } @@ -1396,12 +1705,13 @@ static void trace_module_remove_events(struct module *mod)  {  	struct ftrace_module_file_ops *file_ops;  	struct ftrace_event_call *call, *p; -	bool found = false; +	bool clear_trace = false; -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	list_for_each_entry_safe(call, p, &ftrace_events, list) {  		if (call->mod == mod) { -			found = true; +			if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) +				clear_trace = true;  			__trace_remove_event_call(call);  		}  	} @@ -1415,14 +1725,18 @@ static void trace_module_remove_events(struct module *mod)  		list_del(&file_ops->list);  		kfree(file_ops);  	} +	up_write(&trace_event_sem);  	/*  	 * It is safest to reset the ring buffer if the module being unloaded -	 * registered any events. +	 * registered any events that were used. The only worry is if +	 * a new module gets loaded, and takes on the same id as the events +	 * of this module. When printing out the buffer, traced events left +	 * over from this module may be passed to the new module events and +	 * unexpected results may occur.  	 
*/ -	if (found) -		tracing_reset_current_online_cpus(); -	up_write(&trace_event_mutex); +	if (clear_trace) +		tracing_reset_all_online_cpus();  }  static int trace_module_notify(struct notifier_block *self, @@ -1443,14 +1757,433 @@ static int trace_module_notify(struct notifier_block *self,  	return 0;  } + +static int +__trace_add_new_mod_event(struct ftrace_event_call *call, +			  struct trace_array *tr, +			  struct ftrace_module_file_ops *file_ops) +{ +	return __trace_add_new_event(call, tr, +				     &file_ops->id, &file_ops->enable, +				     &file_ops->filter, &file_ops->format); +} +  #else -static int trace_module_notify(struct notifier_block *self, -			       unsigned long val, void *data) +static inline struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) +{ +	return NULL; +} +static inline int trace_module_notify(struct notifier_block *self, +				      unsigned long val, void *data)  {  	return 0;  } +static inline int +__trace_add_new_mod_event(struct ftrace_event_call *call, +			  struct trace_array *tr, +			  struct ftrace_module_file_ops *file_ops) +{ +	return -ENODEV; +}  #endif /* CONFIG_MODULES */ +/* Create a new event directory structure for a trace directory. */ +static void +__trace_add_event_dirs(struct trace_array *tr) +{ +	struct ftrace_module_file_ops *file_ops = NULL; +	struct ftrace_event_call *call; +	int ret; + +	list_for_each_entry(call, &ftrace_events, list) { +		if (call->mod) { +			/* +			 * Directories for events by modules need to +			 * keep module ref counts when opened (as we don't +			 * want the module to disappear when reading one +			 * of these files). The file_ops keep account of +			 * the module ref count. +			 */ +			file_ops = find_ftrace_file_ops(file_ops, call->mod); +			if (!file_ops) +				continue; /* Warn? 
*/ +			ret = __trace_add_new_mod_event(call, tr, file_ops); +			if (ret < 0) +				pr_warning("Could not create directory for event %s\n", +					   call->name); +			continue; +		} +		ret = __trace_add_new_event(call, tr, +					    &ftrace_event_id_fops, +					    &ftrace_enable_fops, +					    &ftrace_event_filter_fops, +					    &ftrace_event_format_fops); +		if (ret < 0) +			pr_warning("Could not create directory for event %s\n", +				   call->name); +	} +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR	"enable_event" +#define DISABLE_EVENT_STR	"disable_event" + +struct event_probe_data { +	struct ftrace_event_file	*file; +	unsigned long			count; +	int				ref; +	bool				enable; +}; + +static struct ftrace_event_file * +find_event_file(struct trace_array *tr, const char *system,  const char *event) +{ +	struct ftrace_event_file *file; +	struct ftrace_event_call *call; + +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call; + +		if (!call->name || !call->class || !call->class->reg) +			continue; + +		if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) +			continue; + +		if (strcmp(event, call->name) == 0 && +		    strcmp(system, call->class->system) == 0) +			return file; +	} +	return NULL; +} + +static void +event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (!data) +		return; + +	if (data->enable) +		clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +	else +		set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (!data) +		return; + +	if (!data->count) +		return; + +	/* Skip if the event is in a state we want to switch to */ +	if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) +		return; + +	if (data->count != -1) +		(data->count)--; + +	event_enable_probe(ip, parent_ip, _data); +} + +static int +event_enable_print(struct seq_file *m, unsigned long ip, +		      struct ftrace_probe_ops *ops, void *_data) +{ +	struct event_probe_data *data = _data; + +	seq_printf(m, "%ps:", (void *)ip); + +	seq_printf(m, "%s:%s:%s", +		   data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, +		   data->file->event_call->class->system, +		   data->file->event_call->name); + +	if (data->count == -1) +		seq_printf(m, ":unlimited\n"); +	else +		seq_printf(m, ":count=%ld\n", data->count); + +	return 0; +} + +static int +event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, +		  void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	data->ref++; +	return 0; +} + +static void +event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, +		  void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (WARN_ON_ONCE(data->ref <= 0)) +		return; + +	data->ref--; +	if (!data->ref) { +		/* Remove the SOFT_MODE flag */ +		__ftrace_event_enable_disable(data->file, 0, 1); +		module_put(data->file->event_call->mod); +		kfree(data); +	} +	*pdata = NULL; +} + +static struct ftrace_probe_ops event_enable_probe_ops = { +	.func			= event_enable_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_enable_count_probe_ops = { +	.func			= event_enable_count_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_probe_ops = { +	.func			= event_enable_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_count_probe_ops = { +	.func			= event_enable_count_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static int +event_enable_func(struct ftrace_hash *hash, +		  char *glob, char *cmd, char *param, int enabled) +{ +	struct trace_array *tr = top_trace_array(); +	struct ftrace_event_file *file; +	struct ftrace_probe_ops *ops; +	struct event_probe_data *data; +	const char *system; +	const char *event; +	char *number; +	bool enable; +	int ret; + +	/* hash funcs only work with set_ftrace_filter */ +	if (!enabled) +		return -EINVAL; + +	if (!param) +		return -EINVAL; + +	system = strsep(¶m, ":"); +	if (!param) +		return -EINVAL; + +	event = strsep(¶m, ":"); + +	mutex_lock(&event_mutex); + +	ret = -EINVAL; +	file = find_event_file(tr, system, event); +	if (!file) +		goto out; + +	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + +	if (enable) +		ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; +	else +		ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; + +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		ret = 0; +		goto out; +	} + +	ret = -ENOMEM; +	data = kzalloc(sizeof(*data), GFP_KERNEL); +	if (!data) +		goto out; + +	data->enable = enable; +	data->count = -1; +	data->file = file; + +	if (!param) +		goto out_reg; + +	number = strsep(¶m, ":"); + +	ret = -EINVAL; +	if (!strlen(number)) +		goto out_free; + +	/* +	 * We use the callback data field (which is a pointer) +	 * as our counter. 
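For reference, a userspace sketch of the parameter format this parser expects (editor's illustration: the "sched:sched_switch:16" input is a hypothetical example, strtol stands in for kstrtoul, and the splitting mirrors the strsep() calls above; a missing count stays at -1, i.e. unlimited).

	#define _DEFAULT_SOURCE		/* for strsep() */
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		char param[] = "sched:sched_switch:16";	/* hypothetical input */
		char *p = param;
		char *system = strsep(&p, ":");
		char *event  = p ? strsep(&p, ":") : NULL;
		long count = -1;			/* default: fire every time */

		if (p && *p)
			count = strtol(p, NULL, 0);

		printf("system=%s event=%s count=%ld\n", system, event, count);
		return 0;
	}

Since the command is registered below as enable_event/disable_event and only works through set_ftrace_filter, the full write would look like <function>:enable_event:<system>:<event>[:count].
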
+	 */ +	ret = kstrtoul(number, 0, &data->count); +	if (ret) +		goto out_free; + + out_reg: +	/* Don't let event modules unload while probe registered */ +	ret = try_module_get(file->event_call->mod); +	if (!ret) +		goto out_free; + +	ret = __ftrace_event_enable_disable(file, 1, 1); +	if (ret < 0) +		goto out_put; +	ret = register_ftrace_function_probe(glob, ops, data); +	if (!ret) +		goto out_disable; + out: +	mutex_unlock(&event_mutex); +	return ret; + + out_disable: +	__ftrace_event_enable_disable(file, 0, 1); + out_put: +	module_put(file->event_call->mod); + out_free: +	kfree(data); +	goto out; +} + +static struct ftrace_func_command event_enable_cmd = { +	.name			= ENABLE_EVENT_STR, +	.func			= event_enable_func, +}; + +static struct ftrace_func_command event_disable_cmd = { +	.name			= DISABLE_EVENT_STR, +	.func			= event_enable_func, +}; + +static __init int register_event_cmds(void) +{ +	int ret; + +	ret = register_ftrace_command(&event_enable_cmd); +	if (WARN_ON(ret < 0)) +		return ret; +	ret = register_ftrace_command(&event_disable_cmd); +	if (WARN_ON(ret < 0)) +		unregister_ftrace_command(&event_enable_cmd); +	return ret; +} +#else +static inline int register_event_cmds(void) { return 0; } +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/* + * The top level array has already had its ftrace_event_file + * descriptors created in order to allow for early events to + * be recorded. This function is called after the debugfs has been + * initialized, and we now have to create the files associated + * to the events. + */ +static __init void +__trace_early_add_event_dirs(struct trace_array *tr) +{ +	struct ftrace_event_file *file; +	int ret; + + +	list_for_each_entry(file, &tr->events, list) { +		ret = event_create_dir(tr->event_dir, file, +				       &ftrace_event_id_fops, +				       &ftrace_enable_fops, +				       &ftrace_event_filter_fops, +				       &ftrace_event_format_fops); +		if (ret < 0) +			pr_warning("Could not create directory for event %s\n", +				   file->event_call->name); +	} +} + +/* + * For early boot up, the top trace array requires to have + * a list of events that can be enabled. This must be done before + * the filesystem is set up in order to allow events to be traced + * early. + */ +static __init void +__trace_early_add_events(struct trace_array *tr) +{ +	struct ftrace_event_call *call; +	int ret; + +	list_for_each_entry(call, &ftrace_events, list) { +		/* Early boot up should not have any modules loaded */ +		if (WARN_ON_ONCE(call->mod)) +			continue; + +		ret = __trace_early_add_new_event(call, tr); +		if (ret < 0) +			pr_warning("Could not create early event %s\n", +				   call->name); +	} +} + +/* Remove the event directory structure for a trace directory. 
*/ +static void +__trace_remove_event_dirs(struct trace_array *tr) +{ +	struct ftrace_event_file *file, *next; + +	list_for_each_entry_safe(file, next, &tr->events, list) { +		list_del(&file->list); +		debugfs_remove_recursive(file->dir); +		remove_subsystem(file->system); +		kmem_cache_free(file_cachep, file); +	} +} + +static void +__add_event_to_tracers(struct ftrace_event_call *call, +		       struct ftrace_module_file_ops *file_ops) +{ +	struct trace_array *tr; + +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (file_ops) +			__trace_add_new_mod_event(call, tr, file_ops); +		else +			__trace_add_new_event(call, tr, +					      &ftrace_event_id_fops, +					      &ftrace_enable_fops, +					      &ftrace_event_filter_fops, +					      &ftrace_event_format_fops); +	} +} +  static struct notifier_block trace_module_nb = {  	.notifier_call = trace_module_notify,  	.priority = 0, @@ -1464,15 +2197,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;  static __init int setup_trace_event(char *str)  {  	strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); -	ring_buffer_expanded = 1; -	tracing_selftest_disabled = 1; +	ring_buffer_expanded = true; +	tracing_selftest_disabled = true;  	return 1;  }  __setup("trace_event=", setup_trace_event); +/* Expects to have event_mutex held when called */ +static int +create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) +{ +	struct dentry *d_events; +	struct dentry *entry; + +	entry = debugfs_create_file("set_event", 0644, parent, +				    tr, &ftrace_set_event_fops); +	if (!entry) { +		pr_warning("Could not create debugfs 'set_event' entry\n"); +		return -ENOMEM; +	} + +	d_events = debugfs_create_dir("events", parent); +	if (!d_events) { +		pr_warning("Could not create debugfs 'events' directory\n"); +		return -ENOMEM; +	} + +	/* ring buffer internal formats */ +	trace_create_file("header_page", 0444, d_events, +			  ring_buffer_print_page_header, +			  &ftrace_show_header_fops); + +	trace_create_file("header_event", 0444, d_events, +			  ring_buffer_print_entry_header, +			  &ftrace_show_header_fops); + +	trace_create_file("enable", 0644, d_events, +			  tr, &ftrace_tr_enable_fops); + +	tr->event_dir = d_events; + +	return 0; +} + +/** + * event_trace_add_tracer - add a instance of a trace_array to events + * @parent: The parent dentry to place the files/directories for events in + * @tr: The trace array associated with these events + * + * When a new instance is created, it needs to set up its events + * directory, as well as other files associated with events. It also + * creates the event hierachry in the @parent/events directory. + * + * Returns 0 on success. + */ +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ +	int ret; + +	mutex_lock(&event_mutex); + +	ret = create_event_toplevel_files(parent, tr); +	if (ret) +		goto out_unlock; + +	down_write(&trace_event_sem); +	__trace_add_event_dirs(tr); +	up_write(&trace_event_sem); + + out_unlock: +	mutex_unlock(&event_mutex); + +	return ret; +} + +/* + * The top trace array already had its file descriptors created. + * Now the files themselves need to be created. 
+ */ +static __init int +early_event_add_tracer(struct dentry *parent, struct trace_array *tr) +{ +	int ret; + +	mutex_lock(&event_mutex); + +	ret = create_event_toplevel_files(parent, tr); +	if (ret) +		goto out_unlock; + +	down_write(&trace_event_sem); +	__trace_early_add_event_dirs(tr); +	up_write(&trace_event_sem); + + out_unlock: +	mutex_unlock(&event_mutex); + +	return ret; +} + +int event_trace_del_tracer(struct trace_array *tr) +{ +	/* Disable any running events */ +	__ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + +	mutex_lock(&event_mutex); + +	down_write(&trace_event_sem); +	__trace_remove_event_dirs(tr); +	debugfs_remove_recursive(tr->event_dir); +	up_write(&trace_event_sem); + +	tr->event_dir = NULL; + +	mutex_unlock(&event_mutex); + +	return 0; +} + +static __init int event_trace_memsetup(void) +{ +	field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); +	file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); +	return 0; +} +  static __init int event_trace_enable(void)  { +	struct trace_array *tr = top_trace_array();  	struct ftrace_event_call **iter, *call;  	char *buf = bootup_event_buf;  	char *token; @@ -1486,6 +2339,14 @@ static __init int event_trace_enable(void)  			list_add(&call->list, &ftrace_events);  	} +	/* +	 * We need the top trace array to have a working set of trace +	 * points at early init, before the debug files and directories +	 * are created. Create the file entries now, and attach them +	 * to the actual file dentries later. +	 */ +	__trace_early_add_events(tr); +  	while (true) {  		token = strsep(&buf, ","); @@ -1494,73 +2355,43 @@ static __init int event_trace_enable(void)  		if (!*token)  			continue; -		ret = ftrace_set_clr_event(token, 1); +		ret = ftrace_set_clr_event(tr, token, 1);  		if (ret)  			pr_warn("Failed to enable trace event: %s\n", token);  	}  	trace_printk_start_comm(); +	register_event_cmds(); +  	return 0;  }  static __init int event_trace_init(void)  { -	struct ftrace_event_call *call; +	struct trace_array *tr;  	struct dentry *d_tracer;  	struct dentry *entry; -	struct dentry *d_events;  	int ret; +	tr = top_trace_array(); +  	d_tracer = tracing_init_dentry();  	if (!d_tracer)  		return 0;  	entry = debugfs_create_file("available_events", 0444, d_tracer, -				    NULL, &ftrace_avail_fops); +				    tr, &ftrace_avail_fops);  	if (!entry)  		pr_warning("Could not create debugfs "  			   "'available_events' entry\n"); -	entry = debugfs_create_file("set_event", 0644, d_tracer, -				    NULL, &ftrace_set_event_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'set_event' entry\n"); - -	d_events = event_trace_events_dir(); -	if (!d_events) -		return 0; - -	/* ring buffer internal formats */ -	trace_create_file("header_page", 0444, d_events, -			  ring_buffer_print_page_header, -			  &ftrace_show_header_fops); - -	trace_create_file("header_event", 0444, d_events, -			  ring_buffer_print_entry_header, -			  &ftrace_show_header_fops); - -	trace_create_file("enable", 0644, d_events, -			  NULL, &ftrace_system_enable_fops); -  	if (trace_define_common_fields())  		pr_warning("tracing: Failed to allocate common fields"); -	/* -	 * Early initialization already enabled ftrace event. -	 * Now it's only necessary to create the event directory. 
-	 */ -	list_for_each_entry(call, &ftrace_events, list) { - -		ret = event_create_dir(call, d_events, -				       &ftrace_event_id_fops, -				       &ftrace_enable_fops, -				       &ftrace_event_filter_fops, -				       &ftrace_event_format_fops); -		if (ret < 0) -			event_remove(call); -	} +	ret = early_event_add_tracer(d_tracer, tr); +	if (ret) +		return ret;  	ret = register_module_notifier(&trace_module_nb);  	if (ret) @@ -1568,6 +2399,7 @@ static __init int event_trace_init(void)  	return 0;  } +early_initcall(event_trace_memsetup);  core_initcall(event_trace_enable);  fs_initcall(event_trace_init); @@ -1627,13 +2459,20 @@ static __init void event_test_stuff(void)   */  static __init void event_trace_self_tests(void)  { +	struct ftrace_subsystem_dir *dir; +	struct ftrace_event_file *file;  	struct ftrace_event_call *call;  	struct event_subsystem *system; +	struct trace_array *tr;  	int ret; +	tr = top_trace_array(); +  	pr_info("Running tests on trace events:\n"); -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call;  		/* Only test those that have a probe */  		if (!call->class || !call->class->probe) @@ -1657,15 +2496,15 @@ static __init void event_trace_self_tests(void)  		 * If an event is already enabled, someone is using  		 * it and the self test should not be on.  		 */ -		if (call->flags & TRACE_EVENT_FL_ENABLED) { +		if (file->flags & FTRACE_EVENT_FL_ENABLED) {  			pr_warning("Enabled event during self test!\n");  			WARN_ON_ONCE(1);  			continue;  		} -		ftrace_event_enable_disable(call, 1); +		ftrace_event_enable_disable(file, 1);  		event_test_stuff(); -		ftrace_event_enable_disable(call, 0); +		ftrace_event_enable_disable(file, 0);  		pr_cont("OK\n");  	} @@ -1674,7 +2513,9 @@ static __init void event_trace_self_tests(void)  	pr_info("Running tests on trace event systems:\n"); -	list_for_each_entry(system, &event_subsystems, list) { +	list_for_each_entry(dir, &tr->systems, list) { + +		system = dir->subsystem;  		/* the ftrace system is special, skip it */  		if (strcmp(system->name, "ftrace") == 0) @@ -1682,7 +2523,7 @@ static __init void event_trace_self_tests(void)  		pr_info("Testing event system %s: ", system->name); -		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); +		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);  		if (WARN_ON_ONCE(ret)) {  			pr_warning("error enabling system %s\n",  				   system->name); @@ -1691,7 +2532,7 @@ static __init void event_trace_self_tests(void)  		event_test_stuff(); -		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); +		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);  		if (WARN_ON_ONCE(ret)) {  			pr_warning("error disabling system %s\n",  				   system->name); @@ -1706,7 +2547,7 @@ static __init void event_trace_self_tests(void)  	pr_info("Running tests on all trace events:\n");  	pr_info("Testing all events: "); -	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); +	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);  	if (WARN_ON_ONCE(ret)) {  		pr_warning("error enabling all events\n");  		return; @@ -1715,7 +2556,7 @@ static __init void event_trace_self_tests(void)  	event_test_stuff();  	/* reset sysname */ -	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); +	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);  	if (WARN_ON_ONCE(ret)) {  		pr_warning("error disabling all events\n");  		return; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 
e5b0ca8b8d4..a6361178de5 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system,  	mutex_unlock(&event_mutex);  } -static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) -{ -	struct ftrace_event_field *field; - -	list_for_each_entry(field, head, link) { -		if (!strcmp(field->name, name)) -			return field; -	} - -	return NULL; -} - -static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) -{ -	struct ftrace_event_field *field; -	struct list_head *head; - -	field = __find_event_field(&ftrace_common_fields, name); -	if (field) -		return field; - -	head = trace_get_fields(call); -	return __find_event_field(head, name); -} -  static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)  {  	stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); @@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,  		return NULL;  	} -	field = find_event_field(call, operand1); +	field = trace_find_event_field(call, operand1);  	if (!field) {  		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);  		return NULL; @@ -1907,16 +1880,17 @@ out_unlock:  	return err;  } -int apply_subsystem_event_filter(struct event_subsystem *system, +int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  				 char *filter_string)  { +	struct event_subsystem *system = dir->subsystem;  	struct event_filter *filter;  	int err = 0;  	mutex_lock(&event_mutex);  	/* Make sure the system still has events */ -	if (!system->nr_events) { +	if (!dir->nr_events) {  		err = -ENODEV;  		goto out_unlock;  	} diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e039906b037..d21a7467008 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void)		\  #undef FTRACE_ENTRY  #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\ -int									\ +static int __init							\  ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\  {									\  	struct struct_name field;					\ @@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\  #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\  			 regfn)						\  									\ -struct ftrace_event_class event_class_ftrace_##call = {			\ +struct ftrace_event_class __refdata event_class_ftrace_##call = {	\  	.system			= __stringify(TRACE_SYSTEM),		\  	.define_fields		= ftrace_define_fields_##call,		\  	.fields			= LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 60115252332..c4d6d719198 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void);  static int function_trace_init(struct trace_array *tr)  {  	func_trace = tr; -	tr->cpu = get_cpu(); +	tr->trace_buffer.cpu = get_cpu();  	put_cpu();  	tracing_start_cmdline_record(); @@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr)  static void function_trace_start(struct trace_array *tr)  { -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  }  /* Our option */ @@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,  		goto out;  	cpu = smp_processor_id(); -	data = 
tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	if (!atomic_read(&data->disabled)) {  		local_save_flags(flags);  		trace_function(tr, ip, parent_ip, flags, pc); @@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,  	 */  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) { @@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly =  };  #ifdef CONFIG_DYNAMIC_FTRACE -static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +static int update_count(void **data)  { -	long *count = (long *)data; - -	if (tracing_is_on()) -		return; +	unsigned long *count = (long *)data;  	if (!*count) -		return; +		return 0;  	if (*count != -1)  		(*count)--; -	tracing_on(); +	return 1;  }  static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)  { -	long *count = (long *)data; +	if (tracing_is_on()) +		return; + +	if (update_count(data)) +		tracing_on(); +} +static void +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +{  	if (!tracing_is_on())  		return; -	if (!*count) +	if (update_count(data)) +		tracing_off(); +} + +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (tracing_is_on())  		return; -	if (*count != -1) -		(*count)--; +	tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (!tracing_is_on()) +		return;  	tracing_off();  } -static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, -			 struct ftrace_probe_ops *ops, void *data); +/* + * Skip 4: + *   ftrace_stacktrace() + *   function_trace_probe_call() + *   ftrace_ops_list_func() + *   ftrace_call() + */ +#define STACK_SKIP 4 -static struct ftrace_probe_ops traceon_probe_ops = { -	.func			= ftrace_traceon, -	.print			= ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +{ +	trace_dump_stack(STACK_SKIP); +} -static struct ftrace_probe_ops traceoff_probe_ops = { -	.func			= ftrace_traceoff, -	.print			= ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (!tracing_is_on()) +		return; + +	if (update_count(data)) +		trace_dump_stack(STACK_SKIP); +}  static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, -			 struct ftrace_probe_ops *ops, void *data) +ftrace_probe_print(const char *name, struct seq_file *m, +		   unsigned long ip, void *data)  {  	long count = (long)data; -	seq_printf(m, "%ps:", (void *)ip); - -	if (ops == &traceon_probe_ops) -		seq_printf(m, "traceon"); -	else -		seq_printf(m, "traceoff"); +	seq_printf(m, "%ps:%s", (void *)ip, name);  	if (count == -1)  		seq_printf(m, ":unlimited\n"); @@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,  }  static int -ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) +ftrace_traceon_print(struct seq_file *m, unsigned long ip, +			 struct ftrace_probe_ops *ops, void *data)  { -	struct ftrace_probe_ops *ops; - -	/* we register both traceon and traceoff to this callback */ -	if (strcmp(cmd, "traceon") == 0) -		ops = &traceon_probe_ops; -	else -		ops = &traceoff_probe_ops; +	return 
ftrace_probe_print("traceon", m, ip, data); +} -	unregister_ftrace_function_probe_func(glob, ops); +static int +ftrace_traceoff_print(struct seq_file *m, unsigned long ip, +			 struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("traceoff", m, ip, data); +} -	return 0; +static int +ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, +			struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("stacktrace", m, ip, data);  } +static struct ftrace_probe_ops traceon_count_probe_ops = { +	.func			= ftrace_traceon_count, +	.print			= ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_count_probe_ops = { +	.func			= ftrace_traceoff_count, +	.print			= ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_count_probe_ops = { +	.func			= ftrace_stacktrace_count, +	.print			= ftrace_stacktrace_print, +}; + +static struct ftrace_probe_ops traceon_probe_ops = { +	.func			= ftrace_traceon, +	.print			= ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_probe_ops = { +	.func			= ftrace_traceoff, +	.print			= ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_probe_ops = { +	.func			= ftrace_stacktrace, +	.print			= ftrace_stacktrace_print, +}; +  static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, -			    char *glob, char *cmd, char *param, int enable) +ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, +			    struct ftrace_hash *hash, char *glob, +			    char *cmd, char *param, int enable)  { -	struct ftrace_probe_ops *ops;  	void *count = (void *)-1;  	char *number;  	int ret; @@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,  	if (!enable)  		return -EINVAL; -	if (glob[0] == '!') -		return ftrace_trace_onoff_unreg(glob+1, cmd, param); - -	/* we register both traceon and traceoff to this callback */ -	if (strcmp(cmd, "traceon") == 0) -		ops = &traceon_probe_ops; -	else -		ops = &traceoff_probe_ops; +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		return 0; +	}  	if (!param)  		goto out_reg; @@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,  	return ret < 0 ? ret : 0;  } +static int +ftrace_trace_onoff_callback(struct ftrace_hash *hash, +			    char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	/* we register both traceon and traceoff to this callback */ +	if (strcmp(cmd, "traceon") == 0) +		ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; +	else +		ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; + +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   param, enable); +} + +static int +ftrace_stacktrace_callback(struct ftrace_hash *hash, +			   char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	ops = param ? 
&stacktrace_count_probe_ops : &stacktrace_probe_ops; + +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   param, enable); +} +  static struct ftrace_func_command ftrace_traceon_cmd = {  	.name			= "traceon",  	.func			= ftrace_trace_onoff_callback, @@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {  	.func			= ftrace_trace_onoff_callback,  }; +static struct ftrace_func_command ftrace_stacktrace_cmd = { +	.name			= "stacktrace", +	.func			= ftrace_stacktrace_callback, +}; +  static int __init init_func_cmd_traceon(void)  {  	int ret; @@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void)  	ret = register_ftrace_command(&ftrace_traceon_cmd);  	if (ret)  		unregister_ftrace_command(&ftrace_traceoff_cmd); + +	ret = register_ftrace_command(&ftrace_stacktrace_cmd); +	if (ret) { +		unregister_ftrace_command(&ftrace_traceoff_cmd); +		unregister_ftrace_command(&ftrace_traceon_cmd); +	}  	return ret;  }  #else diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 39ada66389c..8388bc99f2e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr,  {  	struct ftrace_event_call *call = &event_funcgraph_entry;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ftrace_graph_ent_entry *entry;  	if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) {  		pc = preempt_count(); @@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr,  {  	struct ftrace_event_call *call = &event_funcgraph_exit;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ftrace_graph_ret_entry *entry;  	if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) {  		pc = preempt_count(); @@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter,  			 * We need to consume the current entry to see  			 * the next one.  			 
*/ -			ring_buffer_consume(iter->tr->buffer, iter->cpu, +			ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,  					    NULL, NULL); -			event = ring_buffer_peek(iter->tr->buffer, iter->cpu, +			event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,  						 NULL, NULL);  		} diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 713a2cac488..b19d065a28c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -32,7 +32,8 @@ enum {  static int trace_type __read_mostly; -static int save_lat_flag; +static int save_flags; +static bool function_enabled;  static void stop_irqsoff_tracer(struct trace_array *tr, int graph);  static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,  	if (!irqs_disabled_flags(*flags))  		return 0; -	*data = tr->data[cpu]; +	*data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&(*data)->disabled);  	if (likely(disabled == 1)) @@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)  		per_cpu(tracing_cpu, cpu) = 0;  	tracing_max_latency = 0; -	tracing_reset_online_cpus(irqsoff_trace); +	tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);  	return start_irqsoff_tracer(irqsoff_trace, set);  } @@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)  	if (per_cpu(tracing_cpu, cpu))  		return; -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	if (unlikely(!data) || atomic_read(&data->disabled))  		return; @@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)  	if (!tracer_enabled)  		return; -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	if (unlikely(!data) ||  	    !data->critical_start || atomic_read(&data->disabled)) @@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)  }  #endif /* CONFIG_PREEMPT_TRACER */ -static int start_irqsoff_tracer(struct trace_array *tr, int graph) +static int register_irqsoff_function(int graph, int set)  { -	int ret = 0; +	int ret; -	if (!graph) -		ret = register_ftrace_function(&trace_ops); -	else +	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ +	if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) +		return 0; + +	if (graph)  		ret = register_ftrace_graph(&irqsoff_graph_return,  					    &irqsoff_graph_entry); +	else +		ret = register_ftrace_function(&trace_ops); + +	if (!ret) +		function_enabled = true; + +	return ret; +} + +static void unregister_irqsoff_function(int graph) +{ +	if (!function_enabled) +		return; + +	if (graph) +		unregister_ftrace_graph(); +	else +		unregister_ftrace_function(&trace_ops); + +	function_enabled = false; +} + +static void irqsoff_function_set(int set) +{ +	if (set) +		register_irqsoff_function(is_graph(), 1); +	else +		unregister_irqsoff_function(is_graph()); +} + +static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) +{ +	if (mask & TRACE_ITER_FUNCTION) +		irqsoff_function_set(set); + +	return trace_keep_overwrite(tracer, mask, set); +} + +static int start_irqsoff_tracer(struct trace_array *tr, int graph) +{ +	int ret; + +	ret = register_irqsoff_function(graph, 0);  	if (!ret && tracing_is_enabled())  		tracer_enabled = 1; @@ -550,22 +596,22 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)  {  	tracer_enabled = 0; -	if (!graph) -		unregister_ftrace_function(&trace_ops); -	else -		
unregister_ftrace_graph(); +	unregister_irqsoff_function(graph);  }  static void __irqsoff_tracer_init(struct trace_array *tr)  { -	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; -	trace_flags |= TRACE_ITER_LATENCY_FMT; +	save_flags = trace_flags; + +	/* non overwrite screws up the latency tracers */ +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);  	tracing_max_latency = 0;  	irqsoff_trace = tr;  	/* make sure that the tracer is visible */  	smp_wmb(); -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	if (start_irqsoff_tracer(tr, is_graph()))  		printk(KERN_ERR "failed to start irqsoff tracer\n"); @@ -573,10 +619,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)  static void irqsoff_tracer_reset(struct trace_array *tr)  { +	int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; +	int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; +  	stop_irqsoff_tracer(tr, is_graph()); -	if (!save_lat_flag) -		trace_flags &= ~TRACE_ITER_LATENCY_FMT; +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);  }  static void irqsoff_tracer_start(struct trace_array *tr) @@ -609,6 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly =  	.print_line     = irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_irqsoff,  #endif @@ -642,6 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly =  	.print_line     = irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_preemptoff,  #endif @@ -677,6 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =  	.print_line     = irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_preemptirqsoff,  #endif diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 3c5c5dfea0b..bd90e1b0608 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)  	trace_init_global_iter(&iter);  	for_each_tracing_cpu(cpu) { -		atomic_inc(&iter.tr->data[cpu]->disabled); +		atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	}  	old_userobj = trace_flags; @@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)  	iter.iter_flags |= TRACE_FILE_LAT_FMT;  	iter.pos = -1; -	if (cpu_file == TRACE_PIPE_ALL_CPU) { +	if (cpu_file == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) {  			iter.buffer_iter[cpu] = -			ring_buffer_read_prepare(iter.tr->buffer, cpu); +			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);  			ring_buffer_read_start(iter.buffer_iter[cpu]);  			tracing_iter_reset(&iter, cpu);  		}  	} else {  		iter.cpu_file = cpu_file;  		iter.buffer_iter[cpu_file] = -			ring_buffer_read_prepare(iter.tr->buffer, cpu_file); +			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);  		ring_buffer_read_start(iter.buffer_iter[cpu_file]);  		tracing_iter_reset(&iter, cpu_file);  	} @@ -83,7 +83,7 @@ out:  	trace_flags = old_userobj;  	for_each_tracing_cpu(cpu) { -		atomic_dec(&iter.tr->data[cpu]->disabled); +		
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	}  	for_each_tracing_cpu(cpu) @@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)  		    !cpu_online(cpu_file))  			return KDB_BADINT;  	} else { -		cpu_file = TRACE_PIPE_ALL_CPU; +		cpu_file = RING_BUFFER_ALL_CPUS;  	}  	kdb_trap_printk++; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fd3c8aae55e..a5e8f4878bf 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)  	overrun_detected = false;  	prev_overruns = 0; -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  }  static int mmio_trace_init(struct trace_array *tr) @@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)  static unsigned long count_overruns(struct trace_iterator *iter)  {  	unsigned long cnt = atomic_xchg(&dropped_count, 0); -	unsigned long over = ring_buffer_overruns(iter->tr->buffer); +	unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);  	if (over > prev_overruns)  		cnt += over - prev_overruns; @@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,  				struct mmiotrace_rw *rw)  {  	struct ftrace_event_call *call = &event_mmiotrace_rw; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct trace_mmiotrace_rw *entry;  	int pc = preempt_count(); @@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,  void mmio_trace_rw(struct mmiotrace_rw *rw)  {  	struct trace_array *tr = mmio_trace_array; -	struct trace_array_cpu *data = tr->data[smp_processor_id()]; +	struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());  	__trace_mmiotrace_rw(tr, data, rw);  } @@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,  				struct mmiotrace_map *map)  {  	struct ftrace_event_call *call = &event_mmiotrace_map; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct trace_mmiotrace_map *entry;  	int pc = preempt_count(); @@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)  	struct trace_array_cpu *data;  	preempt_disable(); -	data = tr->data[smp_processor_id()]; +	data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());  	__trace_mmiotrace_map(tr, data, map);  	preempt_enable();  } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 697e88d1390..bb922d9ee51 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@  /* must be a power of 2 */  #define EVENT_HASHSIZE	128 -DECLARE_RWSEM(trace_event_mutex); +DECLARE_RWSEM(trace_event_sem);  static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; @@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)  	return ret;  } +enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) +{ +	struct trace_seq *s = &iter->seq; +	struct trace_entry *entry = iter->ent; +	struct bputs_entry *field; +	int ret; + +	trace_assign_type(field, entry); + +	ret = trace_seq_puts(s, field->str); +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE; + +	return TRACE_TYPE_HANDLED; +} +  enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)  {  	struct trace_seq *s = &iter->seq; @@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct 
trace_seq *p, const unsigned char *buf, int buf_len)  }  EXPORT_SYMBOL(ftrace_print_hex_seq); +int ftrace_raw_output_prep(struct trace_iterator *iter, +			   struct trace_event *trace_event) +{ +	struct ftrace_event_call *event; +	struct trace_seq *s = &iter->seq; +	struct trace_seq *p = &iter->tmp_seq; +	struct trace_entry *entry; +	int ret; + +	event = container_of(trace_event, struct ftrace_event_call, event); +	entry = iter->ent; + +	if (entry->type != event->event.type) { +		WARN_ON_ONCE(1); +		return TRACE_TYPE_UNHANDLED; +	} + +	trace_seq_init(p); +	ret = trace_seq_printf(s, "%s: ", event->name); +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE; + +	return 0; +} +EXPORT_SYMBOL(ftrace_raw_output_prep); +  #ifdef CONFIG_KRETPROBES  static inline const char *kretprobed(const char *name)  { @@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)  {  	unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;  	unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; -	unsigned long long abs_ts = iter->ts - iter->tr->time_start; +	unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;  	unsigned long long rel_ts = next_ts - iter->ts;  	struct trace_seq *s = &iter->seq; @@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list)  void trace_event_read_lock(void)  { -	down_read(&trace_event_mutex); +	down_read(&trace_event_sem);  }  void trace_event_read_unlock(void)  { -	up_read(&trace_event_mutex); +	up_read(&trace_event_sem);  }  /** @@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event)  	unsigned key;  	int ret = 0; -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	if (WARN_ON(!event))  		goto out; @@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event)  	ret = event->type;   out: -	up_write(&trace_event_mutex); +	up_write(&trace_event_sem);  	return ret;  }  EXPORT_SYMBOL_GPL(register_ftrace_event);  /* - * Used by module code with the trace_event_mutex held for write. + * Used by module code with the trace_event_sem held for write.   
*/  int __unregister_ftrace_event(struct trace_event *event)  { @@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event)   */  int unregister_ftrace_event(struct trace_event *event)  { -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	__unregister_ftrace_event(event); -	up_write(&trace_event_mutex); +	up_write(&trace_event_sem);  	return 0;  } @@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = {  	.funcs		= &trace_user_stack_funcs,  }; +/* TRACE_BPUTS */ +static enum print_line_t +trace_bputs_print(struct trace_iterator *iter, int flags, +		   struct trace_event *event) +{ +	struct trace_entry *entry = iter->ent; +	struct trace_seq *s = &iter->seq; +	struct bputs_entry *field; + +	trace_assign_type(field, entry); + +	if (!seq_print_ip_sym(s, field->ip, flags)) +		goto partial; + +	if (!trace_seq_puts(s, ": ")) +		goto partial; + +	if (!trace_seq_puts(s, field->str)) +		goto partial; + +	return TRACE_TYPE_HANDLED; + + partial: +	return TRACE_TYPE_PARTIAL_LINE; +} + + +static enum print_line_t +trace_bputs_raw(struct trace_iterator *iter, int flags, +		struct trace_event *event) +{ +	struct bputs_entry *field; +	struct trace_seq *s = &iter->seq; + +	trace_assign_type(field, iter->ent); + +	if (!trace_seq_printf(s, ": %lx : ", field->ip)) +		goto partial; + +	if (!trace_seq_puts(s, field->str)) +		goto partial; + +	return TRACE_TYPE_HANDLED; + + partial: +	return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_bputs_funcs = { +	.trace		= trace_bputs_print, +	.raw		= trace_bputs_raw, +}; + +static struct trace_event trace_bputs_event = { +	.type		= TRACE_BPUTS, +	.funcs		= &trace_bputs_funcs, +}; +  /* TRACE_BPRINT */  static enum print_line_t  trace_bprint_print(struct trace_iterator *iter, int flags, @@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = {  	&trace_wake_event,  	&trace_stack_event,  	&trace_user_stack_event, +	&trace_bputs_event,  	&trace_bprint_event,  	&trace_print_event,  	NULL diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index c038eba0492..127a9d8c835 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -5,6 +5,8 @@  #include "trace.h"  extern enum print_line_t +trace_print_bputs_msg_only(struct trace_iterator *iter); +extern enum print_line_t  trace_print_bprintk_msg_only(struct trace_iterator *iter);  extern enum print_line_t  trace_print_printk_msg_only(struct trace_iterator *iter); @@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);  /* used by module unregistering */  extern int __unregister_ftrace_event(struct trace_event *event); -extern struct rw_semaphore trace_event_mutex; +extern struct rw_semaphore trace_event_sem;  #define MAX_MEMHEX_BYTES	8  #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3374c792ccd..4e98e3b257a 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,  			   unsigned long flags, int pc)  {  	struct ftrace_event_call *call = &event_context_switch; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct ctx_switch_entry *entry; @@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n  	pc = preempt_count();  	local_irq_save(flags);  	cpu = 
raw_smp_processor_id(); -	data = ctx_trace->data[cpu]; +	data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);  	if (likely(!atomic_read(&data->disabled)))  		tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); @@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,  	struct ftrace_event_call *call = &event_wakeup;  	struct ring_buffer_event *event;  	struct ctx_switch_entry *entry; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,  					  sizeof(*entry), flags, pc); @@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)  	pc = preempt_count();  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = ctx_trace->data[cpu]; +	data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);  	if (likely(!atomic_read(&data->disabled)))  		tracing_sched_wakeup_trace(ctx_trace, wakee, current, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 75aa97fbe1a..fee77e15d81 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -36,7 +36,8 @@ static void __wakeup_reset(struct trace_array *tr);  static int wakeup_graph_entry(struct ftrace_graph_ent *trace);  static void wakeup_graph_return(struct ftrace_graph_ret *trace); -static int save_lat_flag; +static int save_flags; +static bool function_enabled;  #define TRACE_DISPLAY_GRAPH     1 @@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr,  	if (cpu != wakeup_current_cpu)  		goto out_enable; -	*data = tr->data[cpu]; +	*data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&(*data)->disabled);  	if (unlikely(disabled != 1))  		goto out; @@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly =  };  #endif /* CONFIG_FUNCTION_TRACER */ -static int start_func_tracer(int graph) +static int register_wakeup_function(int graph, int set)  {  	int ret; -	if (!graph) -		ret = register_ftrace_function(&trace_ops); -	else +	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ +	if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) +		return 0; + +	if (graph)  		ret = register_ftrace_graph(&wakeup_graph_return,  					    &wakeup_graph_entry); +	else +		ret = register_ftrace_function(&trace_ops); + +	if (!ret) +		function_enabled = true; + +	return ret; +} + +static void unregister_wakeup_function(int graph) +{ +	if (!function_enabled) +		return; + +	if (graph) +		unregister_ftrace_graph(); +	else +		unregister_ftrace_function(&trace_ops); + +	function_enabled = false; +} + +static void wakeup_function_set(int set) +{ +	if (set) +		register_wakeup_function(is_graph(), 1); +	else +		unregister_wakeup_function(is_graph()); +} + +static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) +{ +	if (mask & TRACE_ITER_FUNCTION) +		wakeup_function_set(set); + +	return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(int graph) +{ +	int ret; + +	ret = register_wakeup_function(graph, 0);  	if (!ret && tracing_is_enabled())  		tracer_enabled = 1; @@ -156,10 +202,7 @@ static void stop_func_tracer(int graph)  {  	tracer_enabled = 0; -	if (!graph) -		unregister_ftrace_function(&trace_ops); -	else -		unregister_ftrace_graph(); +	unregister_wakeup_function(graph);  }  #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore,  	/* disable local data, not wakeup_cpu data */  	cpu = 
raw_smp_processor_id(); -	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); +	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  	if (likely(disabled != 1))  		goto out; @@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore,  		goto out_unlock;  	/* The task we are waiting for is waking up */ -	data = wakeup_trace->data[wakeup_cpu]; +	data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);  	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);  	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); @@ -387,7 +430,7 @@ out_unlock:  	arch_spin_unlock(&wakeup_lock);  	local_irq_restore(flags);  out: -	atomic_dec(&wakeup_trace->data[cpu]->disabled); +	atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  }  static void __wakeup_reset(struct trace_array *tr) @@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr)  {  	unsigned long flags; -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	local_irq_save(flags);  	arch_spin_lock(&wakeup_lock); @@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  		return;  	pc = preempt_count(); -	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); +	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  	if (unlikely(disabled != 1))  		goto out; @@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  	local_save_flags(flags); -	data = wakeup_trace->data[wakeup_cpu]; +	data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);  	data->preempt_timestamp = ftrace_now(cpu);  	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); @@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  out_locked:  	arch_spin_unlock(&wakeup_lock);  out: -	atomic_dec(&wakeup_trace->data[cpu]->disabled); +	atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  }  static void start_wakeup_tracer(struct trace_array *tr) @@ -540,8 +583,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)  static int __wakeup_tracer_init(struct trace_array *tr)  { -	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; -	trace_flags |= TRACE_ITER_LATENCY_FMT; +	save_flags = trace_flags; + +	/* non overwrite screws up the latency tracers */ +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);  	tracing_max_latency = 0;  	wakeup_trace = tr; @@ -563,12 +609,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)  static void wakeup_tracer_reset(struct trace_array *tr)  { +	int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; +	int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; +  	stop_wakeup_tracer(tr);  	/* make sure we put back any tasks we are tracing */  	wakeup_reset(tr); -	if (!save_lat_flag) -		trace_flags &= ~TRACE_ITER_LATENCY_FMT; +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);  }  static void wakeup_tracer_start(struct trace_array *tr) @@ -594,6 +643,7 @@ static struct tracer wakeup_tracer __read_mostly =  	.print_line	= wakeup_print_line,  	.flags		= &tracer_flags,  	.set_flag	= wakeup_set_flag, +	.flag_changed	= wakeup_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_wakeup,  #endif @@ -615,6 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =  	.print_line	= wakeup_print_line,  	
.flags		= &tracer_flags,  	.set_flag	= wakeup_set_flag, +	.flag_changed	= wakeup_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_wakeup,  #endif diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 51c819c12c2..55e2cf66967 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)  	return 0;  } -static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) +static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)  {  	struct ring_buffer_event *event;  	struct trace_entry *entry;  	unsigned int loops = 0; -	while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { +	while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {  		entry = ring_buffer_event_data(event);  		/* @@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)   * Test the trace buffer to see if all the elements   * are still sane.   */ -static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)  {  	unsigned long flags, cnt = 0;  	int cpu, ret = 0; @@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)  	local_irq_save(flags);  	arch_spin_lock(&ftrace_max_lock); -	cnt = ring_buffer_entries(tr->buffer); +	cnt = ring_buffer_entries(buf->buffer);  	/*  	 * The trace_test_buffer_cpu runs a while loop to consume all data. @@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)  	 */  	tracing_off();  	for_each_possible_cpu(cpu) { -		ret = trace_test_buffer_cpu(tr, cpu); +		ret = trace_test_buffer_cpu(buf, cpu);  		if (ret)  			break;  	} @@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	msleep(100);  	/* we should have nothing in the buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	if (ret)  		goto out; @@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	ftrace_enabled = 0;  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	tracing_start();  	/* we should only have one item */ @@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  	ftrace_enabled = 0;  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  /* Maximum number of functions to trace before diagnosing a hang */  #define GRAPH_MAX_FUNC_TEST	100000000 -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);  static unsigned int graph_hang_thresh;  /* Wrap the real function entry probe to avoid possible hanging */ @@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)  	if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {  		ftrace_graph_stop();  		printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); -		if (ftrace_dump_on_oops) -			__ftrace_dump(false, DUMP_ALL); +		if (ftrace_dump_on_oops) { +			ftrace_dump(DUMP_ALL); +			/* ftrace_dump() disables tracing */ +			tracing_on(); +		}  		return 0;  	} @@ -737,7 +738,7 @@ 
trace_selftest_startup_function_graph(struct tracer *trace,  	 * Simulate the init() callback but we attach a watchdog callback  	 * to detect and recover from possible hangs  	 */ -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	set_graph_array(tr);  	ret = register_ftrace_graph(&trace_graph_return,  				    &trace_graph_entry_watchdog); @@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,  	tracing_stop();  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (ret)  		goto out; -	ret = trace_test_buffer(&max_tr, &count); +	ret = trace_test_buffer(&tr->max_buffer, &count);  	if (ret)  		goto out; @@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (ret)  		goto out; -	ret = trace_test_buffer(&max_tr, &count); +	ret = trace_test_buffer(&tr->max_buffer, &count);  	if (!ret && !count) {  		printk(KERN_CONT ".. no entries found .."); @@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	printk("ret = %d\n", ret);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr); @@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr  	/* stop the tracing. */  	tracing_stop();  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 42ca822fc70..b20428c5efe 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -20,13 +20,24 @@  #define STACK_TRACE_ENTRIES 500 +#ifdef CC_USING_FENTRY +# define fentry		1 +#else +# define fentry		0 +#endif +  static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =  	 { [0 ... 
(STACK_TRACE_ENTRIES)] = ULONG_MAX };  static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; +/* + * Reserve one entry for the passed in ip. This will allow + * us to remove most or all of the stack size overhead + * added by the stack tracer itself. + */  static struct stack_trace max_stack_trace = { -	.max_entries		= STACK_TRACE_ENTRIES, -	.entries		= stack_dump_trace, +	.max_entries		= STACK_TRACE_ENTRIES - 1, +	.entries		= &stack_dump_trace[1],  };  static unsigned long max_stack_size; @@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);  int stack_tracer_enabled;  static int last_stack_tracer_enabled; -static inline void check_stack(void) +static inline void +check_stack(unsigned long ip, unsigned long *stack)  {  	unsigned long this_size, flags;  	unsigned long *p, *top, *start; +	static int tracer_frame; +	int frame_size = ACCESS_ONCE(tracer_frame);  	int i; -	this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); +	this_size = ((unsigned long)stack) & (THREAD_SIZE-1);  	this_size = THREAD_SIZE - this_size; +	/* Remove the frame of the tracer */ +	this_size -= frame_size;  	if (this_size <= max_stack_size)  		return;  	/* we do not handle interrupt stacks yet */ -	if (!object_is_on_stack(&this_size)) +	if (!object_is_on_stack(stack))  		return;  	local_irq_save(flags);  	arch_spin_lock(&max_stack_lock); +	/* In case another CPU set the tracer_frame on us */ +	if (unlikely(!frame_size)) +		this_size -= tracer_frame; +  	/* a race could have already updated it */  	if (this_size <= max_stack_size)  		goto out; @@ -70,10 +90,18 @@ static inline void check_stack(void)  	save_stack_trace(&max_stack_trace);  	/* +	 * Add the passed in ip from the function tracer. +	 * Searching for this on the stack will skip over +	 * most of the overhead from the stack tracer itself. +	 */ +	stack_dump_trace[0] = ip; +	max_stack_trace.nr_entries++; + +	/*  	 * Now find where in the stack these are.  	 */  	i = 0; -	start = &this_size; +	start = stack;  	top = (unsigned long *)  		(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -97,6 +125,18 @@ static inline void check_stack(void)  				found = 1;  				/* Start the search from here */  				start = p + 1; +				/* +				 * We do not want to show the overhead +				 * of the stack tracer stack in the +				 * max stack. If we haven't figured +				 * out what that is, then figure it out +				 * now. +				 */ +				if (unlikely(!tracer_frame) && i == 1) { +					tracer_frame = (p - stack) * +						sizeof(unsigned long); +					max_stack_size -= tracer_frame; +				}  			}  		} @@ -113,6 +153,7 @@ static void  stack_trace_call(unsigned long ip, unsigned long parent_ip,  		 struct ftrace_ops *op, struct pt_regs *pt_regs)  { +	unsigned long stack;  	int cpu;  	preempt_disable_notrace(); @@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,  	if (per_cpu(trace_active, cpu)++ != 0)  		goto out; -	check_stack(); +	/* +	 * When fentry is used, the traced function does not get +	 * its stack frame set up, and we lose the parent. +	 * The ip is pretty useless because the function tracer +	 * was called before that function set up its stack frame. +	 * In this case, we use the parent ip. +	 * +	 * By adding the return address of either the parent ip +	 * or the current ip we can disregard most of the stack usage +	 * caused by the stack tracer itself. +	 * +	 * The function tracer always reports the address of where the +	 * mcount call was, but the stack will hold the return address. 
+	 */ +	if (fentry) +		ip = parent_ip; +	else +		ip += MCOUNT_INSN_SIZE; + +	check_stack(ip, &stack);   out:  	per_cpu(trace_active, cpu)--; @@ -322,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {  	.open = stack_trace_filter_open,  	.read = seq_read,  	.write = ftrace_filter_write, -	.llseek = ftrace_regex_lseek, +	.llseek = ftrace_filter_lseek,  	.release = ftrace_regex_release,  }; @@ -371,6 +431,8 @@ static __init int stack_trace_init(void)  	struct dentry *d_tracer;  	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0;  	trace_create_file("stack_max_size", 0644, d_tracer,  			&max_stack_size, &stack_max_size_fops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 96cffb269e7..847f88a6194 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -307,6 +307,8 @@ static int tracing_stat_init(void)  	struct dentry *d_tracing;  	d_tracing = tracing_init_dentry(); +	if (!d_tracing) +		return 0;  	stat_dir = debugfs_create_dir("trace_stat", d_tracing);  	if (!stat_dir) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7a809e32105..8f2ac73c7a5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -12,10 +12,6 @@  #include "trace.h"  static DEFINE_MUTEX(syscall_trace_lock); -static int sys_refcount_enter; -static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);  static int syscall_enter_register(struct ftrace_event_call *event,  				 enum trace_reg type, void *data); @@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name  	/*  	 * Only compare after the "sys" prefix. Archs that use  	 * syscall wrappers may have syscalls symbols aliases prefixed -	 * with "SyS" instead of "sys", leading to an unwanted +	 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted  	 * mismatch.  	 
*/  	return !strcmp(sym + 3, name + 3); @@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)  		kfree(call->print_fmt);  } -static int syscall_enter_define_fields(struct ftrace_event_call *call) +static int __init syscall_enter_define_fields(struct ftrace_event_call *call)  {  	struct syscall_trace_enter trace;  	struct syscall_metadata *meta = call->data; @@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)  	return ret;  } -static int syscall_exit_define_fields(struct ftrace_event_call *call) +static int __init syscall_exit_define_fields(struct ftrace_event_call *call)  {  	struct syscall_trace_exit trace;  	int ret; @@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)  	return ret;  } -static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  { +	struct trace_array *tr = data;  	struct syscall_trace_enter *entry;  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event; @@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return; -	if (!test_bit(syscall_nr, enabled_enter_syscalls)) +	if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))  		return;  	sys_data = syscall_nr_to_meta(syscall_nr); @@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; -	event = trace_current_buffer_lock_reserve(&buffer, +	buffer = tr->trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer,  			sys_data->enter_event->event.type, size, 0, 0);  	if (!event)  		return; @@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  		trace_current_buffer_unlock_commit(buffer, event, 0, 0);  } -static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)  { +	struct trace_array *tr = data;  	struct syscall_trace_exit *entry;  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event; @@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return; -	if (!test_bit(syscall_nr, enabled_exit_syscalls)) +	if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))  		return;  	sys_data = syscall_nr_to_meta(syscall_nr);  	if (!sys_data)  		return; -	event = trace_current_buffer_lock_reserve(&buffer, +	buffer = tr->trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer,  			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);  	if (!event)  		return; @@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  		trace_current_buffer_unlock_commit(buffer, event, 0, 0);  } -static int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_file *file, +				   struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int ret = 0;  	int num; @@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call)  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return -ENOSYS;  	mutex_lock(&syscall_trace_lock); -	if (!sys_refcount_enter) -		ret = 
register_trace_sys_enter(ftrace_syscall_enter, NULL); +	if (!tr->sys_refcount_enter) +		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);  	if (!ret) { -		set_bit(num, enabled_enter_syscalls); -		sys_refcount_enter++; +		set_bit(num, tr->enabled_enter_syscalls); +		tr->sys_refcount_enter++;  	}  	mutex_unlock(&syscall_trace_lock);  	return ret;  } -static void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_file *file, +				      struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int num;  	num = ((struct syscall_metadata *)call->data)->syscall_nr;  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return;  	mutex_lock(&syscall_trace_lock); -	sys_refcount_enter--; -	clear_bit(num, enabled_enter_syscalls); -	if (!sys_refcount_enter) -		unregister_trace_sys_enter(ftrace_syscall_enter, NULL); +	tr->sys_refcount_enter--; +	clear_bit(num, tr->enabled_enter_syscalls); +	if (!tr->sys_refcount_enter) +		unregister_trace_sys_enter(ftrace_syscall_enter, tr);  	mutex_unlock(&syscall_trace_lock);  } -static int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_file *file, +				  struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int ret = 0;  	int num; @@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call)  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return -ENOSYS;  	mutex_lock(&syscall_trace_lock); -	if (!sys_refcount_exit) -		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); +	if (!tr->sys_refcount_exit) +		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);  	if (!ret) { -		set_bit(num, enabled_exit_syscalls); -		sys_refcount_exit++; +		set_bit(num, tr->enabled_exit_syscalls); +		tr->sys_refcount_exit++;  	}  	mutex_unlock(&syscall_trace_lock);  	return ret;  } -static void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_file *file, +				     struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int num;  	num = ((struct syscall_metadata *)call->data)->syscall_nr;  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return;  	mutex_lock(&syscall_trace_lock); -	sys_refcount_exit--; -	clear_bit(num, enabled_exit_syscalls); -	if (!sys_refcount_exit) -		unregister_trace_sys_exit(ftrace_syscall_exit, NULL); +	tr->sys_refcount_exit--; +	clear_bit(num, tr->enabled_exit_syscalls); +	if (!tr->sys_refcount_exit) +		unregister_trace_sys_exit(ftrace_syscall_exit, tr);  	mutex_unlock(&syscall_trace_lock);  } @@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = {  	.trace		= print_syscall_exit,  }; -struct ftrace_event_class event_class_syscall_enter = { +struct ftrace_event_class __refdata event_class_syscall_enter = {  	.system		= "syscalls",  	.reg		= syscall_enter_register,  	.define_fields	= syscall_enter_define_fields, @@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = {  	.raw_init	= init_syscall_trace,  }; -struct ftrace_event_class event_class_syscall_exit = { +struct ftrace_event_class __refdata event_class_syscall_exit = {  	.system		= "syscalls",  	.reg		= syscall_exit_register,  	.define_fields	= syscall_exit_define_fields, @@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)  static int syscall_enter_register(struct ftrace_event_call *event,  				 enum trace_reg type, void *data)  { +	struct 
ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER: -		return reg_event_syscall_enter(event); +		return reg_event_syscall_enter(file, event);  	case TRACE_REG_UNREGISTER: -		unreg_event_syscall_enter(event); +		unreg_event_syscall_enter(file, event);  		return 0;  #ifdef CONFIG_PERF_EVENTS @@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,  static int syscall_exit_register(struct ftrace_event_call *event,  				 enum trace_reg type, void *data)  { +	struct ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER: -		return reg_event_syscall_exit(event); +		return reg_event_syscall_exit(file, event);  	case TRACE_REG_UNREGISTER: -		unreg_event_syscall_exit(event); +		unreg_event_syscall_exit(file, event);  		return 0;  #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8dad2a92dee..32494fb0ee6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -28,6 +28,18 @@  #define UPROBE_EVENT_SYSTEM	"uprobes" +struct uprobe_trace_entry_head { +	struct trace_entry	ent; +	unsigned long		vaddr[]; +}; + +#define SIZEOF_TRACE_ENTRY(is_return)			\ +	(sizeof(struct uprobe_trace_entry_head) +	\ +	 sizeof(unsigned long) * (is_return ? 2 : 1)) + +#define DATAOF_TRACE_ENTRY(entry, is_return)		\ +	((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) +  struct trace_uprobe_filter {  	rwlock_t		rwlock;  	int			nr_systemwide; @@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock);  static LIST_HEAD(uprobe_list);  static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uretprobe_dispatcher(struct uprobe_consumer *con, +				unsigned long func, struct pt_regs *regs);  static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)  { @@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)  	return !filter->nr_systemwide && list_empty(&filter->perf_events);  } +static inline bool is_ret_probe(struct trace_uprobe *tu) +{ +	return tu->consumer.ret_handler != NULL; +} +  /*   * Allocate new trace_uprobe and initialize it (including uprobes).   
*/  static struct trace_uprobe * -alloc_trace_uprobe(const char *group, const char *event, int nargs) +alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)  {  	struct trace_uprobe *tu; @@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)  	INIT_LIST_HEAD(&tu->list);  	tu->consumer.handler = uprobe_dispatcher; +	if (is_ret) +		tu->consumer.ret_handler = uretprobe_dispatcher;  	init_trace_uprobe_filter(&tu->filter);  	return tu; @@ -180,7 +201,7 @@ end:  /*   * Argument syntax: - *  - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] + *  - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS]   *   *  - Remove uprobe: -:[GRP/]EVENT   */ @@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv)  	char buf[MAX_EVENT_NAME_LEN];  	struct path path;  	unsigned long offset; -	bool is_delete; +	bool is_delete, is_return;  	int i, ret;  	inode = NULL;  	ret = 0;  	is_delete = false; +	is_return = false;  	event = NULL;  	group = NULL;  	/* argc must be >= 1 */  	if (argv[0][0] == '-')  		is_delete = true; +	else if (argv[0][0] == 'r') +		is_return = true;  	else if (argv[0][0] != 'p') { -		pr_info("Probe definition must be started with 'p' or '-'.\n"); +		pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");  		return -EINVAL;  	} @@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv)  		kfree(tail);  	} -	tu = alloc_trace_uprobe(group, event, argc); +	tu = alloc_trace_uprobe(group, event, argc, is_return);  	if (IS_ERR(tu)) {  		pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));  		ret = PTR_ERR(tu); @@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)  static int probes_seq_show(struct seq_file *m, void *v)  {  	struct trace_uprobe *tu = v; +	char c = is_ret_probe(tu) ? 
'r' : 'p';  	int i; -	seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); +	seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name);  	seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);  	for (i = 0; i < tu->nr_args; i++) @@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = {  	.release	= seq_release,  }; -/* uprobe handler */ -static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static void uprobe_trace_print(struct trace_uprobe *tu, +				unsigned long func, struct pt_regs *regs)  {  	struct uprobe_trace_entry_head *entry;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer; -	u8 *data; -	int size, i, pc; -	unsigned long irq_flags; +	void *data; +	int size, i;  	struct ftrace_event_call *call = &tu->call; -	local_save_flags(irq_flags); -	pc = preempt_count(); - -	size = sizeof(*entry) + tu->size; - +	size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));  	event = trace_current_buffer_lock_reserve(&buffer, call->event.type, -						  size, irq_flags, pc); +						  size + tu->size, 0, 0);  	if (!event) -		return 0; +		return;  	entry = ring_buffer_event_data(event); -	entry->ip = instruction_pointer(task_pt_regs(current)); -	data = (u8 *)&entry[1]; +	if (is_ret_probe(tu)) { +		entry->vaddr[0] = func; +		entry->vaddr[1] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, true); +	} else { +		entry->vaddr[0] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, false); +	} +  	for (i = 0; i < tu->nr_args; i++)  		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);  	if (!filter_current_check_discard(buffer, call, entry, event)) -		trace_buffer_unlock_commit(buffer, event, irq_flags, pc); +		trace_buffer_unlock_commit(buffer, event, 0, 0); +} +/* uprobe handler */ +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ +	if (!is_ret_probe(tu)) +		uprobe_trace_print(tu, 0, regs);  	return 0;  } +static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, +				struct pt_regs *regs) +{ +	uprobe_trace_print(tu, func, regs); +} +  /* Event entry printers */  static enum print_line_t  print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)  { -	struct uprobe_trace_entry_head *field; +	struct uprobe_trace_entry_head *entry;  	struct trace_seq *s = &iter->seq;  	struct trace_uprobe *tu;  	u8 *data;  	int i; -	field = (struct uprobe_trace_entry_head *)iter->ent; +	entry = (struct uprobe_trace_entry_head *)iter->ent;  	tu = container_of(event, struct trace_uprobe, call.event); -	if (!trace_seq_printf(s, "%s: (", tu->call.name)) -		goto partial; - -	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) -		goto partial; - -	if (!trace_seq_puts(s, ")")) -		goto partial; +	if (is_ret_probe(tu)) { +		if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, +					entry->vaddr[1], entry->vaddr[0])) +			goto partial; +		data = DATAOF_TRACE_ENTRY(entry, true); +	} else { +		if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, +					entry->vaddr[0])) +			goto partial; +		data = DATAOF_TRACE_ENTRY(entry, false); +	} -	data = (u8 *)&field[1];  	for (i = 0; i < tu->nr_args; i++) {  		if (!tu->args[i].type->print(s, tu->args[i].name, -					     data + tu->args[i].offset, field)) +					     data + tu->args[i].offset, entry))  			goto partial;  	} @@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag)  static int uprobe_event_define_fields(struct ftrace_event_call 
*event_call)  { -	int ret, i; +	int ret, i, size;  	struct uprobe_trace_entry_head field; -	struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; +	struct trace_uprobe *tu = event_call->data; -	DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); +	if (is_ret_probe(tu)) { +		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); +		DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0); +		size = SIZEOF_TRACE_ENTRY(true); +	} else { +		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); +		size = SIZEOF_TRACE_ENTRY(false); +	}  	/* Set argument names as fields */  	for (i = 0; i < tu->nr_args; i++) {  		ret = trace_define_field(event_call, tu->args[i].type->fmttype,  					 tu->args[i].name, -					 sizeof(field) + tu->args[i].offset, +					 size + tu->args[i].offset,  					 tu->args[i].type->size,  					 tu->args[i].type->is_signed,  					 FILTER_OTHER); @@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)  	int i;  	int pos = 0; -	fmt = "(%lx)"; -	arg = "REC->" FIELD_STRING_IP; +	if (is_ret_probe(tu)) { +		fmt = "(%lx <- %lx)"; +		arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; +	} else { +		fmt = "(%lx)"; +		arg = "REC->" FIELD_STRING_IP; +	}  	/* When len=0, we just calculate the needed length */ @@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,  	return ret;  } -/* uprobe profile handler */ -static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static void uprobe_perf_print(struct trace_uprobe *tu, +				unsigned long func, struct pt_regs *regs)  {  	struct ftrace_event_call *call = &tu->call;  	struct uprobe_trace_entry_head *entry;  	struct hlist_head *head; -	u8 *data; -	int size, __size, i; -	int rctx; +	void *data; +	int size, rctx, i; -	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) -		return UPROBE_HANDLER_REMOVE; - -	__size = sizeof(*entry) + tu->size; -	size = ALIGN(__size + sizeof(u32), sizeof(u64)); -	size -= sizeof(u32); +	size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); +	size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);  	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) -		return 0; +		return;  	preempt_disable(); +	head = this_cpu_ptr(call->perf_events); +	if (hlist_empty(head)) +		goto out;  	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);  	if (!entry)  		goto out; -	entry->ip = instruction_pointer(task_pt_regs(current)); -	data = (u8 *)&entry[1]; +	if (is_ret_probe(tu)) { +		entry->vaddr[0] = func; +		entry->vaddr[1] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, true); +	} else { +		entry->vaddr[0] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, false); +	} +  	for (i = 0; i < tu->nr_args; i++)  		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); -	head = this_cpu_ptr(call->perf_events); -	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); - +	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);   out:  	preempt_enable(); +} + +/* uprobe profile handler */ +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ +	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) +		return UPROBE_HANDLER_REMOVE; + +	if (!is_ret_probe(tu)) +		uprobe_perf_print(tu, 0, regs);  	return 0;  } + +static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, +				struct pt_regs *regs) +{ +	uprobe_perf_print(tu, func, regs); +}  #endif	/* 
CONFIG_PERF_EVENTS */  static  int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)  { -	struct trace_uprobe *tu = (struct trace_uprobe *)event->data; +	struct trace_uprobe *tu = event->data;  	switch (type) {  	case TRACE_REG_REGISTER: @@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)  	return ret;  } +static int uretprobe_dispatcher(struct uprobe_consumer *con, +				unsigned long func, struct pt_regs *regs) +{ +	struct trace_uprobe *tu; + +	tu = container_of(con, struct trace_uprobe, consumer); + +	if (tu->flags & TP_FLAG_TRACE) +		uretprobe_trace_func(tu, func, regs); + +#ifdef CONFIG_PERF_EVENTS +	if (tu->flags & TP_FLAG_PROFILE) +		uretprobe_perf_func(tu, func, regs); +#endif +	return 0; +} +  static struct trace_event_functions uprobe_funcs = {  	.trace		= print_uprobe_event  }; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 0c05a459204..29f26540e9c 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,  	int nr_probes = 0;  	struct tracepoint_func *old, *new; -	WARN_ON(!probe); +	if (WARN_ON(!probe)) +		return ERR_PTR(-EINVAL);  	debug_print_probes(entry);  	old = entry->funcs; @@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,  	debug_print_probes(entry);  	/* (N -> M), (N > 1, M >= 0) probes */ -	for (nr_probes = 0; old[nr_probes].func; nr_probes++) { -		if (!probe || -		    (old[nr_probes].func == probe && -		     old[nr_probes].data == data)) -			nr_del++; +	if (probe) { +		for (nr_probes = 0; old[nr_probes].func; nr_probes++) { +			if (old[nr_probes].func == probe && +			     old[nr_probes].data == data) +				nr_del++; +		}  	} +	/* +	 * If probe is NULL, then nr_probes = nr_del = 0, and then the +	 * entire entry will be removed. 
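For reference, the removal path above boils down to a count-then-copy scheme: count how many registered probes match (with a NULL probe matching everything), then either drop the whole entry or allocate a smaller array holding only the survivors. A minimal userspace-style sketch of the same logic, with a hypothetical func_entry type and malloc/free standing in for the kernel allocators:

#include <stdlib.h>

struct func_entry {
	void *func;
	void *data;
};

/*
 * Remove every element matching (func, data) from the NULL-terminated
 * array *funcs; a NULL func removes everything.  *funcs must be non-NULL.
 * Returns 0 on success, -1 if the smaller copy cannot be allocated.
 */
static int remove_probe(struct func_entry **funcs, void *func, void *data)
{
	struct func_entry *old = *funcs, *new;
	int nr_probes = 0, nr_del = 0, i, j = 0;

	if (func) {
		for (nr_probes = 0; old[nr_probes].func; nr_probes++)
			if (old[nr_probes].func == func &&
			    old[nr_probes].data == data)
				nr_del++;
	}

	/* func == NULL leaves nr_probes == nr_del == 0: drop everything */
	if (nr_probes - nr_del == 0) {
		*funcs = NULL;
		free(old);
		return 0;
	}

	new = malloc((nr_probes - nr_del + 1) * sizeof(*new));
	if (!new)
		return -1;
	for (i = 0; old[i].func; i++)
		if (old[i].func != func || old[i].data != data)
			new[j++] = old[i];
	new[j].func = NULL;	/* keep the NULL terminator */
	*funcs = new;
	free(old);
	return 0;
}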
+	 */  	if (nr_probes - nr_del == 0) {  		/* N -> 0, (N > 1) */  		entry->funcs = NULL; @@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,  		if (new == NULL)  			return ERR_PTR(-ENOMEM);  		for (i = 0; old[i].func; i++) -			if (probe && -			    (old[i].func != probe || old[i].data != data)) +			if (old[i].func != probe || old[i].data != data)  				new[j++] = old[i];  		new[nr_probes - nr_del].func = NULL;  		entry->refcount = nr_probes - nr_del; diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03..8e635a18ab5 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = {  	.owner = GLOBAL_ROOT_UID,  	.group = GLOBAL_ROOT_GID,  	.proc_inum = PROC_USER_INIT_INO, +	.may_mount_sysfs = true, +	.may_mount_proc = true,  };  EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8b650837083..e134d8f365d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,10 +21,12 @@  #include <linux/uaccess.h>  #include <linux/ctype.h>  #include <linux/projid.h> +#include <linux/fs_struct.h>  static struct kmem_cache *user_ns_cachep __read_mostly; -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file, +				struct user_namespace *ns, int cap_setid,  				struct uid_gid_map *map);  static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) @@ -60,6 +62,15 @@ int create_user_ns(struct cred *new)  	kgid_t group = new->egid;  	int ret; +	/* +	 * Verify that we can not violate the policy of which files +	 * may be accessed that is specified by the root directory, +	 * by verifing that the root directory is at the root of the +	 * mount namespace which allows all files to be accessed. +	 */ +	if (current_chrooted()) +		return -EPERM; +  	/* The creator needs a mapping in the parent user namespace  	 * or else we won't be able to reasonably tell userspace who  	 * created a user_namespace. @@ -86,6 +97,8 @@ int create_user_ns(struct cred *new)  	set_cred_user_ns(new, ns); +	update_mnt_policy(ns); +  	return 0;  } @@ -600,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf,  	if (map->nr_extents != 0)  		goto out; -	/* Require the appropriate privilege CAP_SETUID or CAP_SETGID -	 * over the user namespace in order to set the id mapping. +	/* +	 * Adjusting namespace settings requires capabilities on the target.  	 */ -	if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) +	if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))  		goto out;  	/* Get a buffer */ @@ -688,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,  	ret = -EPERM;  	/* Validate the user is allowed to use user id's mapped to. 
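The map_write() and new_idmap_permitted() changes above govern writes to /proc/<pid>/uid_map and gid_map: it is the opener's credentials, not the writer's, that get checked, and an unprivileged task may still map a single id onto its own fsuid. A rough userspace sketch of that unprivileged case (error handling trimmed; the written line uses the "inside outside count" format):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	uid_t outer_uid = getuid();	/* uid in the parent namespace */
	char buf[64];
	int fd, len;

	if (unshare(CLONE_NEWUSER))	/* new user namespace, no mappings yet */
		return 1;

	/* map uid 0 in the new namespace onto our old uid, length 1 */
	len = snprintf(buf, sizeof(buf), "0 %u 1\n", (unsigned)outer_uid);
	fd = open("/proc/self/uid_map", O_WRONLY);
	if (fd < 0 || write(fd, buf, len) != len)
		return 1;
	close(fd);

	/* from this point on, getuid() reports 0 inside the namespace */
	printf("uid inside namespace: %u\n", (unsigned)getuid());
	return 0;
}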
*/ -	if (!new_idmap_permitted(ns, cap_setid, &new_map)) +	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))  		goto out;  	/* Map the lower ids from the parent user namespace to the @@ -775,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t  			 &ns->projid_map, &ns->parent->projid_map);  } -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file,  +				struct user_namespace *ns, int cap_setid,  				struct uid_gid_map *new_map)  {  	/* Allow mapping to your own filesystem ids */ @@ -783,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,  		u32 id = new_map->extent[0].lower_first;  		if (cap_setid == CAP_SETUID) {  			kuid_t uid = make_kuid(ns->parent, id); -			if (uid_eq(uid, current_fsuid())) +			if (uid_eq(uid, file->f_cred->fsuid))  				return true;  		}  		else if (cap_setid == CAP_SETGID) {  			kgid_t gid = make_kgid(ns->parent, id); -			if (gid_eq(gid, current_fsgid())) +			if (gid_eq(gid, file->f_cred->fsgid))  				return true;  		}  	} @@ -799,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,  	/* Allow the specified ids if we have the appropriate capability  	 * (CAP_SETUID or CAP_SETGID) over the parent user namespace. +	 * And the opener of the id file also had the approprpiate capability.  	 */ -	if (ns_capable(ns->parent, cap_setid)) +	if (ns_capable(ns->parent, cap_setid) && +	    file_ns_capable(file, ns->parent, cap_setid))  		return true;  	return false; @@ -837,6 +853,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)  	if (atomic_read(¤t->mm->mm_users) > 1)  		return -EINVAL; +	if (current->fs->users != 1) +		return -EINVAL; +  	if (!ns_capable(user_ns, CAP_SYS_ADMIN))  		return -EPERM; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4a944676358..05039e348f0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -517,6 +517,11 @@ int proc_dowatchdog(struct ctl_table *table, int write,  		return ret;  	set_sample_period(); +	/* +	 * Watchdog threads shouldn't be enabled if they are +	 * disabled. The 'watchdog_disabled' variable check in +	 * watchdog_*_all_cpus() function takes care of this. +	 */  	if (watchdog_enabled && watchdog_thresh)  		watchdog_enable_all_cpus();  	else diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811e..154aa12af48 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,7 +41,11 @@  #include <linux/debug_locks.h>  #include <linux/lockdep.h>  #include <linux/idr.h> +#include <linux/jhash.h>  #include <linux/hashtable.h> +#include <linux/rculist.h> +#include <linux/nodemask.h> +#include <linux/moduleparam.h>  #include "workqueue_internal.h" @@ -58,12 +62,11 @@ enum {  	 * %WORKER_UNBOUND set and concurrency management disabled, and may  	 * be executing on any CPU.  The pool behaves as an unbound one.  	 * -	 * Note that DISASSOCIATED can be flipped only while holding -	 * assoc_mutex to avoid changing binding state while +	 * Note that DISASSOCIATED should be flipped only while holding +	 * manager_mutex to avoid changing binding state while  	 * create_worker() is in progress.  	 
*/  	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */ -	POOL_MANAGING_WORKERS   = 1 << 1,       /* managing workers */  	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */  	POOL_FREEZING		= 1 << 3,	/* freeze in progress */ @@ -74,12 +77,14 @@ enum {  	WORKER_PREP		= 1 << 3,	/* preparing to run works */  	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */  	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */ +	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */ -	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_UNBOUND | -				  WORKER_CPU_INTENSIVE, +	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE | +				  WORKER_UNBOUND | WORKER_REBOUND,  	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */ +	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */  	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */  	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */ @@ -97,6 +102,8 @@ enum {  	 */  	RESCUER_NICE_LEVEL	= -20,  	HIGHPRI_NICE_LEVEL	= -20, + +	WQ_NAME_LEN		= 24,  };  /* @@ -115,16 +122,26 @@ enum {   *    cpu or grabbing pool->lock is enough for read access.  If   *    POOL_DISASSOCIATED is set, it's identical to L.   * - * F: wq->flush_mutex protected. + * MG: pool->manager_mutex and pool->lock protected.  Writes require both + *     locks.  Reads can happen under either lock. + * + * PL: wq_pool_mutex protected. + * + * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads. + * + * WQ: wq->mutex protected.   * - * W: workqueue_lock protected. + * WR: wq->mutex protected for writes.  Sched-RCU protected for reads. + * + * MD: wq_mayday_lock protected.   */  /* struct worker is defined in workqueue_internal.h */  struct worker_pool {  	spinlock_t		lock;		/* the pool lock */ -	unsigned int		cpu;		/* I: the associated cpu */ +	int			cpu;		/* I: the associated cpu */ +	int			node;		/* I: the associated node ID */  	int			id;		/* I: pool ID */  	unsigned int		flags;		/* X: flags */ @@ -138,12 +155,18 @@ struct worker_pool {  	struct timer_list	idle_timer;	/* L: worker idle timeout */  	struct timer_list	mayday_timer;	/* L: SOS timer for workers */ -	/* workers are chained either in busy_hash or idle_list */ +	/* a workers is either on busy_hash or idle_list, or the manager */  	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);  						/* L: hash of busy workers */ -	struct mutex		assoc_mutex;	/* protect POOL_DISASSOCIATED */ -	struct ida		worker_ida;	/* L: for worker IDs */ +	/* see manage_workers() for details on the two manager mutexes */ +	struct mutex		manager_arb;	/* manager arbitration */ +	struct mutex		manager_mutex;	/* manager exclusion */ +	struct idr		worker_idr;	/* MG: worker IDs and iteration */ + +	struct workqueue_attrs	*attrs;		/* I: worker attributes */ +	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */ +	int			refcnt;		/* PL: refcnt for unbound pools */  	/*  	 * The current concurrency level.  As it's likely to be accessed @@ -151,6 +174,12 @@ struct worker_pool {  	 * cacheline.  	 */  	atomic_t		nr_running ____cacheline_aligned_in_smp; + +	/* +	 * Destruction of pool is sched-RCU protected to allow dereferences +	 * from get_work_pool(). 
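The sched-RCU rule mentioned above means a pool is never freed directly; its release is deferred past a sched-RCU grace period so that lockless readers such as get_work_pool() stay safe. A minimal sketch of what the deferred-free side of such a scheme looks like (release_pool() and the bare kfree() are illustrative, not the exact code in this series):

/* illustrative deferred free for a sched-RCU protected pool */
static void pool_rcu_free(struct rcu_head *rcu)
{
	struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);

	/* runs only after all sched-RCU readers of @pool have finished */
	kfree(pool);
}

static void release_pool(struct worker_pool *pool)
{
	/*
	 * Readers dereference pools with preemption disabled or under
	 * rcu_read_lock_sched(), so queue the kfree() behind a grace
	 * period instead of freeing immediately.
	 */
	call_rcu_sched(&pool->rcu, pool_rcu_free);
}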
+	 */ +	struct rcu_head		rcu;  } ____cacheline_aligned_in_smp;  /* @@ -164,75 +193,107 @@ struct pool_workqueue {  	struct workqueue_struct *wq;		/* I: the owning workqueue */  	int			work_color;	/* L: current color */  	int			flush_color;	/* L: flushing color */ +	int			refcnt;		/* L: reference count */  	int			nr_in_flight[WORK_NR_COLORS];  						/* L: nr of in_flight works */  	int			nr_active;	/* L: nr of active works */  	int			max_active;	/* L: max active works */  	struct list_head	delayed_works;	/* L: delayed works */ -}; +	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */ +	struct list_head	mayday_node;	/* MD: node on wq->maydays */ + +	/* +	 * Release of unbound pwq is punted to system_wq.  See put_pwq() +	 * and pwq_unbound_release_workfn() for details.  pool_workqueue +	 * itself is also sched-RCU protected so that the first pwq can be +	 * determined without grabbing wq->mutex. +	 */ +	struct work_struct	unbound_release_work; +	struct rcu_head		rcu; +} __aligned(1 << WORK_STRUCT_FLAG_BITS);  /*   * Structure used to wait for workqueue flush.   */  struct wq_flusher { -	struct list_head	list;		/* F: list of flushers */ -	int			flush_color;	/* F: flush color waiting for */ +	struct list_head	list;		/* WQ: list of flushers */ +	int			flush_color;	/* WQ: flush color waiting for */  	struct completion	done;		/* flush completion */  }; -/* - * All cpumasks are assumed to be always set on UP and thus can't be - * used to determine whether there's something to be done. - */ -#ifdef CONFIG_SMP -typedef cpumask_var_t mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask)	\ -	cpumask_test_and_set_cpu((cpu), (mask)) -#define mayday_clear_cpu(cpu, mask)		cpumask_clear_cpu((cpu), (mask)) -#define for_each_mayday_cpu(cpu, mask)		for_each_cpu((cpu), (mask)) -#define alloc_mayday_mask(maskp, gfp)		zalloc_cpumask_var((maskp), (gfp)) -#define free_mayday_mask(mask)			free_cpumask_var((mask)) -#else -typedef unsigned long mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask)	test_and_set_bit(0, &(mask)) -#define mayday_clear_cpu(cpu, mask)		clear_bit(0, &(mask)) -#define for_each_mayday_cpu(cpu, mask)		if ((cpu) = 0, (mask)) -#define alloc_mayday_mask(maskp, gfp)		true -#define free_mayday_mask(mask)			do { } while (0) -#endif +struct wq_device;  /* - * The externally visible workqueue abstraction is an array of - * per-CPU workqueues: + * The externally visible workqueue.  It relays the issued work items to + * the appropriate worker_pool through its pool_workqueues.   
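From a driver's point of view that relaying is invisible; the workqueue is still created and used through the usual calls. A minimal sketch of an unbound workqueue with a rescuer, which exercises the pwq and mayday machinery reworked in this patch (all names illustrative):

#include <linux/workqueue.h>
#include <linux/printk.h>

static void example_work_fn(struct work_struct *work)
{
	pr_info("running on an unbound worker\n");
}

static DECLARE_WORK(example_work, example_work_fn);

static int example_setup(void)
{
	struct workqueue_struct *wq;

	/* WQ_UNBOUND: served by the shared unbound pools */
	/* WQ_MEM_RECLAIM: guarantees a rescuer thread */
	wq = alloc_workqueue("example_unbound", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!wq)
		return -ENOMEM;

	queue_work(wq, &example_work);
	flush_workqueue(wq);
	destroy_workqueue(wq);
	return 0;
}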
*/  struct workqueue_struct { -	unsigned int		flags;		/* W: WQ_* flags */ -	union { -		struct pool_workqueue __percpu		*pcpu; -		struct pool_workqueue			*single; -		unsigned long				v; -	} pool_wq;				/* I: pwq's */ -	struct list_head	list;		/* W: list of all workqueues */ +	struct list_head	pwqs;		/* WR: all pwqs of this wq */ +	struct list_head	list;		/* PL: list of all workqueues */ -	struct mutex		flush_mutex;	/* protects wq flushing */ -	int			work_color;	/* F: current work color */ -	int			flush_color;	/* F: current flush color */ +	struct mutex		mutex;		/* protects this wq */ +	int			work_color;	/* WQ: current work color */ +	int			flush_color;	/* WQ: current flush color */  	atomic_t		nr_pwqs_to_flush; /* flush in progress */ -	struct wq_flusher	*first_flusher;	/* F: first flusher */ -	struct list_head	flusher_queue;	/* F: flush waiters */ -	struct list_head	flusher_overflow; /* F: flush overflow list */ +	struct wq_flusher	*first_flusher;	/* WQ: first flusher */ +	struct list_head	flusher_queue;	/* WQ: flush waiters */ +	struct list_head	flusher_overflow; /* WQ: flush overflow list */ -	mayday_mask_t		mayday_mask;	/* cpus requesting rescue */ +	struct list_head	maydays;	/* MD: pwqs requesting rescue */  	struct worker		*rescuer;	/* I: rescue worker */ -	int			nr_drainers;	/* W: drain in progress */ -	int			saved_max_active; /* W: saved pwq max_active */ +	int			nr_drainers;	/* WQ: drain in progress */ +	int			saved_max_active; /* WQ: saved pwq max_active */ + +	struct workqueue_attrs	*unbound_attrs;	/* WQ: only for unbound wqs */ +	struct pool_workqueue	*dfl_pwq;	/* WQ: only for unbound wqs */ + +#ifdef CONFIG_SYSFS +	struct wq_device	*wq_dev;	/* I: for sysfs interface */ +#endif  #ifdef CONFIG_LOCKDEP  	struct lockdep_map	lockdep_map;  #endif -	char			name[];		/* I: workqueue name */ +	char			name[WQ_NAME_LEN]; /* I: workqueue name */ + +	/* hot fields used during command issue, aligned to cacheline */ +	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */ +	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ +	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */  }; +static struct kmem_cache *pwq_cache; + +static int wq_numa_tbl_len;		/* highest possible NUMA node id + 1 */ +static cpumask_var_t *wq_numa_possible_cpumask; +					/* possible CPUs of each node */ + +static bool wq_disable_numa; +module_param_named(disable_numa, wq_disable_numa, bool, 0444); + +static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */ + +/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ +static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; + +static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */ +static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */ + +static LIST_HEAD(workqueues);		/* PL: list of all workqueues */ +static bool workqueue_freezing;		/* PL: have wqs started freezing? 
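Per the PL annotation, any walk of that list has to hold wq_pool_mutex; a hypothetical workqueue.c-internal helper would look roughly like this:

/* hypothetical debug helper, local to workqueue.c */
static void dump_workqueue_names(void)
{
	struct workqueue_struct *wq;

	mutex_lock(&wq_pool_mutex);	/* PL: protects the workqueues list */
	list_for_each_entry(wq, &workqueues, list)
		pr_info("workqueue %s: flags 0x%x\n", wq->name, wq->flags);
	mutex_unlock(&wq_pool_mutex);
}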
*/ + +/* the per-cpu worker pools */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], +				     cpu_worker_pools); + +static DEFINE_IDR(worker_pool_idr);	/* PR: idr of all pools */ + +/* PL: hash of all unbound pools keyed by pool->attrs */ +static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); + +/* I: attributes used when instantiating standard unbound pools on demand */ +static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; +  struct workqueue_struct *system_wq __read_mostly;  EXPORT_SYMBOL_GPL(system_wq);  struct workqueue_struct *system_highpri_wq __read_mostly; @@ -244,64 +305,87 @@ EXPORT_SYMBOL_GPL(system_unbound_wq);  struct workqueue_struct *system_freezable_wq __read_mostly;  EXPORT_SYMBOL_GPL(system_freezable_wq); +static int worker_thread(void *__worker); +static void copy_workqueue_attrs(struct workqueue_attrs *to, +				 const struct workqueue_attrs *from); +  #define CREATE_TRACE_POINTS  #include <trace/events/workqueue.h> -#define for_each_std_worker_pool(pool, cpu)				\ -	for ((pool) = &std_worker_pools(cpu)[0];			\ -	     (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) +#define assert_rcu_or_pool_mutex()					\ +	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\ +			   lockdep_is_held(&wq_pool_mutex),		\ +			   "sched RCU or wq_pool_mutex should be held") -#define for_each_busy_worker(worker, i, pool)				\ -	hash_for_each(pool->busy_hash, i, worker, hentry) +#define assert_rcu_or_wq_mutex(wq)					\ +	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\ +			   lockdep_is_held(&wq->mutex),			\ +			   "sched RCU or wq->mutex should be held") -static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, -				unsigned int sw) -{ -	if (cpu < nr_cpu_ids) { -		if (sw & 1) { -			cpu = cpumask_next(cpu, mask); -			if (cpu < nr_cpu_ids) -				return cpu; -		} -		if (sw & 2) -			return WORK_CPU_UNBOUND; -	} -	return WORK_CPU_END; -} +#ifdef CONFIG_LOCKDEP +#define assert_manager_or_pool_lock(pool)				\ +	WARN_ONCE(debug_locks &&					\ +		  !lockdep_is_held(&(pool)->manager_mutex) &&		\ +		  !lockdep_is_held(&(pool)->lock),			\ +		  "pool->manager_mutex or ->lock should be held") +#else +#define assert_manager_or_pool_lock(pool)	do { } while (0) +#endif -static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, -				 struct workqueue_struct *wq) -{ -	return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); -} +#define for_each_cpu_worker_pool(pool, cpu)				\ +	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\ +	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ +	     (pool)++) -/* - * CPU iterators +/** + * for_each_pool - iterate through all worker_pools in the system + * @pool: iteration cursor + * @pi: integer used for iteration   * - * An extra cpu number is defined using an invalid cpu number - * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any - * specific CPU.  The following iterators are similar to for_each_*_cpu() - * iterators but also considers the unbound CPU. + * This must be called either with wq_pool_mutex held or sched RCU read + * locked.  If the pool needs to be used beyond the locking in effect, the + * caller is responsible for guaranteeing that the pool stays online.   
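The sched-RCU half of that rule looks like the following in practice (again a hypothetical workqueue.c-internal helper; holding wq_pool_mutex instead would be equally valid):

/* hypothetical helper: count worker pools without taking wq_pool_mutex */
static int count_worker_pools(void)
{
	struct worker_pool *pool;
	int pi, nr = 0;

	rcu_read_lock_sched();		/* pins every pool in the idr */
	for_each_pool(pool, pi)
		nr++;
	rcu_read_unlock_sched();

	return nr;
}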
* - * for_each_wq_cpu()		: possible CPUs + WORK_CPU_UNBOUND - * for_each_online_wq_cpu()	: online CPUs + WORK_CPU_UNBOUND - * for_each_pwq_cpu()		: possible CPUs for bound workqueues, - *				  WORK_CPU_UNBOUND for unbound workqueues + * The if/else clause exists only for the lockdep assertion and can be + * ignored.   */ -#define for_each_wq_cpu(cpu)						\ -	for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3);		\ -	     (cpu) < WORK_CPU_END;					\ -	     (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) +#define for_each_pool(pool, pi)						\ +	idr_for_each_entry(&worker_pool_idr, pool, pi)			\ +		if (({ assert_rcu_or_pool_mutex(); false; })) { }	\ +		else -#define for_each_online_wq_cpu(cpu)					\ -	for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3);		\ -	     (cpu) < WORK_CPU_END;					\ -	     (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) +/** + * for_each_pool_worker - iterate through all workers of a worker_pool + * @worker: iteration cursor + * @wi: integer used for iteration + * @pool: worker_pool to iterate workers of + * + * This must be called with either @pool->manager_mutex or ->lock held. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. + */ +#define for_each_pool_worker(worker, wi, pool)				\ +	idr_for_each_entry(&(pool)->worker_idr, (worker), (wi))		\ +		if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ +		else -#define for_each_pwq_cpu(cpu, wq)					\ -	for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq));	\ -	     (cpu) < WORK_CPU_END;					\ -	     (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) +/** + * for_each_pwq - iterate through all pool_workqueues of the specified workqueue + * @pwq: iteration cursor + * @wq: the target workqueue + * + * This must be called either with wq->mutex held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. + */ +#define for_each_pwq(pwq, wq)						\ +	list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node)		\ +		if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\ +		else  #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -419,76 +503,35 @@ static inline void debug_work_activate(struct work_struct *work) { }  static inline void debug_work_deactivate(struct work_struct *work) { }  #endif -/* Serializes the accesses to the list of workqueues. */ -static DEFINE_SPINLOCK(workqueue_lock); -static LIST_HEAD(workqueues); -static bool workqueue_freezing;		/* W: have wqs started freezing? */ - -/* - * The CPU and unbound standard worker pools.  The unbound ones have - * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set. 
- */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], -				     cpu_std_worker_pools); -static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; - -/* idr of all pools */ -static DEFINE_MUTEX(worker_pool_idr_mutex); -static DEFINE_IDR(worker_pool_idr); - -static int worker_thread(void *__worker); - -static struct worker_pool *std_worker_pools(int cpu) -{ -	if (cpu != WORK_CPU_UNBOUND) -		return per_cpu(cpu_std_worker_pools, cpu); -	else -		return unbound_std_worker_pools; -} - -static int std_worker_pool_pri(struct worker_pool *pool) -{ -	return pool - std_worker_pools(pool->cpu); -} -  /* allocate ID and assign it to @pool */  static int worker_pool_assign_id(struct worker_pool *pool)  {  	int ret; -	mutex_lock(&worker_pool_idr_mutex); -	idr_pre_get(&worker_pool_idr, GFP_KERNEL); -	ret = idr_get_new(&worker_pool_idr, pool, &pool->id); -	mutex_unlock(&worker_pool_idr_mutex); +	lockdep_assert_held(&wq_pool_mutex); +	ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); +	if (ret >= 0) { +		pool->id = ret; +		return 0; +	}  	return ret;  } -/* - * Lookup worker_pool by id.  The idr currently is built during boot and - * never modified.  Don't worry about locking for now. +/** + * unbound_pwq_by_node - return the unbound pool_workqueue for the given node + * @wq: the target workqueue + * @node: the node ID + * + * This must be called either with pwq_lock held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online.   */ -static struct worker_pool *worker_pool_by_id(int pool_id) -{ -	return idr_find(&worker_pool_idr, pool_id); -} - -static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) +static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, +						  int node)  { -	struct worker_pool *pools = std_worker_pools(cpu); - -	return &pools[highpri]; -} - -static struct pool_workqueue *get_pwq(unsigned int cpu, -				      struct workqueue_struct *wq) -{ -	if (!(wq->flags & WQ_UNBOUND)) { -		if (likely(cpu < nr_cpu_ids)) -			return per_cpu_ptr(wq->pool_wq.pcpu, cpu); -	} else if (likely(cpu == WORK_CPU_UNBOUND)) -		return wq->pool_wq.single; -	return NULL; +	assert_rcu_or_wq_mutex(wq); +	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);  }  static unsigned int work_color_to_flags(int color) @@ -530,7 +573,7 @@ static int work_next_color(int color)  static inline void set_work_data(struct work_struct *work, unsigned long data,  				 unsigned long flags)  { -	BUG_ON(!work_pending(work)); +	WARN_ON_ONCE(!work_pending(work));  	atomic_long_set(&work->data, data | flags | work_static(work));  } @@ -582,13 +625,23 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)   * @work: the work item of interest   *   * Return the worker_pool @work was last associated with.  %NULL if none. + * + * Pools are created and destroyed under wq_pool_mutex, and allows read + * access under sched-RCU read lock.  As such, this function should be + * called under wq_pool_mutex or with preemption disabled. + * + * All fields of the returned pool are accessible as long as the above + * mentioned locking is in effect.  If the returned pool needs to be used + * beyond the critical section, the caller is responsible for ensuring the + * returned pool is and stays online.   
*/  static struct worker_pool *get_work_pool(struct work_struct *work)  {  	unsigned long data = atomic_long_read(&work->data); -	struct worker_pool *pool;  	int pool_id; +	assert_rcu_or_pool_mutex(); +  	if (data & WORK_STRUCT_PWQ)  		return ((struct pool_workqueue *)  			(data & WORK_STRUCT_WQ_DATA_MASK))->pool; @@ -597,9 +650,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)  	if (pool_id == WORK_OFFQ_POOL_NONE)  		return NULL; -	pool = worker_pool_by_id(pool_id); -	WARN_ON_ONCE(!pool); -	return pool; +	return idr_find(&worker_pool_idr, pool_id);  }  /** @@ -688,7 +739,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)  /* Do we have too many workers and should some go away? */  static bool too_many_workers(struct worker_pool *pool)  { -	bool managing = pool->flags & POOL_MANAGING_WORKERS; +	bool managing = mutex_is_locked(&pool->manager_arb);  	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */  	int nr_busy = pool->nr_workers - nr_idle; @@ -743,7 +794,7 @@ static void wake_up_worker(struct worker_pool *pool)   * CONTEXT:   * spin_lock_irq(rq->lock)   */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +void wq_worker_waking_up(struct task_struct *task, int cpu)  {  	struct worker *worker = kthread_data(task); @@ -768,8 +819,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)   * RETURNS:   * Worker task on @cpu to wake up, %NULL if none.   */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, -				       unsigned int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)  {  	struct worker *worker = kthread_data(task), *to_wakeup = NULL;  	struct worker_pool *pool; @@ -785,7 +835,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,  	pool = worker->pool;  	/* this can only happen on the local cpu */ -	BUG_ON(cpu != raw_smp_processor_id()); +	if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) +		return NULL;  	/*  	 * The counterpart of the following dec_and_test, implied mb, @@ -890,13 +941,12 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)   * recycled work item as currently executing and make it wait until the   * current execution finishes, introducing an unwanted dependency.   * - * This function checks the work item address, work function and workqueue - * to avoid false positives.  Note that this isn't complete as one may - * construct a work function which can introduce dependency onto itself - * through a recycled work item.  Well, if somebody wants to shoot oneself - * in the foot that badly, there's only so much we can do, and if such - * deadlock actually occurs, it should be easy to locate the culprit work - * function. + * This function checks the work item address and work function to avoid + * false positives.  Note that this isn't complete as one may construct a + * work function which can introduce dependency onto itself through a + * recycled work item.  Well, if somebody wants to shoot oneself in the + * foot that badly, there's only so much we can do, and if such deadlock + * actually occurs, it should be easy to locate the culprit work function.   *   * CONTEXT:   * spin_lock_irq(pool->lock). @@ -960,6 +1010,64 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,  		*nextp = n;  } +/** + * get_pwq - get an extra reference on the specified pool_workqueue + * @pwq: pool_workqueue to get + * + * Obtain an extra reference on @pwq.  
The caller should guarantee that + * @pwq has positive refcnt and be holding the matching pool->lock. + */ +static void get_pwq(struct pool_workqueue *pwq) +{ +	lockdep_assert_held(&pwq->pool->lock); +	WARN_ON_ONCE(pwq->refcnt <= 0); +	pwq->refcnt++; +} + +/** + * put_pwq - put a pool_workqueue reference + * @pwq: pool_workqueue to put + * + * Drop a reference of @pwq.  If its refcnt reaches zero, schedule its + * destruction.  The caller should be holding the matching pool->lock. + */ +static void put_pwq(struct pool_workqueue *pwq) +{ +	lockdep_assert_held(&pwq->pool->lock); +	if (likely(--pwq->refcnt)) +		return; +	if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) +		return; +	/* +	 * @pwq can't be released under pool->lock, bounce to +	 * pwq_unbound_release_workfn().  This never recurses on the same +	 * pool->lock as this path is taken only for unbound workqueues and +	 * the release work item is scheduled on a per-cpu workqueue.  To +	 * avoid lockdep warning, unbound pool->locks are given lockdep +	 * subclass of 1 in get_unbound_pool(). +	 */ +	schedule_work(&pwq->unbound_release_work); +} + +/** + * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock + * @pwq: pool_workqueue to put (can be %NULL) + * + * put_pwq() with locking.  This function also allows %NULL @pwq. + */ +static void put_pwq_unlocked(struct pool_workqueue *pwq) +{ +	if (pwq) { +		/* +		 * As both pwqs and pools are sched-RCU protected, the +		 * following lock operations are safe. +		 */ +		spin_lock_irq(&pwq->pool->lock); +		put_pwq(pwq); +		spin_unlock_irq(&pwq->pool->lock); +	} +} +  static void pwq_activate_delayed_work(struct work_struct *work)  {  	struct pool_workqueue *pwq = get_work_pwq(work); @@ -991,9 +1099,9 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq)   */  static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)  { -	/* ignore uncolored works */ +	/* uncolored work items don't participate in flushing or nr_active */  	if (color == WORK_NO_COLOR) -		return; +		goto out_put;  	pwq->nr_in_flight[color]--; @@ -1006,11 +1114,11 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)  	/* is flush in progress and are we at the flushing tip? */  	if (likely(pwq->flush_color != color)) -		return; +		goto out_put;  	/* are there still in-flight works? */  	if (pwq->nr_in_flight[color]) -		return; +		goto out_put;  	/* this pwq is done, clear flush_color */  	pwq->flush_color = -1; @@ -1021,6 +1129,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)  	 */  	if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))  		complete(&pwq->wq->first_flusher->done); +out_put: +	put_pwq(pwq);  }  /** @@ -1143,11 +1253,12 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,  	/* we own @work, set data and link */  	set_work_pwq(work, pwq, extra_flags);  	list_add_tail(&work->entry, head); +	get_pwq(pwq);  	/* -	 * Ensure either worker_sched_deactivated() sees the above -	 * list_add_tail() or we see zero nr_running to avoid workers -	 * lying around lazily while there are works to be processed. +	 * Ensure either wq_worker_sleeping() sees the above +	 * list_add_tail() or we see zero nr_running to avoid workers lying +	 * around lazily while there are works to be processed.  	 
*/  	smp_mb(); @@ -1171,10 +1282,11 @@ static bool is_chained_work(struct workqueue_struct *wq)  	return worker && worker->current_pwq->wq == wq;  } -static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, +static void __queue_work(int cpu, struct workqueue_struct *wq,  			 struct work_struct *work)  {  	struct pool_workqueue *pwq; +	struct worker_pool *last_pool;  	struct list_head *worklist;  	unsigned int work_flags;  	unsigned int req_cpu = cpu; @@ -1190,48 +1302,62 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,  	debug_work_activate(work);  	/* if dying, only works from the same workqueue are allowed */ -	if (unlikely(wq->flags & WQ_DRAINING) && +	if (unlikely(wq->flags & __WQ_DRAINING) &&  	    WARN_ON_ONCE(!is_chained_work(wq)))  		return; +retry: +	if (req_cpu == WORK_CPU_UNBOUND) +		cpu = raw_smp_processor_id(); -	/* determine the pwq to use */ -	if (!(wq->flags & WQ_UNBOUND)) { -		struct worker_pool *last_pool; - -		if (cpu == WORK_CPU_UNBOUND) -			cpu = raw_smp_processor_id(); - -		/* -		 * It's multi cpu.  If @work was previously on a different -		 * cpu, it might still be running there, in which case the -		 * work needs to be queued on that cpu to guarantee -		 * non-reentrancy. -		 */ -		pwq = get_pwq(cpu, wq); -		last_pool = get_work_pool(work); +	/* pwq which will be used unless @work is executing elsewhere */ +	if (!(wq->flags & WQ_UNBOUND)) +		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); +	else +		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); -		if (last_pool && last_pool != pwq->pool) { -			struct worker *worker; +	/* +	 * If @work was previously on a different pool, it might still be +	 * running there, in which case the work needs to be queued on that +	 * pool to guarantee non-reentrancy. +	 */ +	last_pool = get_work_pool(work); +	if (last_pool && last_pool != pwq->pool) { +		struct worker *worker; -			spin_lock(&last_pool->lock); +		spin_lock(&last_pool->lock); -			worker = find_worker_executing_work(last_pool, work); +		worker = find_worker_executing_work(last_pool, work); -			if (worker && worker->current_pwq->wq == wq) { -				pwq = get_pwq(last_pool->cpu, wq); -			} else { -				/* meh... not running there, queue here */ -				spin_unlock(&last_pool->lock); -				spin_lock(&pwq->pool->lock); -			} +		if (worker && worker->current_pwq->wq == wq) { +			pwq = worker->current_pwq;  		} else { +			/* meh... not running there, queue here */ +			spin_unlock(&last_pool->lock);  			spin_lock(&pwq->pool->lock);  		}  	} else { -		pwq = get_pwq(WORK_CPU_UNBOUND, wq);  		spin_lock(&pwq->pool->lock);  	} +	/* +	 * pwq is determined and locked.  For unbound pools, we could have +	 * raced with pwq release and it could already be dead.  If its +	 * refcnt is zero, repeat pwq selection.  Note that pwqs never die +	 * without another pwq replacing it in the numa_pwq_tbl or while +	 * work items are executing on it, so the retrying is guaranteed to +	 * make forward-progress. 
+	 */ +	if (unlikely(!pwq->refcnt)) { +		if (wq->flags & WQ_UNBOUND) { +			spin_unlock(&pwq->pool->lock); +			cpu_relax(); +			goto retry; +		} +		/* oops */ +		WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", +			  wq->name, cpu); +	} +  	/* pwq determined, queue */  	trace_workqueue_queue_work(req_cpu, pwq, work); @@ -1286,22 +1412,6 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,  }  EXPORT_SYMBOL_GPL(queue_work_on); -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns %false if @work was already on a queue, %true otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. - */ -bool queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ -	return queue_work_on(WORK_CPU_UNBOUND, wq, work); -} -EXPORT_SYMBOL_GPL(queue_work); -  void delayed_work_timer_fn(unsigned long __data)  {  	struct delayed_work *dwork = (struct delayed_work *)__data; @@ -1377,21 +1487,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,  EXPORT_SYMBOL_GPL(queue_delayed_work_on);  /** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Equivalent to queue_delayed_work_on() but tries to use the local CPU. - */ -bool queue_delayed_work(struct workqueue_struct *wq, -			struct delayed_work *dwork, unsigned long delay) -{ -	return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(queue_delayed_work); - -/**   * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU   * @cpu: CPU number to execute work on   * @wq: workqueue to use @@ -1430,21 +1525,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,  EXPORT_SYMBOL_GPL(mod_delayed_work_on);  /** - * mod_delayed_work - modify delay of or queue a delayed work - * @wq: workqueue to use - * @dwork: work to queue - * @delay: number of jiffies to wait before queueing - * - * mod_delayed_work_on() on local CPU. 
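The out-of-line queue_work(), queue_delayed_work() and mod_delayed_work() definitions being dropped here do not disappear from the API; consistent with the bodies shown above, they live on as thin static inline wrappers over the *_on() variants. A sketch of the equivalents (the header change itself is not part of this hunk):

static inline bool queue_work(struct workqueue_struct *wq,
			      struct work_struct *work)
{
	return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}

static inline bool queue_delayed_work(struct workqueue_struct *wq,
				      struct delayed_work *dwork,
				      unsigned long delay)
{
	return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}

static inline bool mod_delayed_work(struct workqueue_struct *wq,
				    struct delayed_work *dwork,
				    unsigned long delay)
{
	return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}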
- */ -bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, -		      unsigned long delay) -{ -	return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(mod_delayed_work); - -/**   * worker_enter_idle - enter idle state   * @worker: worker which is entering idle state   * @@ -1458,9 +1538,10 @@ static void worker_enter_idle(struct worker *worker)  {  	struct worker_pool *pool = worker->pool; -	BUG_ON(worker->flags & WORKER_IDLE); -	BUG_ON(!list_empty(&worker->entry) && -	       (worker->hentry.next || worker->hentry.pprev)); +	if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || +	    WARN_ON_ONCE(!list_empty(&worker->entry) && +			 (worker->hentry.next || worker->hentry.pprev))) +		return;  	/* can't use worker_set_flags(), also called from start_worker() */  	worker->flags |= WORKER_IDLE; @@ -1497,22 +1578,25 @@ static void worker_leave_idle(struct worker *worker)  {  	struct worker_pool *pool = worker->pool; -	BUG_ON(!(worker->flags & WORKER_IDLE)); +	if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) +		return;  	worker_clr_flags(worker, WORKER_IDLE);  	pool->nr_idle--;  	list_del_init(&worker->entry);  }  /** - * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool - * @worker: self + * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it + * @pool: target worker_pool + * + * Bind %current to the cpu of @pool if it is associated and lock @pool.   *   * Works which are scheduled while the cpu is online must at least be   * scheduled to a worker which is bound to the cpu so that if they are   * flushed from cpu callbacks while cpu is going down, they are   * guaranteed to execute on the cpu.   * - * This function is to be used by rogue workers and rescuers to bind + * This function is to be used by unbound workers and rescuers to bind   * themselves to the target cpu and may race with cpu going down or   * coming online.  kthread_bind() can't be used because it may put the   * worker to already dead cpu and set_cpus_allowed_ptr() can't be used @@ -1533,12 +1617,9 @@ static void worker_leave_idle(struct worker *worker)   * %true if the associated pool is online (@worker is successfully   * bound), %false if offline.   */ -static bool worker_maybe_bind_and_lock(struct worker *worker) +static bool worker_maybe_bind_and_lock(struct worker_pool *pool)  __acquires(&pool->lock)  { -	struct worker_pool *pool = worker->pool; -	struct task_struct *task = worker->task; -  	while (true) {  		/*  		 * The following call may fail, succeed or succeed @@ -1547,14 +1628,13 @@ __acquires(&pool->lock)  		 * against POOL_DISASSOCIATED.  		 */  		if (!(pool->flags & POOL_DISASSOCIATED)) -			set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); +			set_cpus_allowed_ptr(current, pool->attrs->cpumask);  		spin_lock_irq(&pool->lock);  		if (pool->flags & POOL_DISASSOCIATED)  			return false; -		if (task_cpu(task) == pool->cpu && -		    cpumask_equal(¤t->cpus_allowed, -				  get_cpu_mask(pool->cpu))) +		if (task_cpu(current) == pool->cpu && +		    cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask))  			return true;  		spin_unlock_irq(&pool->lock); @@ -1569,108 +1649,6 @@ __acquires(&pool->lock)  	}  } -/* - * Rebind an idle @worker to its CPU.  worker_thread() will test - * list_empty(@worker->entry) before leaving idle and call this function. 
- */ -static void idle_worker_rebind(struct worker *worker) -{ -	/* CPU may go down again inbetween, clear UNBOUND only on success */ -	if (worker_maybe_bind_and_lock(worker)) -		worker_clr_flags(worker, WORKER_UNBOUND); - -	/* rebind complete, become available again */ -	list_add(&worker->entry, &worker->pool->idle_list); -	spin_unlock_irq(&worker->pool->lock); -} - -/* - * Function for @worker->rebind.work used to rebind unbound busy workers to - * the associated cpu which is coming back online.  This is scheduled by - * cpu up but can race with other cpu hotplug operations and may be - * executed twice without intervening cpu down. - */ -static void busy_worker_rebind_fn(struct work_struct *work) -{ -	struct worker *worker = container_of(work, struct worker, rebind_work); - -	if (worker_maybe_bind_and_lock(worker)) -		worker_clr_flags(worker, WORKER_UNBOUND); - -	spin_unlock_irq(&worker->pool->lock); -} - -/** - * rebind_workers - rebind all workers of a pool to the associated CPU - * @pool: pool of interest - * - * @pool->cpu is coming online.  Rebind all workers to the CPU.  Rebinding - * is different for idle and busy ones. - * - * Idle ones will be removed from the idle_list and woken up.  They will - * add themselves back after completing rebind.  This ensures that the - * idle_list doesn't contain any unbound workers when re-bound busy workers - * try to perform local wake-ups for concurrency management. - * - * Busy workers can rebind after they finish their current work items. - * Queueing the rebind work item at the head of the scheduled list is - * enough.  Note that nr_running will be properly bumped as busy workers - * rebind. - * - * On return, all non-manager workers are scheduled for rebind - see - * manage_workers() for the manager special case.  Any idle worker - * including the manager will not appear on @idle_list until rebind is - * complete, making local wake-ups safe. - */ -static void rebind_workers(struct worker_pool *pool) -{ -	struct worker *worker, *n; -	int i; - -	lockdep_assert_held(&pool->assoc_mutex); -	lockdep_assert_held(&pool->lock); - -	/* dequeue and kick idle ones */ -	list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { -		/* -		 * idle workers should be off @pool->idle_list until rebind -		 * is complete to avoid receiving premature local wake-ups. -		 */ -		list_del_init(&worker->entry); - -		/* -		 * worker_thread() will see the above dequeuing and call -		 * idle_worker_rebind(). -		 */ -		wake_up_process(worker->task); -	} - -	/* rebind busy workers */ -	for_each_busy_worker(worker, i, pool) { -		struct work_struct *rebind_work = &worker->rebind_work; -		struct workqueue_struct *wq; - -		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, -				     work_data_bits(rebind_work))) -			continue; - -		debug_work_activate(rebind_work); - -		/* -		 * wq doesn't really matter but let's keep @worker->pool -		 * and @pwq->pool consistent for sanity. 
-		 */ -		if (std_worker_pool_pri(worker->pool)) -			wq = system_highpri_wq; -		else -			wq = system_wq; - -		insert_work(get_pwq(pool->cpu, wq), rebind_work, -			    worker->scheduled.next, -			    work_color_to_flags(WORK_NO_COLOR)); -	} -} -  static struct worker *alloc_worker(void)  {  	struct worker *worker; @@ -1679,7 +1657,6 @@ static struct worker *alloc_worker(void)  	if (worker) {  		INIT_LIST_HEAD(&worker->entry);  		INIT_LIST_HEAD(&worker->scheduled); -		INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);  		/* on creation a worker is in !idle && prep state */  		worker->flags = WORKER_PREP;  	} @@ -1702,18 +1679,25 @@ static struct worker *alloc_worker(void)   */  static struct worker *create_worker(struct worker_pool *pool)  { -	const char *pri = std_worker_pool_pri(pool) ? "H" : "";  	struct worker *worker = NULL;  	int id = -1; +	char id_buf[16]; +	lockdep_assert_held(&pool->manager_mutex); + +	/* +	 * ID is needed to determine kthread name.  Allocate ID first +	 * without installing the pointer. +	 */ +	idr_preload(GFP_KERNEL);  	spin_lock_irq(&pool->lock); -	while (ida_get_new(&pool->worker_ida, &id)) { -		spin_unlock_irq(&pool->lock); -		if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) -			goto fail; -		spin_lock_irq(&pool->lock); -	} + +	id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); +  	spin_unlock_irq(&pool->lock); +	idr_preload_end(); +	if (id < 0) +		goto fail;  	worker = alloc_worker();  	if (!worker) @@ -1722,40 +1706,46 @@ static struct worker *create_worker(struct worker_pool *pool)  	worker->pool = pool;  	worker->id = id; -	if (pool->cpu != WORK_CPU_UNBOUND) -		worker->task = kthread_create_on_node(worker_thread, -					worker, cpu_to_node(pool->cpu), -					"kworker/%u:%d%s", pool->cpu, id, pri); +	if (pool->cpu >= 0) +		snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, +			 pool->attrs->nice < 0  ? "H" : "");  	else -		worker->task = kthread_create(worker_thread, worker, -					      "kworker/u:%d%s", id, pri); +		snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); + +	worker->task = kthread_create_on_node(worker_thread, worker, pool->node, +					      "kworker/%s", id_buf);  	if (IS_ERR(worker->task))  		goto fail; -	if (std_worker_pool_pri(pool)) -		set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); +	/* +	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any +	 * online CPUs.  It'll be re-applied when any of the CPUs come up. +	 */ +	set_user_nice(worker->task, pool->attrs->nice); +	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + +	/* prevent userland from meddling with cpumask of workqueue workers */ +	worker->task->flags |= PF_NO_SETAFFINITY;  	/* -	 * Determine CPU binding of the new worker depending on -	 * %POOL_DISASSOCIATED.  The caller is responsible for ensuring the -	 * flag remains stable across this function.  See the comments -	 * above the flag definition for details. -	 * -	 * As an unbound worker may later become a regular one if CPU comes -	 * online, make sure every worker has %PF_THREAD_BOUND set. +	 * The caller is responsible for ensuring %POOL_DISASSOCIATED +	 * remains stable across this function.  See the comments above the +	 * flag definition for details.  	 
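The ID handling in the create_worker() hunk above follows the reserve-then-commit idiom of the idr API used throughout this series: preload outside the spinlock, allocate a NULL slot under it, and only publish the real pointer with idr_replace() once setup can no longer fail. Stripped of the worker specifics, the pattern is roughly:

/* generic sketch of the preload + alloc-NULL + replace idiom */
static int publish_object(struct idr *idr, spinlock_t *lock, void *obj,
			  int *out_id)
{
	int id;

	idr_preload(GFP_KERNEL);		/* may sleep, preallocates */
	spin_lock_irq(lock);
	id = idr_alloc(idr, NULL, 0, 0, GFP_NOWAIT);	/* reserve a slot */
	spin_unlock_irq(lock);
	idr_preload_end();
	if (id < 0)
		return id;

	/* ... set up @obj using @id; this part may sleep or fail ... */

	spin_lock_irq(lock);
	idr_replace(idr, obj, id);		/* commit: lookups now see @obj */
	spin_unlock_irq(lock);

	*out_id = id;
	return 0;
}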
*/ -	if (!(pool->flags & POOL_DISASSOCIATED)) { -		kthread_bind(worker->task, pool->cpu); -	} else { -		worker->task->flags |= PF_THREAD_BOUND; +	if (pool->flags & POOL_DISASSOCIATED)  		worker->flags |= WORKER_UNBOUND; -	} + +	/* successful, commit the pointer to idr */ +	spin_lock_irq(&pool->lock); +	idr_replace(&pool->worker_idr, worker, worker->id); +	spin_unlock_irq(&pool->lock);  	return worker; +  fail:  	if (id >= 0) {  		spin_lock_irq(&pool->lock); -		ida_remove(&pool->worker_ida, id); +		idr_remove(&pool->worker_idr, id);  		spin_unlock_irq(&pool->lock);  	}  	kfree(worker); @@ -1780,6 +1770,30 @@ static void start_worker(struct worker *worker)  }  /** + * create_and_start_worker - create and start a worker for a pool + * @pool: the target pool + * + * Grab the managership of @pool and create and start a new worker for it. + */ +static int create_and_start_worker(struct worker_pool *pool) +{ +	struct worker *worker; + +	mutex_lock(&pool->manager_mutex); + +	worker = create_worker(pool); +	if (worker) { +		spin_lock_irq(&pool->lock); +		start_worker(worker); +		spin_unlock_irq(&pool->lock); +	} + +	mutex_unlock(&pool->manager_mutex); + +	return worker ? 0 : -ENOMEM; +} + +/**   * destroy_worker - destroy a workqueue worker   * @worker: worker to be destroyed   * @@ -1791,11 +1805,14 @@ static void start_worker(struct worker *worker)  static void destroy_worker(struct worker *worker)  {  	struct worker_pool *pool = worker->pool; -	int id = worker->id; + +	lockdep_assert_held(&pool->manager_mutex); +	lockdep_assert_held(&pool->lock);  	/* sanity check frenzy */ -	BUG_ON(worker->current_work); -	BUG_ON(!list_empty(&worker->scheduled)); +	if (WARN_ON(worker->current_work) || +	    WARN_ON(!list_empty(&worker->scheduled))) +		return;  	if (worker->flags & WORKER_STARTED)  		pool->nr_workers--; @@ -1805,13 +1822,14 @@ static void destroy_worker(struct worker *worker)  	list_del_init(&worker->entry);  	worker->flags |= WORKER_DIE; +	idr_remove(&pool->worker_idr, worker->id); +  	spin_unlock_irq(&pool->lock);  	kthread_stop(worker->task);  	kfree(worker);  	spin_lock_irq(&pool->lock); -	ida_remove(&pool->worker_ida, id);  }  static void idle_worker_timeout(unsigned long __pool) @@ -1840,23 +1858,21 @@ static void idle_worker_timeout(unsigned long __pool)  	spin_unlock_irq(&pool->lock);  } -static bool send_mayday(struct work_struct *work) +static void send_mayday(struct work_struct *work)  {  	struct pool_workqueue *pwq = get_work_pwq(work);  	struct workqueue_struct *wq = pwq->wq; -	unsigned int cpu; -	if (!(wq->flags & WQ_RESCUER)) -		return false; +	lockdep_assert_held(&wq_mayday_lock); + +	if (!wq->rescuer) +		return;  	/* mayday mayday mayday */ -	cpu = pwq->pool->cpu; -	/* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ -	if (cpu == WORK_CPU_UNBOUND) -		cpu = 0; -	if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) +	if (list_empty(&pwq->mayday_node)) { +		list_add_tail(&pwq->mayday_node, &wq->maydays);  		wake_up_process(wq->rescuer->task); -	return true; +	}  }  static void pool_mayday_timeout(unsigned long __pool) @@ -1864,7 +1880,8 @@ static void pool_mayday_timeout(unsigned long __pool)  	struct worker_pool *pool = (void *)__pool;  	struct work_struct *work; -	spin_lock_irq(&pool->lock); +	spin_lock_irq(&wq_mayday_lock);		/* for wq->maydays */ +	spin_lock(&pool->lock);  	if (need_to_create_worker(pool)) {  		/* @@ -1877,7 +1894,8 @@ static void pool_mayday_timeout(unsigned long __pool)  			send_mayday(work);  	} -	spin_unlock_irq(&pool->lock); +	
spin_unlock(&pool->lock); +	spin_unlock_irq(&wq_mayday_lock);  	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);  } @@ -1892,8 +1910,8 @@ static void pool_mayday_timeout(unsigned long __pool)   * sent to all rescuers with works scheduled on @pool to resolve   * possible allocation deadlock.   * - * On return, need_to_create_worker() is guaranteed to be false and - * may_start_working() true. + * On return, need_to_create_worker() is guaranteed to be %false and + * may_start_working() %true.   *   * LOCKING:   * spin_lock_irq(pool->lock) which may be released and regrabbed @@ -1901,7 +1919,7 @@ static void pool_mayday_timeout(unsigned long __pool)   * manager.   *   * RETURNS: - * false if no action was taken and pool->lock stayed locked, true + * %false if no action was taken and pool->lock stayed locked, %true   * otherwise.   */  static bool maybe_create_worker(struct worker_pool *pool) @@ -1924,7 +1942,8 @@ restart:  			del_timer_sync(&pool->mayday_timer);  			spin_lock_irq(&pool->lock);  			start_worker(worker); -			BUG_ON(need_to_create_worker(pool)); +			if (WARN_ON_ONCE(need_to_create_worker(pool))) +				goto restart;  			return true;  		} @@ -1957,7 +1976,7 @@ restart:   * multiple times.  Called only from manager.   *   * RETURNS: - * false if no action was taken and pool->lock stayed locked, true + * %false if no action was taken and pool->lock stayed locked, %true   * otherwise.   */  static bool maybe_destroy_workers(struct worker_pool *pool) @@ -2008,42 +2027,37 @@ static bool manage_workers(struct worker *worker)  	struct worker_pool *pool = worker->pool;  	bool ret = false; -	if (pool->flags & POOL_MANAGING_WORKERS) +	/* +	 * Managership is governed by two mutexes - manager_arb and +	 * manager_mutex.  manager_arb handles arbitration of manager role. +	 * Anyone who successfully grabs manager_arb wins the arbitration +	 * and becomes the manager.  mutex_trylock() on pool->manager_arb +	 * failure while holding pool->lock reliably indicates that someone +	 * else is managing the pool and the worker which failed trylock +	 * can proceed to executing work items.  This means that anyone +	 * grabbing manager_arb is responsible for actually performing +	 * manager duties.  If manager_arb is grabbed and released without +	 * actual management, the pool may stall indefinitely. +	 * +	 * manager_mutex is used for exclusion of actual management +	 * operations.  The holder of manager_mutex can be sure that none +	 * of management operations, including creation and destruction of +	 * workers, won't take place until the mutex is released.  Because +	 * manager_mutex doesn't interfere with manager role arbitration, +	 * it is guaranteed that the pool's management, while may be +	 * delayed, won't be disturbed by someone else grabbing +	 * manager_mutex. +	 */ +	if (!mutex_trylock(&pool->manager_arb))  		return ret; -	pool->flags |= POOL_MANAGING_WORKERS; -  	/* -	 * To simplify both worker management and CPU hotplug, hold off -	 * management while hotplug is in progress.  CPU hotplug path can't -	 * grab %POOL_MANAGING_WORKERS to achieve this because that can -	 * lead to idle worker depletion (all become busy thinking someone -	 * else is managing) which in turn can result in deadlock under -	 * extreme circumstances.  Use @pool->assoc_mutex to synchronize -	 * manager against CPU hotplug. -	 * -	 * assoc_mutex would always be free unless CPU hotplug is in -	 * progress.  trylock first without dropping @pool->lock. 
+	 * With manager arbitration won, manager_mutex would be free in +	 * most cases.  trylock first without dropping @pool->lock.  	 */ -	if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { +	if (unlikely(!mutex_trylock(&pool->manager_mutex))) {  		spin_unlock_irq(&pool->lock); -		mutex_lock(&pool->assoc_mutex); -		/* -		 * CPU hotplug could have happened while we were waiting -		 * for assoc_mutex.  Hotplug itself can't handle us -		 * because manager isn't either on idle or busy list, and -		 * @pool's state and ours could have deviated. -		 * -		 * As hotplug is now excluded via assoc_mutex, we can -		 * simply try to bind.  It will succeed or fail depending -		 * on @pool's current state.  Try it and adjust -		 * %WORKER_UNBOUND accordingly. -		 */ -		if (worker_maybe_bind_and_lock(worker)) -			worker->flags &= ~WORKER_UNBOUND; -		else -			worker->flags |= WORKER_UNBOUND; - +		mutex_lock(&pool->manager_mutex);  		ret = true;  	} @@ -2056,8 +2070,8 @@ static bool manage_workers(struct worker *worker)  	ret |= maybe_destroy_workers(pool);  	ret |= maybe_create_worker(pool); -	pool->flags &= ~POOL_MANAGING_WORKERS; -	mutex_unlock(&pool->assoc_mutex); +	mutex_unlock(&pool->manager_mutex); +	mutex_unlock(&pool->manager_arb);  	return ret;  } @@ -2211,11 +2225,11 @@ static void process_scheduled_works(struct worker *worker)   * worker_thread - the worker thread function   * @__worker: self   * - * The worker thread function.  There are NR_CPU_WORKER_POOLS dynamic pools - * of these per each cpu.  These workers process all works regardless of - * their specific target workqueue.  The only exception is works which - * belong to workqueues with a rescuer which will be explained in - * rescuer_thread(). + * The worker thread function.  All workers belong to a worker_pool - + * either a per-cpu one or dynamic unbound one.  These workers process all + * work items regardless of their specific target workqueue.  The only + * exception is work items which belong to workqueues with a rescuer which + * will be explained in rescuer_thread().   */  static int worker_thread(void *__worker)  { @@ -2227,19 +2241,12 @@ static int worker_thread(void *__worker)  woke_up:  	spin_lock_irq(&pool->lock); -	/* we are off idle list if destruction or rebind is requested */ -	if (unlikely(list_empty(&worker->entry))) { +	/* am I supposed to die? */ +	if (unlikely(worker->flags & WORKER_DIE)) {  		spin_unlock_irq(&pool->lock); - -		/* if DIE is set, destruction is requested */ -		if (worker->flags & WORKER_DIE) { -			worker->task->flags &= ~PF_WQ_WORKER; -			return 0; -		} - -		/* otherwise, rebind */ -		idle_worker_rebind(worker); -		goto woke_up; +		WARN_ON_ONCE(!list_empty(&worker->entry)); +		worker->task->flags &= ~PF_WQ_WORKER; +		return 0;  	}  	worker_leave_idle(worker); @@ -2257,14 +2264,16 @@ recheck:  	 * preparing to process a work or actually processing it.  	 * Make sure nobody diddled with it while I was sleeping.  	 */ -	BUG_ON(!list_empty(&worker->scheduled)); +	WARN_ON_ONCE(!list_empty(&worker->scheduled));  	/* -	 * When control reaches this point, we're guaranteed to have -	 * at least one idle worker or that someone else has already -	 * assumed the manager role. +	 * Finish PREP stage.  We're guaranteed to have at least one idle +	 * worker or that someone else has already assumed the manager +	 * role.  This is where @worker starts participating in concurrency +	 * management if applicable and concurrency management is restored +	 * after being rebound.  See rebind_workers() for details.  	 
*/ -	worker_clr_flags(worker, WORKER_PREP); +	worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);  	do {  		struct work_struct *work = @@ -2306,7 +2315,7 @@ sleep:   * @__rescuer: self   *   * Workqueue rescuer thread function.  There's one rescuer for each - * workqueue which has WQ_RESCUER set. + * workqueue which has WQ_MEM_RECLAIM set.   *   * Regular work processing on a pool may block trying to create a new   * worker which uses GFP_KERNEL allocation which has slight chance of @@ -2325,8 +2334,6 @@ static int rescuer_thread(void *__rescuer)  	struct worker *rescuer = __rescuer;  	struct workqueue_struct *wq = rescuer->rescue_wq;  	struct list_head *scheduled = &rescuer->scheduled; -	bool is_unbound = wq->flags & WQ_UNBOUND; -	unsigned int cpu;  	set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2344,28 +2351,29 @@ repeat:  		return 0;  	} -	/* -	 * See whether any cpu is asking for help.  Unbounded -	 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. -	 */ -	for_each_mayday_cpu(cpu, wq->mayday_mask) { -		unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; -		struct pool_workqueue *pwq = get_pwq(tcpu, wq); +	/* see whether any pwq is asking for help */ +	spin_lock_irq(&wq_mayday_lock); + +	while (!list_empty(&wq->maydays)) { +		struct pool_workqueue *pwq = list_first_entry(&wq->maydays, +					struct pool_workqueue, mayday_node);  		struct worker_pool *pool = pwq->pool;  		struct work_struct *work, *n;  		__set_current_state(TASK_RUNNING); -		mayday_clear_cpu(cpu, wq->mayday_mask); +		list_del_init(&pwq->mayday_node); + +		spin_unlock_irq(&wq_mayday_lock);  		/* migrate to the target cpu if possible */ +		worker_maybe_bind_and_lock(pool);  		rescuer->pool = pool; -		worker_maybe_bind_and_lock(rescuer);  		/*  		 * Slurp in all works issued via this workqueue and  		 * process'em.  		 */ -		BUG_ON(!list_empty(&rescuer->scheduled)); +		WARN_ON_ONCE(!list_empty(&rescuer->scheduled));  		list_for_each_entry_safe(work, n, &pool->worklist, entry)  			if (get_work_pwq(work) == pwq)  				move_linked_works(work, scheduled, &n); @@ -2380,9 +2388,13 @@ repeat:  		if (keep_working(pool))  			wake_up_worker(pool); -		spin_unlock_irq(&pool->lock); +		rescuer->pool = NULL; +		spin_unlock(&pool->lock); +		spin_lock(&wq_mayday_lock);  	} +	spin_unlock_irq(&wq_mayday_lock); +  	/* rescuers should never participate in concurrency management */  	WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));  	schedule(); @@ -2486,7 +2498,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,   * advanced to @work_color.   *   * CONTEXT: - * mutex_lock(wq->flush_mutex). + * mutex_lock(wq->mutex).   *   * RETURNS:   * %true if @flush_color >= 0 and there's something to flush.  
%false @@ -2496,21 +2508,20 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,  				      int flush_color, int work_color)  {  	bool wait = false; -	unsigned int cpu; +	struct pool_workqueue *pwq;  	if (flush_color >= 0) { -		BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); +		WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));  		atomic_set(&wq->nr_pwqs_to_flush, 1);  	} -	for_each_pwq_cpu(cpu, wq) { -		struct pool_workqueue *pwq = get_pwq(cpu, wq); +	for_each_pwq(pwq, wq) {  		struct worker_pool *pool = pwq->pool;  		spin_lock_irq(&pool->lock);  		if (flush_color >= 0) { -			BUG_ON(pwq->flush_color != -1); +			WARN_ON_ONCE(pwq->flush_color != -1);  			if (pwq->nr_in_flight[flush_color]) {  				pwq->flush_color = flush_color; @@ -2520,7 +2531,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,  		}  		if (work_color >= 0) { -			BUG_ON(work_color != work_next_color(pwq->work_color)); +			WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));  			pwq->work_color = work_color;  		} @@ -2537,11 +2548,8 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,   * flush_workqueue - ensure that any scheduled work has run to completion.   * @wq: workqueue to flush   * - * Forces execution of the workqueue and blocks until its completion. - * This is typically used in driver shutdown handlers. - * - * We sleep until all works which were queued on entry have been handled, - * but we are not livelocked by new incoming ones. + * This function sleeps until all work items which were queued on entry + * have finished execution, but it is not livelocked by new incoming ones.   */  void flush_workqueue(struct workqueue_struct *wq)  { @@ -2555,7 +2563,7 @@ void flush_workqueue(struct workqueue_struct *wq)  	lock_map_acquire(&wq->lockdep_map);  	lock_map_release(&wq->lockdep_map); -	mutex_lock(&wq->flush_mutex); +	mutex_lock(&wq->mutex);  	/*  	 * Start-to-wait phase @@ -2568,13 +2576,13 @@ void flush_workqueue(struct workqueue_struct *wq)  		 * becomes our flush_color and work_color is advanced  		 * by one.  		 
*/ -		BUG_ON(!list_empty(&wq->flusher_overflow)); +		WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));  		this_flusher.flush_color = wq->work_color;  		wq->work_color = next_color;  		if (!wq->first_flusher) {  			/* no flush in progress, become the first flusher */ -			BUG_ON(wq->flush_color != this_flusher.flush_color); +			WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);  			wq->first_flusher = &this_flusher; @@ -2587,7 +2595,7 @@ void flush_workqueue(struct workqueue_struct *wq)  			}  		} else {  			/* wait in queue */ -			BUG_ON(wq->flush_color == this_flusher.flush_color); +			WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);  			list_add_tail(&this_flusher.list, &wq->flusher_queue);  			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);  		} @@ -2600,7 +2608,7 @@ void flush_workqueue(struct workqueue_struct *wq)  		list_add_tail(&this_flusher.list, &wq->flusher_overflow);  	} -	mutex_unlock(&wq->flush_mutex); +	mutex_unlock(&wq->mutex);  	wait_for_completion(&this_flusher.done); @@ -2613,7 +2621,7 @@ void flush_workqueue(struct workqueue_struct *wq)  	if (wq->first_flusher != &this_flusher)  		return; -	mutex_lock(&wq->flush_mutex); +	mutex_lock(&wq->mutex);  	/* we might have raced, check again with mutex held */  	if (wq->first_flusher != &this_flusher) @@ -2621,8 +2629,8 @@ void flush_workqueue(struct workqueue_struct *wq)  	wq->first_flusher = NULL; -	BUG_ON(!list_empty(&this_flusher.list)); -	BUG_ON(wq->flush_color != this_flusher.flush_color); +	WARN_ON_ONCE(!list_empty(&this_flusher.list)); +	WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);  	while (true) {  		struct wq_flusher *next, *tmp; @@ -2635,8 +2643,8 @@ void flush_workqueue(struct workqueue_struct *wq)  			complete(&next->done);  		} -		BUG_ON(!list_empty(&wq->flusher_overflow) && -		       wq->flush_color != work_next_color(wq->work_color)); +		WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && +			     wq->flush_color != work_next_color(wq->work_color));  		/* this flush_color is finished, advance by one */  		wq->flush_color = work_next_color(wq->flush_color); @@ -2660,7 +2668,7 @@ void flush_workqueue(struct workqueue_struct *wq)  		}  		if (list_empty(&wq->flusher_queue)) { -			BUG_ON(wq->flush_color != wq->work_color); +			WARN_ON_ONCE(wq->flush_color != wq->work_color);  			break;  		} @@ -2668,8 +2676,8 @@ void flush_workqueue(struct workqueue_struct *wq)  		 * Need to flush more colors.  Make the next flusher  		 * the new first flusher and arm pwqs.  		 */ -		BUG_ON(wq->flush_color == wq->work_color); -		BUG_ON(wq->flush_color != next->flush_color); +		WARN_ON_ONCE(wq->flush_color == wq->work_color); +		WARN_ON_ONCE(wq->flush_color != next->flush_color);  		list_del_init(&next->list);  		wq->first_flusher = next; @@ -2685,7 +2693,7 @@ void flush_workqueue(struct workqueue_struct *wq)  	}  out_unlock: -	mutex_unlock(&wq->flush_mutex); +	mutex_unlock(&wq->mutex);  }  EXPORT_SYMBOL_GPL(flush_workqueue); @@ -2703,22 +2711,23 @@ EXPORT_SYMBOL_GPL(flush_workqueue);  void drain_workqueue(struct workqueue_struct *wq)  {  	unsigned int flush_cnt = 0; -	unsigned int cpu; +	struct pool_workqueue *pwq;  	/*  	 * __queue_work() needs to test whether there are drainers, is much  	 * hotter than drain_workqueue() and already looks at @wq->flags. -	 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. +	 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.  	 
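Per the updated kerneldoc, flush_workqueue() only waits for work items that were queued before the call, while drain_workqueue() (and destroy_workqueue(), which drains internally) keeps flushing until the workqueue is empty. A minimal usage sketch with hypothetical names (mydrv_wq, mydrv_work) that are not taken from this patch:

#include <linux/workqueue.h>

static struct workqueue_struct *mydrv_wq;	/* hypothetical driver workqueue */

static void mydrv_work_fn(struct work_struct *work)
{
	/* ... deferred processing ... */
}
static DECLARE_WORK(mydrv_work, mydrv_work_fn);

static int mydrv_init(void)
{
	mydrv_wq = alloc_workqueue("mydrv", 0, 0);
	if (!mydrv_wq)
		return -ENOMEM;
	queue_work(mydrv_wq, &mydrv_work);
	return 0;
}

static void mydrv_exit(void)
{
	/* waits for everything queued so far; self-requeueing items cannot livelock this */
	flush_workqueue(mydrv_wq);
	/* drains (flushes repeatedly until empty) and then frees the workqueue */
	destroy_workqueue(mydrv_wq);
}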
*/ -	spin_lock(&workqueue_lock); +	mutex_lock(&wq->mutex);  	if (!wq->nr_drainers++) -		wq->flags |= WQ_DRAINING; -	spin_unlock(&workqueue_lock); +		wq->flags |= __WQ_DRAINING; +	mutex_unlock(&wq->mutex);  reflush:  	flush_workqueue(wq); -	for_each_pwq_cpu(cpu, wq) { -		struct pool_workqueue *pwq = get_pwq(cpu, wq); +	mutex_lock(&wq->mutex); + +	for_each_pwq(pwq, wq) {  		bool drained;  		spin_lock_irq(&pwq->pool->lock); @@ -2730,15 +2739,16 @@ reflush:  		if (++flush_cnt == 10 ||  		    (flush_cnt % 100 == 0 && flush_cnt <= 1000)) -			pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", +			pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",  				wq->name, flush_cnt); + +		mutex_unlock(&wq->mutex);  		goto reflush;  	} -	spin_lock(&workqueue_lock);  	if (!--wq->nr_drainers) -		wq->flags &= ~WQ_DRAINING; -	spin_unlock(&workqueue_lock); +		wq->flags &= ~__WQ_DRAINING; +	mutex_unlock(&wq->mutex);  }  EXPORT_SYMBOL_GPL(drain_workqueue); @@ -2749,11 +2759,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)  	struct pool_workqueue *pwq;  	might_sleep(); + +	local_irq_disable();  	pool = get_work_pool(work); -	if (!pool) +	if (!pool) { +		local_irq_enable();  		return false; +	} -	spin_lock_irq(&pool->lock); +	spin_lock(&pool->lock);  	/* see the comment in try_to_grab_pending() with the same code */  	pwq = get_work_pwq(work);  	if (pwq) { @@ -2775,7 +2789,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)  	 * flusher is not running on the same workqueue by verifying write  	 * access.  	 */ -	if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) +	if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)  		lock_map_acquire(&pwq->wq->lockdep_map);  	else  		lock_map_acquire_read(&pwq->wq->lockdep_map); @@ -2932,66 +2946,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)  EXPORT_SYMBOL(cancel_delayed_work_sync);  /** - * schedule_work_on - put work task on a specific cpu - * @cpu: cpu to put the work task on - * @work: job to be done - * - * This puts a job on a specific cpu - */ -bool schedule_work_on(int cpu, struct work_struct *work) -{ -	return queue_work_on(cpu, system_wq, work); -} -EXPORT_SYMBOL(schedule_work_on); - -/** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns %false if @work was already on the kernel-global workqueue and - * %true otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. - */ -bool schedule_work(struct work_struct *work) -{ -	return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay - * @cpu: cpu to use - * @dwork: job to be done - * @delay: number of jiffies to wait - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue on the specified CPU. - */ -bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, -			      unsigned long delay) -{ -	return queue_delayed_work_on(cpu, system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work_on); - -/** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. 
- */ -bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) -{ -	return queue_delayed_work(system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work); - -/**   * schedule_on_each_cpu - execute a function synchronously on each online CPU   * @func: the function to call   * @@ -3084,51 +3038,1025 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)  }  EXPORT_SYMBOL_GPL(execute_in_process_context); -int keventd_up(void) +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set is visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the + * following attributes. + * + *  per_cpu	RO bool	: whether the workqueue is per-cpu or unbound + *  max_active	RW int	: maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes. + * + *  id		RO int	: the associated pool ID + *  nice	RW int	: nice value of the workers + *  cpumask	RW mask	: bitmask of allowed CPUs for the workers + */ +struct wq_device { +	struct workqueue_struct		*wq; +	struct device			dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ +	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + +	return wq_dev->wq; +} + +static ssize_t wq_per_cpu_show(struct device *dev, +			       struct device_attribute *attr, char *buf) +{ +	struct workqueue_struct *wq = dev_to_wq(dev); + +	return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} + +static ssize_t wq_max_active_show(struct device *dev, +				  struct device_attribute *attr, char *buf)  { -	return system_wq != NULL; +	struct workqueue_struct *wq = dev_to_wq(dev); + +	return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t wq_max_active_store(struct device *dev, +				   struct device_attribute *attr, +				   const char *buf, size_t count) +{ +	struct workqueue_struct *wq = dev_to_wq(dev); +	int val; + +	if (sscanf(buf, "%d", &val) != 1 || val <= 0) +		return -EINVAL; + +	workqueue_set_max_active(wq, val); +	return count; +} + +static struct device_attribute wq_sysfs_attrs[] = { +	__ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), +	__ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), +	__ATTR_NULL, +}; + +static ssize_t wq_pool_ids_show(struct device *dev, +				struct device_attribute *attr, char *buf) +{ +	struct workqueue_struct *wq = dev_to_wq(dev); +	const char *delim = ""; +	int node, written = 0; + +	rcu_read_lock_sched(); +	for_each_node(node) { +		written += scnprintf(buf + written, PAGE_SIZE - written, +				     "%s%d:%d", delim, node, +				     unbound_pwq_by_node(wq, node)->pool->id); +		delim = " "; +	} +	written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); +	rcu_read_unlock_sched(); + +	return written;  } -static int alloc_pwqs(struct workqueue_struct *wq) +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, +			    char *buf)  { +	struct workqueue_struct *wq = dev_to_wq(dev); +	int written; + +	mutex_lock(&wq->mutex); +	written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); +	mutex_unlock(&wq->mutex); + +	return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ +	struct workqueue_attrs *attrs; + +	attrs = alloc_workqueue_attrs(GFP_KERNEL); +	if (!attrs) +		return NULL; + +	mutex_lock(&wq->mutex); +	copy_workqueue_attrs(attrs, wq->unbound_attrs); +	mutex_unlock(&wq->mutex); +	return 
attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, +			     const char *buf, size_t count) +{ +	struct workqueue_struct *wq = dev_to_wq(dev); +	struct workqueue_attrs *attrs; +	int ret; + +	attrs = wq_sysfs_prep_attrs(wq); +	if (!attrs) +		return -ENOMEM; + +	if (sscanf(buf, "%d", &attrs->nice) == 1 && +	    attrs->nice >= -20 && attrs->nice <= 19) +		ret = apply_workqueue_attrs(wq, attrs); +	else +		ret = -EINVAL; + +	free_workqueue_attrs(attrs); +	return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, +			       struct device_attribute *attr, char *buf) +{ +	struct workqueue_struct *wq = dev_to_wq(dev); +	int written; + +	mutex_lock(&wq->mutex); +	written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask); +	mutex_unlock(&wq->mutex); + +	written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); +	return written; +} + +static ssize_t wq_cpumask_store(struct device *dev, +				struct device_attribute *attr, +				const char *buf, size_t count) +{ +	struct workqueue_struct *wq = dev_to_wq(dev); +	struct workqueue_attrs *attrs; +	int ret; + +	attrs = wq_sysfs_prep_attrs(wq); +	if (!attrs) +		return -ENOMEM; + +	ret = cpumask_parse(buf, attrs->cpumask); +	if (!ret) +		ret = apply_workqueue_attrs(wq, attrs); + +	free_workqueue_attrs(attrs); +	return ret ?: count; +} + +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, +			    char *buf) +{ +	struct workqueue_struct *wq = dev_to_wq(dev); +	int written; + +	mutex_lock(&wq->mutex); +	written = scnprintf(buf, PAGE_SIZE, "%d\n", +			    !wq->unbound_attrs->no_numa); +	mutex_unlock(&wq->mutex); + +	return written; +} + +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, +			     const char *buf, size_t count) +{ +	struct workqueue_struct *wq = dev_to_wq(dev); +	struct workqueue_attrs *attrs; +	int v, ret; + +	attrs = wq_sysfs_prep_attrs(wq); +	if (!attrs) +		return -ENOMEM; + +	ret = -EINVAL; +	if (sscanf(buf, "%d", &v) == 1) { +		attrs->no_numa = !v; +		ret = apply_workqueue_attrs(wq, attrs); +	} + +	free_workqueue_attrs(attrs); +	return ret ?: count; +} + +static struct device_attribute wq_sysfs_unbound_attrs[] = { +	__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), +	__ATTR(nice, 0644, wq_nice_show, wq_nice_store), +	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), +	__ATTR(numa, 0644, wq_numa_show, wq_numa_store), +	__ATTR_NULL, +}; + +static struct bus_type wq_subsys = { +	.name				= "workqueue", +	.dev_attrs			= wq_sysfs_attrs, +}; + +static int __init wq_sysfs_init(void) +{ +	return subsys_virtual_register(&wq_subsys, NULL); +} +core_initcall(wq_sysfs_init); + +static void wq_device_release(struct device *dev) +{ +	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + +	kfree(wq_dev); +} + +/** + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register + * + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. + * + * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Returns 0 on success, -errno on failure. 
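For the sysfs interface described above, the usual route is to pass WQ_SYSFS at allocation time, in which case workqueue_sysfs_register() is called automatically. A minimal sketch, assuming a hypothetical workqueue named "mydrv_unbound" that is not part of this patch:

#include <linux/workqueue.h>

static struct workqueue_struct *mydrv_unbound_wq;

static int mydrv_create_wq(void)
{
	/* unbound and visible as /sys/bus/workqueue/devices/mydrv_unbound */
	mydrv_unbound_wq = alloc_workqueue("mydrv_unbound",
					   WQ_UNBOUND | WQ_SYSFS, 0);
	if (!mydrv_unbound_wq)
		return -ENOMEM;

	/*
	 * Userland can now read per_cpu and max_active and, because the
	 * workqueue is unbound, also adjust nice, cpumask and numa as
	 * described above.
	 */
	return 0;
}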
+ */ +int workqueue_sysfs_register(struct workqueue_struct *wq) +{ +	struct wq_device *wq_dev; +	int ret; +  	/* -	 * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. -	 * Make sure that the alignment isn't lower than that of -	 * unsigned long long. +	 * Adjusting max_active or creating new pwqs by applyting +	 * attributes breaks ordering guarantee.  Disallow exposing ordered +	 * workqueues.  	 */ -	const size_t size = sizeof(struct pool_workqueue); -	const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, -				   __alignof__(unsigned long long)); +	if (WARN_ON(wq->flags & __WQ_ORDERED)) +		return -EINVAL; -	if (!(wq->flags & WQ_UNBOUND)) -		wq->pool_wq.pcpu = __alloc_percpu(size, align); -	else { -		void *ptr; +	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); +	if (!wq_dev) +		return -ENOMEM; + +	wq_dev->wq = wq; +	wq_dev->dev.bus = &wq_subsys; +	wq_dev->dev.init_name = wq->name; +	wq_dev->dev.release = wq_device_release; + +	/* +	 * unbound_attrs are created separately.  Suppress uevent until +	 * everything is ready. +	 */ +	dev_set_uevent_suppress(&wq_dev->dev, true); + +	ret = device_register(&wq_dev->dev); +	if (ret) { +		kfree(wq_dev); +		wq->wq_dev = NULL; +		return ret; +	} + +	if (wq->flags & WQ_UNBOUND) { +		struct device_attribute *attr; + +		for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { +			ret = device_create_file(&wq_dev->dev, attr); +			if (ret) { +				device_unregister(&wq_dev->dev); +				wq->wq_dev = NULL; +				return ret; +			} +		} +	} + +	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); +	return 0; +} + +/** + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister + * + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. + */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) +{ +	struct wq_device *wq_dev = wq->wq_dev; + +	if (!wq->wq_dev) +		return; + +	wq->wq_dev = NULL; +	device_unregister(&wq_dev->dev); +} +#else	/* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ } +#endif	/* CONFIG_SYSFS */ + +/** + * free_workqueue_attrs - free a workqueue_attrs + * @attrs: workqueue_attrs to free + * + * Undo alloc_workqueue_attrs(). + */ +void free_workqueue_attrs(struct workqueue_attrs *attrs) +{ +	if (attrs) { +		free_cpumask_var(attrs->cpumask); +		kfree(attrs); +	} +} + +/** + * alloc_workqueue_attrs - allocate a workqueue_attrs + * @gfp_mask: allocation mask to use + * + * Allocate a new workqueue_attrs, initialize with default settings and + * return it.  Returns NULL on failure. 
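alloc_workqueue_attrs(), apply_workqueue_attrs() and free_workqueue_attrs() form the usual call sequence for retuning an unbound workqueue. A rough sketch under the assumption that the caller owns a WQ_UNBOUND workqueue; the helper name and the nice/cpumask values are illustrative only:

#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* hypothetical tuning helper: run @wq's workers on CPUs 0-1 at nice -5 */
static int mydrv_tune_unbound_wq(struct workqueue_struct *wq)
{
	struct workqueue_attrs *attrs;
	int ret;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		return -ENOMEM;

	attrs->nice = -5;
	cpumask_clear(attrs->cpumask);
	cpumask_set_cpu(0, attrs->cpumask);
	cpumask_set_cpu(1, attrs->cpumask);

	/* only valid for WQ_UNBOUND workqueues, see apply_workqueue_attrs() below */
	ret = apply_workqueue_attrs(wq, attrs);

	free_workqueue_attrs(attrs);
	return ret;
}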
+ */ +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +{ +	struct workqueue_attrs *attrs; + +	attrs = kzalloc(sizeof(*attrs), gfp_mask); +	if (!attrs) +		goto fail; +	if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) +		goto fail; + +	cpumask_copy(attrs->cpumask, cpu_possible_mask); +	return attrs; +fail: +	free_workqueue_attrs(attrs); +	return NULL; +} + +static void copy_workqueue_attrs(struct workqueue_attrs *to, +				 const struct workqueue_attrs *from) +{ +	to->nice = from->nice; +	cpumask_copy(to->cpumask, from->cpumask); +} + +/* hash value of the content of @attr */ +static u32 wqattrs_hash(const struct workqueue_attrs *attrs) +{ +	u32 hash = 0; + +	hash = jhash_1word(attrs->nice, hash); +	hash = jhash(cpumask_bits(attrs->cpumask), +		     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); +	return hash; +} + +/* content equality test */ +static bool wqattrs_equal(const struct workqueue_attrs *a, +			  const struct workqueue_attrs *b) +{ +	if (a->nice != b->nice) +		return false; +	if (!cpumask_equal(a->cpumask, b->cpumask)) +		return false; +	return true; +} + +/** + * init_worker_pool - initialize a newly zalloc'd worker_pool + * @pool: worker_pool to initialize + * + * Initiailize a newly zalloc'd @pool.  It also allocates @pool->attrs. + * Returns 0 on success, -errno on failure.  Even on failure, all fields + * inside @pool proper are initialized and put_unbound_pool() can be called + * on @pool safely to release it. + */ +static int init_worker_pool(struct worker_pool *pool) +{ +	spin_lock_init(&pool->lock); +	pool->id = -1; +	pool->cpu = -1; +	pool->node = NUMA_NO_NODE; +	pool->flags |= POOL_DISASSOCIATED; +	INIT_LIST_HEAD(&pool->worklist); +	INIT_LIST_HEAD(&pool->idle_list); +	hash_init(pool->busy_hash); + +	init_timer_deferrable(&pool->idle_timer); +	pool->idle_timer.function = idle_worker_timeout; +	pool->idle_timer.data = (unsigned long)pool; + +	setup_timer(&pool->mayday_timer, pool_mayday_timeout, +		    (unsigned long)pool); + +	mutex_init(&pool->manager_arb); +	mutex_init(&pool->manager_mutex); +	idr_init(&pool->worker_idr); + +	INIT_HLIST_NODE(&pool->hash_node); +	pool->refcnt = 1; + +	/* shouldn't fail above this point */ +	pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); +	if (!pool->attrs) +		return -ENOMEM; +	return 0; +} + +static void rcu_free_pool(struct rcu_head *rcu) +{ +	struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); + +	idr_destroy(&pool->worker_idr); +	free_workqueue_attrs(pool->attrs); +	kfree(pool); +} + +/** + * put_unbound_pool - put a worker_pool + * @pool: worker_pool to put + * + * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU + * safe manner.  get_unbound_pool() calls this function on its failure path + * and this function should be able to release pools which went through, + * successfully or not, init_worker_pool(). + * + * Should be called with wq_pool_mutex held. + */ +static void put_unbound_pool(struct worker_pool *pool) +{ +	struct worker *worker; + +	lockdep_assert_held(&wq_pool_mutex); + +	if (--pool->refcnt) +		return; + +	/* sanity checks */ +	if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || +	    WARN_ON(!list_empty(&pool->worklist))) +		return; + +	/* release id and unhash */ +	if (pool->id >= 0) +		idr_remove(&worker_pool_idr, pool->id); +	hash_del(&pool->hash_node); + +	/* +	 * Become the manager and destroy all workers.  Grabbing +	 * manager_arb prevents @pool's workers from blocking on +	 * manager_mutex. 
+	 */ +	mutex_lock(&pool->manager_arb); +	mutex_lock(&pool->manager_mutex); +	spin_lock_irq(&pool->lock); + +	while ((worker = first_worker(pool))) +		destroy_worker(worker); +	WARN_ON(pool->nr_workers || pool->nr_idle); + +	spin_unlock_irq(&pool->lock); +	mutex_unlock(&pool->manager_mutex); +	mutex_unlock(&pool->manager_arb); + +	/* shut down the timers */ +	del_timer_sync(&pool->idle_timer); +	del_timer_sync(&pool->mayday_timer); + +	/* sched-RCU protected to allow dereferences from get_work_pool() */ +	call_rcu_sched(&pool->rcu, rcu_free_pool); +} + +/** + * get_unbound_pool - get a worker_pool with the specified attributes + * @attrs: the attributes of the worker_pool to get + * + * Obtain a worker_pool which has the same attributes as @attrs, bump the + * reference count and return it.  If there already is a matching + * worker_pool, it will be used; otherwise, this function attempts to + * create a new one.  On failure, returns NULL. + * + * Should be called with wq_pool_mutex held. + */ +static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) +{ +	u32 hash = wqattrs_hash(attrs); +	struct worker_pool *pool; +	int node; + +	lockdep_assert_held(&wq_pool_mutex); + +	/* do we already have a matching pool? */ +	hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { +		if (wqattrs_equal(pool->attrs, attrs)) { +			pool->refcnt++; +			goto out_unlock; +		} +	} + +	/* nope, create a new one */ +	pool = kzalloc(sizeof(*pool), GFP_KERNEL); +	if (!pool || init_worker_pool(pool) < 0) +		goto fail; + +	if (workqueue_freezing) +		pool->flags |= POOL_FREEZING; + +	lockdep_set_subclass(&pool->lock, 1);	/* see put_pwq() */ +	copy_workqueue_attrs(pool->attrs, attrs); + +	/* if cpumask is contained inside a NUMA node, we belong to that node */ +	if (wq_numa_enabled) { +		for_each_node(node) { +			if (cpumask_subset(pool->attrs->cpumask, +					   wq_numa_possible_cpumask[node])) { +				pool->node = node; +				break; +			} +		} +	} + +	if (worker_pool_assign_id(pool) < 0) +		goto fail; + +	/* create and start the initial worker */ +	if (create_and_start_worker(pool) < 0) +		goto fail; + +	/* install */ +	hash_add(unbound_pool_hash, &pool->hash_node, hash); +out_unlock: +	return pool; +fail: +	if (pool) +		put_unbound_pool(pool); +	return NULL; +} + +static void rcu_free_pwq(struct rcu_head *rcu) +{ +	kmem_cache_free(pwq_cache, +			container_of(rcu, struct pool_workqueue, rcu)); +} + +/* + * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt + * and needs to be destroyed. + */ +static void pwq_unbound_release_workfn(struct work_struct *work) +{ +	struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, +						  unbound_release_work); +	struct workqueue_struct *wq = pwq->wq; +	struct worker_pool *pool = pwq->pool; +	bool is_last; + +	if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) +		return; + +	/* +	 * Unlink @pwq.  Synchronization against wq->mutex isn't strictly +	 * necessary on release but do it anyway.  It's easier to verify +	 * and consistent with the linking path. +	 */ +	mutex_lock(&wq->mutex); +	list_del_rcu(&pwq->pwqs_node); +	is_last = list_empty(&wq->pwqs); +	mutex_unlock(&wq->mutex); + +	mutex_lock(&wq_pool_mutex); +	put_unbound_pool(pool); +	mutex_unlock(&wq_pool_mutex); + +	call_rcu_sched(&pwq->rcu, rcu_free_pwq); + +	/* +	 * If we're the last pwq going away, @wq is already dead and no one +	 * is gonna access it anymore.  Free it. 
+	 */ +	if (is_last) { +		free_workqueue_attrs(wq->unbound_attrs); +		kfree(wq); +	} +} + +/** + * pwq_adjust_max_active - update a pwq's max_active to the current setting + * @pwq: target pool_workqueue + * + * If @pwq isn't freezing, set @pwq->max_active to the associated + * workqueue's saved_max_active and activate delayed work items + * accordingly.  If @pwq is freezing, clear @pwq->max_active to zero. + */ +static void pwq_adjust_max_active(struct pool_workqueue *pwq) +{ +	struct workqueue_struct *wq = pwq->wq; +	bool freezable = wq->flags & WQ_FREEZABLE; + +	/* for @wq->saved_max_active */ +	lockdep_assert_held(&wq->mutex); + +	/* fast exit for non-freezable wqs */ +	if (!freezable && pwq->max_active == wq->saved_max_active) +		return; + +	spin_lock_irq(&pwq->pool->lock); + +	if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { +		pwq->max_active = wq->saved_max_active; + +		while (!list_empty(&pwq->delayed_works) && +		       pwq->nr_active < pwq->max_active) +			pwq_activate_first_delayed(pwq);  		/* -		 * Allocate enough room to align pwq and put an extra -		 * pointer at the end pointing back to the originally -		 * allocated pointer which will be used for free. +		 * Need to kick a worker after thawed or an unbound wq's +		 * max_active is bumped.  It's a slow path.  Do it always.  		 */ -		ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); -		if (ptr) { -			wq->pool_wq.single = PTR_ALIGN(ptr, align); -			*(void **)(wq->pool_wq.single + 1) = ptr; +		wake_up_worker(pwq->pool); +	} else { +		pwq->max_active = 0; +	} + +	spin_unlock_irq(&pwq->pool->lock); +} + +/* initialize newly alloced @pwq which is associated with @wq and @pool */ +static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, +		     struct worker_pool *pool) +{ +	BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); + +	memset(pwq, 0, sizeof(*pwq)); + +	pwq->pool = pool; +	pwq->wq = wq; +	pwq->flush_color = -1; +	pwq->refcnt = 1; +	INIT_LIST_HEAD(&pwq->delayed_works); +	INIT_LIST_HEAD(&pwq->pwqs_node); +	INIT_LIST_HEAD(&pwq->mayday_node); +	INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); +} + +/* sync @pwq with the current state of its associated wq and link it */ +static void link_pwq(struct pool_workqueue *pwq) +{ +	struct workqueue_struct *wq = pwq->wq; + +	lockdep_assert_held(&wq->mutex); + +	/* may be called multiple times, ignore if already linked */ +	if (!list_empty(&pwq->pwqs_node)) +		return; + +	/* +	 * Set the matching work_color.  This is synchronized with +	 * wq->mutex to avoid confusing flush_workqueue(). 
+	 */ +	pwq->work_color = wq->work_color; + +	/* sync max_active to the current setting */ +	pwq_adjust_max_active(pwq); + +	/* link in @pwq */ +	list_add_rcu(&pwq->pwqs_node, &wq->pwqs); +} + +/* obtain a pool matching @attr and create a pwq associating the pool and @wq */ +static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, +					const struct workqueue_attrs *attrs) +{ +	struct worker_pool *pool; +	struct pool_workqueue *pwq; + +	lockdep_assert_held(&wq_pool_mutex); + +	pool = get_unbound_pool(attrs); +	if (!pool) +		return NULL; + +	pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); +	if (!pwq) { +		put_unbound_pool(pool); +		return NULL; +	} + +	init_pwq(pwq, wq, pool); +	return pwq; +} + +/* undo alloc_unbound_pwq(), used only in the error path */ +static void free_unbound_pwq(struct pool_workqueue *pwq) +{ +	lockdep_assert_held(&wq_pool_mutex); + +	if (pwq) { +		put_unbound_pool(pwq->pool); +		kmem_cache_free(pwq_cache, pwq); +	} +} + +/** + * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node + * @attrs: the wq_attrs of interest + * @node: the target NUMA node + * @cpu_going_down: if >= 0, the CPU to consider as offline + * @cpumask: outarg, the resulting cpumask + * + * Calculate the cpumask a workqueue with @attrs should use on @node.  If + * @cpu_going_down is >= 0, that cpu is considered offline during + * calculation.  The result is stored in @cpumask.  This function returns + * %true if the resulting @cpumask is different from @attrs->cpumask, + * %false if equal. + * + * If NUMA affinity is not enabled, @attrs->cpumask is always used.  If + * enabled and @node has online CPUs requested by @attrs, the returned + * cpumask is the intersection of the possible CPUs of @node and + * @attrs->cpumask. + * + * The caller is responsible for ensuring that the cpumask of @node stays + * stable. + */ +static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, +				 int cpu_going_down, cpumask_t *cpumask) +{ +	if (!wq_numa_enabled || attrs->no_numa) +		goto use_dfl; + +	/* does @node have any online CPUs @attrs wants? */ +	cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); +	if (cpu_going_down >= 0) +		cpumask_clear_cpu(cpu_going_down, cpumask); + +	if (cpumask_empty(cpumask)) +		goto use_dfl; + +	/* yeap, return possible CPUs in @node that @attrs wants */ +	cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); +	return !cpumask_equal(cpumask, attrs->cpumask); + +use_dfl: +	cpumask_copy(cpumask, attrs->cpumask); +	return false; +} + +/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ +static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, +						   int node, +						   struct pool_workqueue *pwq) +{ +	struct pool_workqueue *old_pwq; + +	lockdep_assert_held(&wq->mutex); + +	/* link_pwq() can handle duplicate calls */ +	link_pwq(pwq); + +	old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); +	rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); +	return old_pwq; +} + +/** + * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue + * @wq: the target workqueue + * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() + * + * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA + * machines, this function maps a separate pwq to each NUMA node with + * possibles CPUs in @attrs->cpumask so that work items are affine to the + * NUMA node it was issued on.  
Older pwqs are released as in-flight work + * items finish.  Note that a work item which repeatedly requeues itself + * back-to-back will stay on its current pwq. + * + * Performs GFP_KERNEL allocations.  Returns 0 on success and -errno on + * failure. + */ +int apply_workqueue_attrs(struct workqueue_struct *wq, +			  const struct workqueue_attrs *attrs) +{ +	struct workqueue_attrs *new_attrs, *tmp_attrs; +	struct pool_workqueue **pwq_tbl, *dfl_pwq; +	int node, ret; + +	/* only unbound workqueues can change attributes */ +	if (WARN_ON(!(wq->flags & WQ_UNBOUND))) +		return -EINVAL; + +	/* creating multiple pwqs breaks ordering guarantee */ +	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) +		return -EINVAL; + +	pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); +	new_attrs = alloc_workqueue_attrs(GFP_KERNEL); +	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); +	if (!pwq_tbl || !new_attrs || !tmp_attrs) +		goto enomem; + +	/* make a copy of @attrs and sanitize it */ +	copy_workqueue_attrs(new_attrs, attrs); +	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + +	/* +	 * We may create multiple pwqs with differing cpumasks.  Make a +	 * copy of @new_attrs which will be modified and used to obtain +	 * pools. +	 */ +	copy_workqueue_attrs(tmp_attrs, new_attrs); + +	/* +	 * CPUs should stay stable across pwq creations and installations. +	 * Pin CPUs, determine the target cpumask for each node and create +	 * pwqs accordingly. +	 */ +	get_online_cpus(); + +	mutex_lock(&wq_pool_mutex); + +	/* +	 * If something goes wrong during CPU up/down, we'll fall back to +	 * the default pwq covering whole @attrs->cpumask.  Always create +	 * it even if we don't use it immediately. +	 */ +	dfl_pwq = alloc_unbound_pwq(wq, new_attrs); +	if (!dfl_pwq) +		goto enomem_pwq; + +	for_each_node(node) { +		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) { +			pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); +			if (!pwq_tbl[node]) +				goto enomem_pwq; +		} else { +			dfl_pwq->refcnt++; +			pwq_tbl[node] = dfl_pwq;  		}  	} -	/* just in case, make sure it's actually aligned */ -	BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); -	return wq->pool_wq.v ? 
0 : -ENOMEM; +	mutex_unlock(&wq_pool_mutex); + +	/* all pwqs have been created successfully, let's install'em */ +	mutex_lock(&wq->mutex); + +	copy_workqueue_attrs(wq->unbound_attrs, new_attrs); + +	/* save the previous pwq and install the new one */ +	for_each_node(node) +		pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]); + +	/* @dfl_pwq might not have been used, ensure it's linked */ +	link_pwq(dfl_pwq); +	swap(wq->dfl_pwq, dfl_pwq); + +	mutex_unlock(&wq->mutex); + +	/* put the old pwqs */ +	for_each_node(node) +		put_pwq_unlocked(pwq_tbl[node]); +	put_pwq_unlocked(dfl_pwq); + +	put_online_cpus(); +	ret = 0; +	/* fall through */ +out_free: +	free_workqueue_attrs(tmp_attrs); +	free_workqueue_attrs(new_attrs); +	kfree(pwq_tbl); +	return ret; + +enomem_pwq: +	free_unbound_pwq(dfl_pwq); +	for_each_node(node) +		if (pwq_tbl && pwq_tbl[node] != dfl_pwq) +			free_unbound_pwq(pwq_tbl[node]); +	mutex_unlock(&wq_pool_mutex); +	put_online_cpus(); +enomem: +	ret = -ENOMEM; +	goto out_free;  } -static void free_pwqs(struct workqueue_struct *wq) +/** + * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug + * @wq: the target workqueue + * @cpu: the CPU coming up or going down + * @online: whether @cpu is coming up or going down + * + * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and + * %CPU_DOWN_FAILED.  @cpu is being hot[un]plugged, update NUMA affinity of + * @wq accordingly. + * + * If NUMA affinity can't be adjusted due to memory allocation failure, it + * falls back to @wq->dfl_pwq which may not be optimal but is always + * correct. + * + * Note that when the last allowed CPU of a NUMA node goes offline for a + * workqueue with a cpumask spanning multiple nodes, the workers which were + * already executing the work items for the workqueue will lose their CPU + * affinity and may execute on any CPU.  This is similar to how per-cpu + * workqueues behave on CPU_DOWN.  If a workqueue user wants strict + * affinity, it's the user's responsibility to flush the work item from + * CPU_DOWN_PREPARE. + */ +static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, +				   bool online)  { -	if (!(wq->flags & WQ_UNBOUND)) -		free_percpu(wq->pool_wq.pcpu); -	else if (wq->pool_wq.single) { -		/* the pointer to free is stored right after the pwq */ -		kfree(*(void **)(wq->pool_wq.single + 1)); +	int node = cpu_to_node(cpu); +	int cpu_off = online ? -1 : cpu; +	struct pool_workqueue *old_pwq = NULL, *pwq; +	struct workqueue_attrs *target_attrs; +	cpumask_t *cpumask; + +	lockdep_assert_held(&wq_pool_mutex); + +	if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND)) +		return; + +	/* +	 * We don't wanna alloc/free wq_attrs for each wq for each CPU. +	 * Let's use a preallocated one.  The following buf is protected by +	 * CPU hotplug exclusion. +	 */ +	target_attrs = wq_update_unbound_numa_attrs_buf; +	cpumask = target_attrs->cpumask; + +	mutex_lock(&wq->mutex); +	if (wq->unbound_attrs->no_numa) +		goto out_unlock; + +	copy_workqueue_attrs(target_attrs, wq->unbound_attrs); +	pwq = unbound_pwq_by_node(wq, node); + +	/* +	 * Let's determine what needs to be done.  If the target cpumask is +	 * different from wq's, we need to compare it to @pwq's and create +	 * a new one if they don't match.  If the target cpumask equals +	 * wq's, the default pwq should be used.  If @pwq is already the +	 * default one, nothing to do; otherwise, install the default one. 
+	 */ +	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { +		if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) +			goto out_unlock; +	} else { +		if (pwq == wq->dfl_pwq) +			goto out_unlock; +		else +			goto use_dfl_pwq; +	} + +	mutex_unlock(&wq->mutex); + +	/* create a new pwq */ +	pwq = alloc_unbound_pwq(wq, target_attrs); +	if (!pwq) { +		pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", +			   wq->name); +		goto out_unlock; +	} + +	/* +	 * Install the new pwq.  As this function is called only from CPU +	 * hotplug callbacks and applying a new attrs is wrapped with +	 * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed +	 * inbetween. +	 */ +	mutex_lock(&wq->mutex); +	old_pwq = numa_pwq_tbl_install(wq, node, pwq); +	goto out_unlock; + +use_dfl_pwq: +	spin_lock_irq(&wq->dfl_pwq->pool->lock); +	get_pwq(wq->dfl_pwq); +	spin_unlock_irq(&wq->dfl_pwq->pool->lock); +	old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); +out_unlock: +	mutex_unlock(&wq->mutex); +	put_pwq_unlocked(old_pwq); +} + +static int alloc_and_link_pwqs(struct workqueue_struct *wq) +{ +	bool highpri = wq->flags & WQ_HIGHPRI; +	int cpu; + +	if (!(wq->flags & WQ_UNBOUND)) { +		wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); +		if (!wq->cpu_pwqs) +			return -ENOMEM; + +		for_each_possible_cpu(cpu) { +			struct pool_workqueue *pwq = +				per_cpu_ptr(wq->cpu_pwqs, cpu); +			struct worker_pool *cpu_pools = +				per_cpu(cpu_worker_pools, cpu); + +			init_pwq(pwq, wq, &cpu_pools[highpri]); + +			mutex_lock(&wq->mutex); +			link_pwq(pwq); +			mutex_unlock(&wq->mutex); +		} +		return 0; +	} else { +		return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);  	}  } @@ -3150,30 +4078,28 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,  					       struct lock_class_key *key,  					       const char *lock_name, ...)  { -	va_list args, args1; +	size_t tbl_size = 0; +	va_list args;  	struct workqueue_struct *wq; -	unsigned int cpu; -	size_t namelen; +	struct pool_workqueue *pwq; -	/* determine namelen, allocate wq and format name */ -	va_start(args, lock_name); -	va_copy(args1, args); -	namelen = vsnprintf(NULL, 0, fmt, args) + 1; +	/* allocate wq and format name */ +	if (flags & WQ_UNBOUND) +		tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); -	wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); +	wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);  	if (!wq) -		goto err; +		return NULL; -	vsnprintf(wq->name, namelen, fmt, args1); -	va_end(args); -	va_end(args1); +	if (flags & WQ_UNBOUND) { +		wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); +		if (!wq->unbound_attrs) +			goto err_free_wq; +	} -	/* -	 * Workqueues which may be used during memory reclaim should -	 * have a rescuer to guarantee forward progress. 
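The rescuer rule restated here (and in the reworked __alloc_workqueue_key() below) is why reclaim-path users pass WQ_MEM_RECLAIM. A minimal sketch with a hypothetical I/O workqueue; the names are illustrative, not from this patch:

#include <linux/workqueue.h>

static struct workqueue_struct *mydrv_io_wq;	/* may be flushed during reclaim */

static int mydrv_create_io_wq(void)
{
	/*
	 * WQ_MEM_RECLAIM guarantees a rescuer thread, so forward progress
	 * does not depend on being able to create new workers while the
	 * system is under memory pressure.
	 */
	mydrv_io_wq = alloc_workqueue("mydrv_io", WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
	if (!mydrv_io_wq)
		return -ENOMEM;
	return 0;
}

A work function running on such a workqueue can call current_is_workqueue_rescuer(), added further down, to tell whether it is being executed by the rescuer.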
-	 */ -	if (flags & WQ_MEM_RECLAIM) -		flags |= WQ_RESCUER; +	va_start(args, lock_name); +	vsnprintf(wq->name, sizeof(wq->name), fmt, args); +	va_end(args);  	max_active = max_active ?: WQ_DFL_ACTIVE;  	max_active = wq_clamp_max_active(max_active, flags, wq->name); @@ -3181,71 +4107,70 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,  	/* init wq */  	wq->flags = flags;  	wq->saved_max_active = max_active; -	mutex_init(&wq->flush_mutex); +	mutex_init(&wq->mutex);  	atomic_set(&wq->nr_pwqs_to_flush, 0); +	INIT_LIST_HEAD(&wq->pwqs);  	INIT_LIST_HEAD(&wq->flusher_queue);  	INIT_LIST_HEAD(&wq->flusher_overflow); +	INIT_LIST_HEAD(&wq->maydays);  	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);  	INIT_LIST_HEAD(&wq->list); -	if (alloc_pwqs(wq) < 0) -		goto err; +	if (alloc_and_link_pwqs(wq) < 0) +		goto err_free_wq; -	for_each_pwq_cpu(cpu, wq) { -		struct pool_workqueue *pwq = get_pwq(cpu, wq); - -		BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); -		pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); -		pwq->wq = wq; -		pwq->flush_color = -1; -		pwq->max_active = max_active; -		INIT_LIST_HEAD(&pwq->delayed_works); -	} - -	if (flags & WQ_RESCUER) { +	/* +	 * Workqueues which may be used during memory reclaim should +	 * have a rescuer to guarantee forward progress. +	 */ +	if (flags & WQ_MEM_RECLAIM) {  		struct worker *rescuer; -		if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) -			goto err; - -		wq->rescuer = rescuer = alloc_worker(); +		rescuer = alloc_worker();  		if (!rescuer) -			goto err; +			goto err_destroy;  		rescuer->rescue_wq = wq;  		rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",  					       wq->name); -		if (IS_ERR(rescuer->task)) -			goto err; +		if (IS_ERR(rescuer->task)) { +			kfree(rescuer); +			goto err_destroy; +		} -		rescuer->task->flags |= PF_THREAD_BOUND; +		wq->rescuer = rescuer; +		rescuer->task->flags |= PF_NO_SETAFFINITY;  		wake_up_process(rescuer->task);  	} +	if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) +		goto err_destroy; +  	/* -	 * workqueue_lock protects global freeze state and workqueues -	 * list.  Grab it, set max_active accordingly and add the new -	 * workqueue to workqueues list. +	 * wq_pool_mutex protects global freeze state and workqueues list. +	 * Grab it, adjust max_active and add the new @wq to workqueues +	 * list.  	 
*/ -	spin_lock(&workqueue_lock); +	mutex_lock(&wq_pool_mutex); -	if (workqueue_freezing && wq->flags & WQ_FREEZABLE) -		for_each_pwq_cpu(cpu, wq) -			get_pwq(cpu, wq)->max_active = 0; +	mutex_lock(&wq->mutex); +	for_each_pwq(pwq, wq) +		pwq_adjust_max_active(pwq); +	mutex_unlock(&wq->mutex);  	list_add(&wq->list, &workqueues); -	spin_unlock(&workqueue_lock); +	mutex_unlock(&wq_pool_mutex);  	return wq; -err: -	if (wq) { -		free_pwqs(wq); -		free_mayday_mask(wq->mayday_mask); -		kfree(wq->rescuer); -		kfree(wq); -	} + +err_free_wq: +	free_workqueue_attrs(wq->unbound_attrs); +	kfree(wq); +	return NULL; +err_destroy: +	destroy_workqueue(wq);  	return NULL;  }  EXPORT_SYMBOL_GPL(__alloc_workqueue_key); @@ -3258,60 +4183,78 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);   */  void destroy_workqueue(struct workqueue_struct *wq)  { -	unsigned int cpu; +	struct pool_workqueue *pwq; +	int node;  	/* drain it before proceeding with destruction */  	drain_workqueue(wq); +	/* sanity checks */ +	mutex_lock(&wq->mutex); +	for_each_pwq(pwq, wq) { +		int i; + +		for (i = 0; i < WORK_NR_COLORS; i++) { +			if (WARN_ON(pwq->nr_in_flight[i])) { +				mutex_unlock(&wq->mutex); +				return; +			} +		} + +		if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) || +		    WARN_ON(pwq->nr_active) || +		    WARN_ON(!list_empty(&pwq->delayed_works))) { +			mutex_unlock(&wq->mutex); +			return; +		} +	} +	mutex_unlock(&wq->mutex); +  	/*  	 * wq list is used to freeze wq, remove from list after  	 * flushing is complete in case freeze races us.  	 */ -	spin_lock(&workqueue_lock); -	list_del(&wq->list); -	spin_unlock(&workqueue_lock); - -	/* sanity check */ -	for_each_pwq_cpu(cpu, wq) { -		struct pool_workqueue *pwq = get_pwq(cpu, wq); -		int i; +	mutex_lock(&wq_pool_mutex); +	list_del_init(&wq->list); +	mutex_unlock(&wq_pool_mutex); -		for (i = 0; i < WORK_NR_COLORS; i++) -			BUG_ON(pwq->nr_in_flight[i]); -		BUG_ON(pwq->nr_active); -		BUG_ON(!list_empty(&pwq->delayed_works)); -	} +	workqueue_sysfs_unregister(wq); -	if (wq->flags & WQ_RESCUER) { +	if (wq->rescuer) {  		kthread_stop(wq->rescuer->task); -		free_mayday_mask(wq->mayday_mask);  		kfree(wq->rescuer); +		wq->rescuer = NULL;  	} -	free_pwqs(wq); -	kfree(wq); -} -EXPORT_SYMBOL_GPL(destroy_workqueue); - -/** - * pwq_set_max_active - adjust max_active of a pwq - * @pwq: target pool_workqueue - * @max_active: new max_active value. - * - * Set @pwq->max_active to @max_active and activate delayed works if - * increased. - * - * CONTEXT: - * spin_lock_irq(pool->lock). - */ -static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) -{ -	pwq->max_active = max_active; +	if (!(wq->flags & WQ_UNBOUND)) { +		/* +		 * The base ref is never dropped on per-cpu pwqs.  Directly +		 * free the pwqs and wq. +		 */ +		free_percpu(wq->cpu_pwqs); +		kfree(wq); +	} else { +		/* +		 * We're the sole accessor of @wq at this point.  Directly +		 * access numa_pwq_tbl[] and dfl_pwq to put the base refs. +		 * @wq will be freed when the last pwq is released. +		 */ +		for_each_node(node) { +			pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); +			RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); +			put_pwq_unlocked(pwq); +		} -	while (!list_empty(&pwq->delayed_works) && -	       pwq->nr_active < pwq->max_active) -		pwq_activate_first_delayed(pwq); +		/* +		 * Put dfl_pwq.  @wq may be freed any time after dfl_pwq is +		 * put.  Don't access it afterwards. 
+		 */ +		pwq = wq->dfl_pwq; +		wq->dfl_pwq = NULL; +		put_pwq_unlocked(pwq); +	}  } +EXPORT_SYMBOL_GPL(destroy_workqueue);  /**   * workqueue_set_max_active - adjust max_active of a workqueue @@ -3325,30 +4268,37 @@ static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)   */  void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)  { -	unsigned int cpu; +	struct pool_workqueue *pwq; + +	/* disallow meddling with max_active for ordered workqueues */ +	if (WARN_ON(wq->flags & __WQ_ORDERED)) +		return;  	max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); -	spin_lock(&workqueue_lock); +	mutex_lock(&wq->mutex);  	wq->saved_max_active = max_active; -	for_each_pwq_cpu(cpu, wq) { -		struct pool_workqueue *pwq = get_pwq(cpu, wq); -		struct worker_pool *pool = pwq->pool; - -		spin_lock_irq(&pool->lock); +	for_each_pwq(pwq, wq) +		pwq_adjust_max_active(pwq); -		if (!(wq->flags & WQ_FREEZABLE) || -		    !(pool->flags & POOL_FREEZING)) -			pwq_set_max_active(pwq, max_active); +	mutex_unlock(&wq->mutex); +} +EXPORT_SYMBOL_GPL(workqueue_set_max_active); -		spin_unlock_irq(&pool->lock); -	} +/** + * current_is_workqueue_rescuer - is %current workqueue rescuer? + * + * Determine whether %current is a workqueue rescuer.  Can be used from + * work functions to determine whether it's being run off the rescuer task. + */ +bool current_is_workqueue_rescuer(void) +{ +	struct worker *worker = current_wq_worker(); -	spin_unlock(&workqueue_lock); +	return worker && worker->rescue_wq;  } -EXPORT_SYMBOL_GPL(workqueue_set_max_active);  /**   * workqueue_congested - test whether a workqueue is congested @@ -3362,11 +4312,22 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);   * RETURNS:   * %true if congested, %false otherwise.   */ -bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +bool workqueue_congested(int cpu, struct workqueue_struct *wq)  { -	struct pool_workqueue *pwq = get_pwq(cpu, wq); +	struct pool_workqueue *pwq; +	bool ret; + +	rcu_read_lock_sched(); + +	if (!(wq->flags & WQ_UNBOUND)) +		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); +	else +		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); -	return !list_empty(&pwq->delayed_works); +	ret = !list_empty(&pwq->delayed_works); +	rcu_read_unlock_sched(); + +	return ret;  }  EXPORT_SYMBOL_GPL(workqueue_congested); @@ -3383,19 +4344,22 @@ EXPORT_SYMBOL_GPL(workqueue_congested);   */  unsigned int work_busy(struct work_struct *work)  { -	struct worker_pool *pool = get_work_pool(work); +	struct worker_pool *pool;  	unsigned long flags;  	unsigned int ret = 0;  	if (work_pending(work))  		ret |= WORK_BUSY_PENDING; +	local_irq_save(flags); +	pool = get_work_pool(work);  	if (pool) { -		spin_lock_irqsave(&pool->lock, flags); +		spin_lock(&pool->lock);  		if (find_worker_executing_work(pool, work))  			ret |= WORK_BUSY_RUNNING; -		spin_unlock_irqrestore(&pool->lock, flags); +		spin_unlock(&pool->lock);  	} +	local_irq_restore(flags);  	return ret;  } @@ -3421,53 +4385,153 @@ static void wq_unbind_fn(struct work_struct *work)  	int cpu = smp_processor_id();  	struct worker_pool *pool;  	struct worker *worker; -	int i; +	int wi; -	for_each_std_worker_pool(pool, cpu) { -		BUG_ON(cpu != smp_processor_id()); +	for_each_cpu_worker_pool(pool, cpu) { +		WARN_ON_ONCE(cpu != smp_processor_id()); -		mutex_lock(&pool->assoc_mutex); +		mutex_lock(&pool->manager_mutex);  		spin_lock_irq(&pool->lock);  		/* -		 * We've claimed all manager positions.  Make all workers +		 * We've blocked all manager operations.  
Make all workers  		 * unbound and set DISASSOCIATED.  Before this, all workers  		 * except for the ones which are still executing works from  		 * before the last CPU down must be on the cpu.  After  		 * this, they may become diasporas.  		 */ -		list_for_each_entry(worker, &pool->idle_list, entry) -			worker->flags |= WORKER_UNBOUND; - -		for_each_busy_worker(worker, i, pool) +		for_each_pool_worker(worker, wi, pool)  			worker->flags |= WORKER_UNBOUND;  		pool->flags |= POOL_DISASSOCIATED;  		spin_unlock_irq(&pool->lock); -		mutex_unlock(&pool->assoc_mutex); +		mutex_unlock(&pool->manager_mutex); + +		/* +		 * Call schedule() so that we cross rq->lock and thus can +		 * guarantee sched callbacks see the %WORKER_UNBOUND flag. +		 * This is necessary as scheduler callbacks may be invoked +		 * from other cpus. +		 */ +		schedule(); + +		/* +		 * Sched callbacks are disabled now.  Zap nr_running. +		 * After this, nr_running stays zero and need_more_worker() +		 * and keep_working() are always true as long as the +		 * worklist is not empty.  This pool now behaves as an +		 * unbound (in terms of concurrency management) pool which +		 * are served by workers tied to the pool. +		 */ +		atomic_set(&pool->nr_running, 0); + +		/* +		 * With concurrency management just turned off, a busy +		 * worker blocking could lead to lengthy stalls.  Kick off +		 * unbound chain execution of currently pending work items. +		 */ +		spin_lock_irq(&pool->lock); +		wake_up_worker(pool); +		spin_unlock_irq(&pool->lock);  	} +} -	/* -	 * Call schedule() so that we cross rq->lock and thus can guarantee -	 * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary -	 * as scheduler callbacks may be invoked from other cpus. -	 */ -	schedule(); +/** + * rebind_workers - rebind all workers of a pool to the associated CPU + * @pool: pool of interest + * + * @pool->cpu is coming online.  Rebind all workers to the CPU. + */ +static void rebind_workers(struct worker_pool *pool) +{ +	struct worker *worker; +	int wi; + +	lockdep_assert_held(&pool->manager_mutex);  	/* -	 * Sched callbacks are disabled now.  Zap nr_running.  After this, -	 * nr_running stays zero and need_more_worker() and keep_working() -	 * are always true as long as the worklist is not empty.  Pools on -	 * @cpu now behave as unbound (in terms of concurrency management) -	 * pools which are served by workers tied to the CPU. -	 * -	 * On return from this function, the current worker would trigger -	 * unbound chain execution of pending work items if other workers -	 * didn't already. +	 * Restore CPU affinity of all workers.  As all idle workers should +	 * be on the run-queue of the associated CPU before any local +	 * wake-ups for concurrency management happen, restore CPU affinty +	 * of all workers first and then clear UNBOUND.  As we're called +	 * from CPU_ONLINE, the following shouldn't fail.  	 */ -	for_each_std_worker_pool(pool, cpu) -		atomic_set(&pool->nr_running, 0); +	for_each_pool_worker(worker, wi, pool) +		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, +						  pool->attrs->cpumask) < 0); + +	spin_lock_irq(&pool->lock); + +	for_each_pool_worker(worker, wi, pool) { +		unsigned int worker_flags = worker->flags; + +		/* +		 * A bound idle worker should actually be on the runqueue +		 * of the associated CPU for local wake-ups targeting it to +		 * work.  Kick all idle workers so that they migrate to the +		 * associated CPU.  
Doing this in the same loop as +		 * replacing UNBOUND with REBOUND is safe as no worker will +		 * be bound before @pool->lock is released. +		 */ +		if (worker_flags & WORKER_IDLE) +			wake_up_process(worker->task); + +		/* +		 * We want to clear UNBOUND but can't directly call +		 * worker_clr_flags() or adjust nr_running.  Atomically +		 * replace UNBOUND with another NOT_RUNNING flag REBOUND. +		 * @worker will clear REBOUND using worker_clr_flags() when +		 * it initiates the next execution cycle thus restoring +		 * concurrency management.  Note that when or whether +		 * @worker clears REBOUND doesn't affect correctness. +		 * +		 * ACCESS_ONCE() is necessary because @worker->flags may be +		 * tested without holding any lock in +		 * wq_worker_waking_up().  Without it, NOT_RUNNING test may +		 * fail incorrectly leading to premature concurrency +		 * management operations. +		 */ +		WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); +		worker_flags |= WORKER_REBOUND; +		worker_flags &= ~WORKER_UNBOUND; +		ACCESS_ONCE(worker->flags) = worker_flags; +	} + +	spin_unlock_irq(&pool->lock); +} + +/** + * restore_unbound_workers_cpumask - restore cpumask of unbound workers + * @pool: unbound pool of interest + * @cpu: the CPU which is coming up + * + * An unbound pool may end up with a cpumask which doesn't have any online + * CPUs.  When a worker of such pool get scheduled, the scheduler resets + * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any + * online CPU before, cpus_allowed of all its workers should be restored. + */ +static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) +{ +	static cpumask_t cpumask; +	struct worker *worker; +	int wi; + +	lockdep_assert_held(&pool->manager_mutex); + +	/* is @cpu allowed for @pool? */ +	if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) +		return; + +	/* is @cpu the only online CPU? 
*/ +	cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); +	if (cpumask_weight(&cpumask) != 1) +		return; + +	/* as we're called from CPU_ONLINE, the following shouldn't fail */ +	for_each_pool_worker(worker, wi, pool) +		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, +						  pool->attrs->cpumask) < 0);  }  /* @@ -3478,39 +4542,46 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,  					       unsigned long action,  					       void *hcpu)  { -	unsigned int cpu = (unsigned long)hcpu; +	int cpu = (unsigned long)hcpu;  	struct worker_pool *pool; +	struct workqueue_struct *wq; +	int pi;  	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_UP_PREPARE: -		for_each_std_worker_pool(pool, cpu) { -			struct worker *worker; - +		for_each_cpu_worker_pool(pool, cpu) {  			if (pool->nr_workers)  				continue; - -			worker = create_worker(pool); -			if (!worker) +			if (create_and_start_worker(pool) < 0)  				return NOTIFY_BAD; - -			spin_lock_irq(&pool->lock); -			start_worker(worker); -			spin_unlock_irq(&pool->lock);  		}  		break;  	case CPU_DOWN_FAILED:  	case CPU_ONLINE: -		for_each_std_worker_pool(pool, cpu) { -			mutex_lock(&pool->assoc_mutex); -			spin_lock_irq(&pool->lock); +		mutex_lock(&wq_pool_mutex); -			pool->flags &= ~POOL_DISASSOCIATED; -			rebind_workers(pool); +		for_each_pool(pool, pi) { +			mutex_lock(&pool->manager_mutex); + +			if (pool->cpu == cpu) { +				spin_lock_irq(&pool->lock); +				pool->flags &= ~POOL_DISASSOCIATED; +				spin_unlock_irq(&pool->lock); + +				rebind_workers(pool); +			} else if (pool->cpu < 0) { +				restore_unbound_workers_cpumask(pool, cpu); +			} -			spin_unlock_irq(&pool->lock); -			mutex_unlock(&pool->assoc_mutex); +			mutex_unlock(&pool->manager_mutex);  		} + +		/* update NUMA affinity of unbound workqueues */ +		list_for_each_entry(wq, &workqueues, list) +			wq_update_unbound_numa(wq, cpu, true); + +		mutex_unlock(&wq_pool_mutex);  		break;  	}  	return NOTIFY_OK; @@ -3524,14 +4595,23 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,  						 unsigned long action,  						 void *hcpu)  { -	unsigned int cpu = (unsigned long)hcpu; +	int cpu = (unsigned long)hcpu;  	struct work_struct unbind_work; +	struct workqueue_struct *wq;  	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_DOWN_PREPARE: -		/* unbinding should happen on the local CPU */ +		/* unbinding per-cpu workers should happen on the local CPU */  		INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);  		queue_work_on(cpu, system_highpri_wq, &unbind_work); + +		/* update NUMA affinity of unbound workqueues */ +		mutex_lock(&wq_pool_mutex); +		list_for_each_entry(wq, &workqueues, list) +			wq_update_unbound_numa(wq, cpu, false); +		mutex_unlock(&wq_pool_mutex); + +		/* wait for per-cpu unbinding to finish */  		flush_work(&unbind_work);  		break;  	} @@ -3564,7 +4644,7 @@ static void work_for_cpu_fn(struct work_struct *work)   * It is up to the caller to ensure that the cpu doesn't go offline.   * The caller must not hold any locks which would prevent @fn from completing.   */ -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +long work_on_cpu(int cpu, long (*fn)(void *), void *arg)  {  	struct work_for_cpu wfc = { .fn = fn, .arg = arg }; @@ -3582,44 +4662,40 @@ EXPORT_SYMBOL_GPL(work_on_cpu);   * freeze_workqueues_begin - begin freezing workqueues   *   * Start freezing workqueues.  
After this function returns, all freezable - * workqueues will queue new works to their frozen_works list instead of + * workqueues will queue new works to their delayed_works list instead of   * pool->worklist.   *   * CONTEXT: - * Grabs and releases workqueue_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.   */  void freeze_workqueues_begin(void)  { -	unsigned int cpu; +	struct worker_pool *pool; +	struct workqueue_struct *wq; +	struct pool_workqueue *pwq; +	int pi; -	spin_lock(&workqueue_lock); +	mutex_lock(&wq_pool_mutex); -	BUG_ON(workqueue_freezing); +	WARN_ON_ONCE(workqueue_freezing);  	workqueue_freezing = true; -	for_each_wq_cpu(cpu) { -		struct worker_pool *pool; -		struct workqueue_struct *wq; - -		for_each_std_worker_pool(pool, cpu) { -			spin_lock_irq(&pool->lock); - -			WARN_ON_ONCE(pool->flags & POOL_FREEZING); -			pool->flags |= POOL_FREEZING; - -			list_for_each_entry(wq, &workqueues, list) { -				struct pool_workqueue *pwq = get_pwq(cpu, wq); - -				if (pwq && pwq->pool == pool && -				    (wq->flags & WQ_FREEZABLE)) -					pwq->max_active = 0; -			} +	/* set FREEZING */ +	for_each_pool(pool, pi) { +		spin_lock_irq(&pool->lock); +		WARN_ON_ONCE(pool->flags & POOL_FREEZING); +		pool->flags |= POOL_FREEZING; +		spin_unlock_irq(&pool->lock); +	} -			spin_unlock_irq(&pool->lock); -		} +	list_for_each_entry(wq, &workqueues, list) { +		mutex_lock(&wq->mutex); +		for_each_pwq(pwq, wq) +			pwq_adjust_max_active(pwq); +		mutex_unlock(&wq->mutex);  	} -	spin_unlock(&workqueue_lock); +	mutex_unlock(&wq_pool_mutex);  }  /** @@ -3629,7 +4705,7 @@ void freeze_workqueues_begin(void)   * between freeze_workqueues_begin() and thaw_workqueues().   *   * CONTEXT: - * Grabs and releases workqueue_lock. + * Grabs and releases wq_pool_mutex.   *   * RETURNS:   * %true if some freezable workqueues are still busy.  %false if freezing @@ -3637,34 +4713,34 @@ void freeze_workqueues_begin(void)   */  bool freeze_workqueues_busy(void)  { -	unsigned int cpu;  	bool busy = false; +	struct workqueue_struct *wq; +	struct pool_workqueue *pwq; -	spin_lock(&workqueue_lock); +	mutex_lock(&wq_pool_mutex); -	BUG_ON(!workqueue_freezing); +	WARN_ON_ONCE(!workqueue_freezing); -	for_each_wq_cpu(cpu) { -		struct workqueue_struct *wq; +	list_for_each_entry(wq, &workqueues, list) { +		if (!(wq->flags & WQ_FREEZABLE)) +			continue;  		/*  		 * nr_active is monotonically decreasing.  It's safe  		 * to peek without lock.  		 */ -		list_for_each_entry(wq, &workqueues, list) { -			struct pool_workqueue *pwq = get_pwq(cpu, wq); - -			if (!pwq || !(wq->flags & WQ_FREEZABLE)) -				continue; - -			BUG_ON(pwq->nr_active < 0); +		rcu_read_lock_sched(); +		for_each_pwq(pwq, wq) { +			WARN_ON_ONCE(pwq->nr_active < 0);  			if (pwq->nr_active) {  				busy = true; +				rcu_read_unlock_sched();  				goto out_unlock;  			}  		} +		rcu_read_unlock_sched();  	}  out_unlock: -	spin_unlock(&workqueue_lock); +	mutex_unlock(&wq_pool_mutex);  	return busy;  } @@ -3675,104 +4751,141 @@ out_unlock:   * frozen works are transferred to their respective pool worklists.   *   * CONTEXT: - * Grabs and releases workqueue_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.   
*/  void thaw_workqueues(void)  { -	unsigned int cpu; +	struct workqueue_struct *wq; +	struct pool_workqueue *pwq; +	struct worker_pool *pool; +	int pi; -	spin_lock(&workqueue_lock); +	mutex_lock(&wq_pool_mutex);  	if (!workqueue_freezing)  		goto out_unlock; -	for_each_wq_cpu(cpu) { -		struct worker_pool *pool; -		struct workqueue_struct *wq; +	/* clear FREEZING */ +	for_each_pool(pool, pi) { +		spin_lock_irq(&pool->lock); +		WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); +		pool->flags &= ~POOL_FREEZING; +		spin_unlock_irq(&pool->lock); +	} -		for_each_std_worker_pool(pool, cpu) { -			spin_lock_irq(&pool->lock); +	/* restore max_active and repopulate worklist */ +	list_for_each_entry(wq, &workqueues, list) { +		mutex_lock(&wq->mutex); +		for_each_pwq(pwq, wq) +			pwq_adjust_max_active(pwq); +		mutex_unlock(&wq->mutex); +	} -			WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); -			pool->flags &= ~POOL_FREEZING; +	workqueue_freezing = false; +out_unlock: +	mutex_unlock(&wq_pool_mutex); +} +#endif /* CONFIG_FREEZER */ -			list_for_each_entry(wq, &workqueues, list) { -				struct pool_workqueue *pwq = get_pwq(cpu, wq); +static void __init wq_numa_init(void) +{ +	cpumask_var_t *tbl; +	int node, cpu; -				if (!pwq || pwq->pool != pool || -				    !(wq->flags & WQ_FREEZABLE)) -					continue; +	/* determine NUMA pwq table len - highest node id + 1 */ +	for_each_node(node) +		wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1); -				/* restore max_active and repopulate worklist */ -				pwq_set_max_active(pwq, wq->saved_max_active); -			} +	if (num_possible_nodes() <= 1) +		return; -			wake_up_worker(pool); +	if (wq_disable_numa) { +		pr_info("workqueue: NUMA affinity support disabled\n"); +		return; +	} + +	wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); +	BUG_ON(!wq_update_unbound_numa_attrs_buf); -			spin_unlock_irq(&pool->lock); +	/* +	 * We want masks of possible CPUs of each node which isn't readily +	 * available.  Build one from cpu_to_node() which should have been +	 * fully initialized by now. 
+	 */ +	tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); +	BUG_ON(!tbl); + +	for_each_node(node) +		BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node)); + +	for_each_possible_cpu(cpu) { +		node = cpu_to_node(cpu); +		if (WARN_ON(node == NUMA_NO_NODE)) { +			pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); +			/* happens iff arch is bonkers, let's just proceed */ +			return;  		} +		cpumask_set_cpu(cpu, tbl[node]);  	} -	workqueue_freezing = false; -out_unlock: -	spin_unlock(&workqueue_lock); +	wq_numa_possible_cpumask = tbl; +	wq_numa_enabled = true;  } -#endif /* CONFIG_FREEZER */  static int __init init_workqueues(void)  { -	unsigned int cpu; +	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; +	int i, cpu;  	/* make sure we have enough bits for OFFQ pool ID */  	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <  		     WORK_CPU_END * NR_STD_WORKER_POOLS); +	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + +	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); +  	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);  	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); +	wq_numa_init(); +  	/* initialize CPU pools */ -	for_each_wq_cpu(cpu) { +	for_each_possible_cpu(cpu) {  		struct worker_pool *pool; -		for_each_std_worker_pool(pool, cpu) { -			spin_lock_init(&pool->lock); +		i = 0; +		for_each_cpu_worker_pool(pool, cpu) { +			BUG_ON(init_worker_pool(pool));  			pool->cpu = cpu; -			pool->flags |= POOL_DISASSOCIATED; -			INIT_LIST_HEAD(&pool->worklist); -			INIT_LIST_HEAD(&pool->idle_list); -			hash_init(pool->busy_hash); - -			init_timer_deferrable(&pool->idle_timer); -			pool->idle_timer.function = idle_worker_timeout; -			pool->idle_timer.data = (unsigned long)pool; - -			setup_timer(&pool->mayday_timer, pool_mayday_timeout, -				    (unsigned long)pool); - -			mutex_init(&pool->assoc_mutex); -			ida_init(&pool->worker_ida); +			cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); +			pool->attrs->nice = std_nice[i++]; +			pool->node = cpu_to_node(cpu);  			/* alloc pool ID */ +			mutex_lock(&wq_pool_mutex);  			BUG_ON(worker_pool_assign_id(pool)); +			mutex_unlock(&wq_pool_mutex);  		}  	}  	/* create the initial worker */ -	for_each_online_wq_cpu(cpu) { +	for_each_online_cpu(cpu) {  		struct worker_pool *pool; -		for_each_std_worker_pool(pool, cpu) { -			struct worker *worker; +		for_each_cpu_worker_pool(pool, cpu) { +			pool->flags &= ~POOL_DISASSOCIATED; +			BUG_ON(create_and_start_worker(pool) < 0); +		} +	} -			if (cpu != WORK_CPU_UNBOUND) -				pool->flags &= ~POOL_DISASSOCIATED; +	/* create default unbound wq attrs */ +	for (i = 0; i < NR_STD_WORKER_POOLS; i++) { +		struct workqueue_attrs *attrs; -			worker = create_worker(pool); -			BUG_ON(!worker); -			spin_lock_irq(&pool->lock); -			start_worker(worker); -			spin_unlock_irq(&pool->lock); -		} +		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); +		attrs->nice = std_nice[i]; +		unbound_std_wq_attrs[i] = attrs;  	}  	system_wq = alloc_workqueue("events", 0, 0); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 07650264ec1..84ab6e1dc6f 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -32,14 +32,12 @@ struct worker {  	struct list_head	scheduled;	/* L: scheduled works */  	struct task_struct	*task;		/* I: worker task */  	struct worker_pool	*pool;		/* I: the associated pool */ +						/* L: for rescuers */  	/* 64 bytes boundary 
on 64bit, 32 on 32bit */  	unsigned long		last_active;	/* L: last active timestamp */  	unsigned int		flags;		/* X: flags */  	int			id;		/* I: worker id */ -	/* for rebinding worker to CPU */ -	struct work_struct	rebind_work;	/* L: for busy worker */ -  	/* used only by rescuers to point to the target workqueue */  	struct workqueue_struct	*rescue_wq;	/* I: the workqueue to rescue */ }; @@ -58,8 +56,7 @@ static inline struct worker *current_wq_worker(void)   * Scheduler hooks for concurrency managed workqueue.  Only to be used from   * sched.c and workqueue.c.   */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, -				       unsigned int cpu); +void wq_worker_waking_up(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
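A few usage sketches for the interfaces touched above follow; they are illustrative only and not part of the diff. First, the work_on_cpu() prototype now takes a plain int cpu, which does not change how callers use it. A minimal sketch, assuming the usual get_online_cpus()/put_online_cpus() hotplug protection; read_local_node() and query_cpu_node() are invented names:

#include <linux/cpu.h>
#include <linux/topology.h>
#include <linux/workqueue.h>

/* runs in process context on the CPU handed to work_on_cpu() */
static long read_local_node(void *unused)
{
	return numa_node_id();
}

/* illustrative helper: report the NUMA node of an online CPU */
static long query_cpu_node(int cpu)
{
	long node = -ENODEV;

	get_online_cpus();		/* keep @cpu from going offline */
	if (cpu_online(cpu))
		node = work_on_cpu(cpu, read_local_node, NULL);
	put_online_cpus();

	return node;
}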
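Second, the freeze/thaw entry points keep their semantics: once freeze_workqueues_begin() returns, freezable workqueues park newly queued work on their delayed_works lists until thaw_workqueues() restores max_active. A rough sketch of the expected calling sequence, loosely modeled on the suspend path; the function names are invented and the timeout handling a real freezer needs is omitted:

#include <linux/delay.h>
#include <linux/workqueue.h>

/* illustrative: called on the way into suspend/hibernation */
static void freeze_wqs_for_suspend(void)
{
	freeze_workqueues_begin();	/* freezable wqs stop releasing new work */

	/* poll until in-flight work items on freezable wqs have drained */
	while (freeze_workqueues_busy())
		msleep(10);
}

/* illustrative: called on the way back out */
static void thaw_wqs_after_resume(void)
{
	thaw_workqueues();		/* restore max_active and kick workers */
}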
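Finally, init_workqueues() above allocates default workqueue_attrs (a nice level plus a cpumask) for unbound pools. A hedged sketch of how a caller might pair such an attrs object with an unbound workqueue; my_wq and setup_my_wq() are invented names, and apply_workqueue_attrs()/free_workqueue_attrs() are assumed to be the companion interfaces added elsewhere in this series:

#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;		/* illustrative name */

static int __init setup_my_wq(void)
{
	struct workqueue_attrs *attrs;
	int ret = -ENOMEM;

	my_wq = alloc_workqueue("my_unbound_wq", WQ_UNBOUND, 0);
	if (!my_wq)
		return -ENOMEM;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		goto err_wq;

	attrs->nice = -5;				/* mildly boosted workers */
	cpumask_copy(attrs->cpumask, cpumask_of_node(0));	/* node 0 CPUs only */

	ret = apply_workqueue_attrs(my_wq, attrs);	/* assumed companion API */
	free_workqueue_attrs(attrs);
	if (ret)
		goto err_wq;
	return 0;

err_wq:
	destroy_workqueue(my_wq);
	return ret;
}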