Diffstat (limited to 'kernel'): 106 files changed, 5139 insertions, 3167 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 6c072b6da23..eceac38f3c6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -153,23 +153,7 @@ kernel/modsign_certificate.o: signing_key.x509 extra_certificates  # fail and that the kernel may be used afterwards.  #  ############################################################################### -sign_key_with_hash := -ifeq ($(CONFIG_MODULE_SIG_SHA1),y) -sign_key_with_hash := -sha1 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA224),y) -sign_key_with_hash := -sha224 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA256),y) -sign_key_with_hash := -sha256 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA384),y) -sign_key_with_hash := -sha384 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA512),y) -sign_key_with_hash := -sha512 -endif -ifeq ($(sign_key_with_hash),) +ifndef CONFIG_MODULE_SIG_HASH  $(error Could not determine digest type to use from kernel config)  endif @@ -182,8 +166,8 @@ signing_key.priv signing_key.x509: x509.genkey  	@echo "### needs to be run as root, and uses a hardware random"  	@echo "### number generator if one is available."  	@echo "###" -	openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ -		-x509 -config x509.genkey \ +	openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ +		-batch -x509 -config x509.genkey \  		-outform DER -out signing_key.x509 \  		-keyout signing_key.priv  	@echo "###" diff --git a/kernel/acct.c b/kernel/acct.c index 0d2981358e0..b9bd7f098ee 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -566,6 +566,7 @@ out:  void acct_collect(long exitcode, int group_dead)  {  	struct pacct_struct *pacct = ¤t->signal->pacct; +	cputime_t utime, stime;  	unsigned long vsize = 0;  	if (group_dead && current->mm) { @@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead)  		pacct->ac_flag |= ACORE;  	if (current->flags & PF_SIGNALED)  		pacct->ac_flag |= AXSIG; -	pacct->ac_utime += current->utime; -	pacct->ac_stime += current->stime; +	task_cputime(current, &utime, &stime); +	pacct->ac_utime += utime; +	pacct->ac_stime += stime;  	pacct->ac_minflt += current->min_flt;  	pacct->ac_majflt += current->maj_flt;  	spin_unlock_irq(¤t->sighand->siglock); diff --git a/kernel/async.c b/kernel/async.c index a1d585c351d..8ddee2c3e5b 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -57,56 +57,52 @@ asynchronous and synchronous parts of the kernel.  #include <linux/slab.h>  #include <linux/workqueue.h> +#include "workqueue_internal.h" +  static async_cookie_t next_cookie = 1; -#define MAX_WORK	32768 +#define MAX_WORK		32768 +#define ASYNC_COOKIE_MAX	ULLONG_MAX	/* infinity cookie */ -static LIST_HEAD(async_pending); -static ASYNC_DOMAIN(async_running); -static LIST_HEAD(async_domains); +static LIST_HEAD(async_global_pending);	/* pending from all registered doms */ +static ASYNC_DOMAIN(async_dfl_domain);  static DEFINE_SPINLOCK(async_lock); -static DEFINE_MUTEX(async_register_mutex);  struct async_entry { -	struct list_head	list; +	struct list_head	domain_list; +	struct list_head	global_list;  	struct work_struct	work;  	async_cookie_t		cookie;  	async_func_ptr		*func;  	void			*data; -	struct async_domain	*running; +	struct async_domain	*domain;  };  static DECLARE_WAIT_QUEUE_HEAD(async_done);  static atomic_t entry_count; - -/* - * MUST be called with the lock held! 
- */ -static async_cookie_t  __lowest_in_progress(struct async_domain *running) +static async_cookie_t lowest_in_progress(struct async_domain *domain)  { -	struct async_entry *entry; - -	if (!list_empty(&running->domain)) { -		entry = list_first_entry(&running->domain, typeof(*entry), list); -		return entry->cookie; -	} +	struct async_entry *first = NULL; +	async_cookie_t ret = ASYNC_COOKIE_MAX; +	unsigned long flags; -	list_for_each_entry(entry, &async_pending, list) -		if (entry->running == running) -			return entry->cookie; +	spin_lock_irqsave(&async_lock, flags); -	return next_cookie;	/* "infinity" value */ -} +	if (domain) { +		if (!list_empty(&domain->pending)) +			first = list_first_entry(&domain->pending, +					struct async_entry, domain_list); +	} else { +		if (!list_empty(&async_global_pending)) +			first = list_first_entry(&async_global_pending, +					struct async_entry, global_list); +	} -static async_cookie_t  lowest_in_progress(struct async_domain *running) -{ -	unsigned long flags; -	async_cookie_t ret; +	if (first) +		ret = first->cookie; -	spin_lock_irqsave(&async_lock, flags); -	ret = __lowest_in_progress(running);  	spin_unlock_irqrestore(&async_lock, flags);  	return ret;  } @@ -120,14 +116,8 @@ static void async_run_entry_fn(struct work_struct *work)  		container_of(work, struct async_entry, work);  	unsigned long flags;  	ktime_t uninitialized_var(calltime), delta, rettime; -	struct async_domain *running = entry->running; - -	/* 1) move self to the running queue */ -	spin_lock_irqsave(&async_lock, flags); -	list_move_tail(&entry->list, &running->domain); -	spin_unlock_irqrestore(&async_lock, flags); -	/* 2) run (and print duration) */ +	/* 1) run (and print duration) */  	if (initcall_debug && system_state == SYSTEM_BOOTING) {  		printk(KERN_DEBUG "calling  %lli_%pF @ %i\n",  			(long long)entry->cookie, @@ -144,23 +134,22 @@ static void async_run_entry_fn(struct work_struct *work)  			(long long)ktime_to_ns(delta) >> 10);  	} -	/* 3) remove self from the running queue */ +	/* 2) remove self from the pending queues */  	spin_lock_irqsave(&async_lock, flags); -	list_del(&entry->list); -	if (running->registered && --running->count == 0) -		list_del_init(&running->node); +	list_del_init(&entry->domain_list); +	list_del_init(&entry->global_list); -	/* 4) free the entry */ +	/* 3) free the entry */  	kfree(entry);  	atomic_dec(&entry_count);  	spin_unlock_irqrestore(&async_lock, flags); -	/* 5) wake up any waiters */ +	/* 4) wake up any waiters */  	wake_up(&async_done);  } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)  {  	struct async_entry *entry;  	unsigned long flags; @@ -183,16 +172,22 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a  		ptr(data, newcookie);  		return newcookie;  	} +	INIT_LIST_HEAD(&entry->domain_list); +	INIT_LIST_HEAD(&entry->global_list);  	INIT_WORK(&entry->work, async_run_entry_fn);  	entry->func = ptr;  	entry->data = data; -	entry->running = running; +	entry->domain = domain;  	spin_lock_irqsave(&async_lock, flags); + +	/* allocate cookie and queue */  	newcookie = entry->cookie = next_cookie++; -	list_add_tail(&entry->list, &async_pending); -	if (running->registered && running->count++ == 0) -		list_add_tail(&running->node, &async_domains); + +	list_add_tail(&entry->domain_list, &domain->pending); +	if (domain->registered) +		
list_add_tail(&entry->global_list, &async_global_pending); +  	atomic_inc(&entry_count);  	spin_unlock_irqrestore(&async_lock, flags); @@ -215,7 +210,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a   */  async_cookie_t async_schedule(async_func_ptr *ptr, void *data)  { -	return __async_schedule(ptr, data, &async_running); +	return __async_schedule(ptr, data, &async_dfl_domain);  }  EXPORT_SYMBOL_GPL(async_schedule); @@ -223,18 +218,18 @@ EXPORT_SYMBOL_GPL(async_schedule);   * async_schedule_domain - schedule a function for asynchronous execution within a certain domain   * @ptr: function to execute asynchronously   * @data: data pointer to pass to the function - * @running: running list for the domain + * @domain: the domain   *   * Returns an async_cookie_t that may be used for checkpointing later. - * @running may be used in the async_synchronize_*_domain() functions - * to wait within a certain synchronization domain rather than globally. - * A synchronization domain is specified via the running queue @running to use. - * Note: This function may be called from atomic or non-atomic contexts. + * @domain may be used in the async_synchronize_*_domain() functions to + * wait within a certain synchronization domain rather than globally.  A + * synchronization domain is specified via @domain.  Note: This function + * may be called from atomic or non-atomic contexts.   */  async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, -				     struct async_domain *running) +				     struct async_domain *domain)  { -	return __async_schedule(ptr, data, running); +	return __async_schedule(ptr, data, domain);  }  EXPORT_SYMBOL_GPL(async_schedule_domain); @@ -245,18 +240,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);   */  void async_synchronize_full(void)  { -	mutex_lock(&async_register_mutex); -	do { -		struct async_domain *domain = NULL; - -		spin_lock_irq(&async_lock); -		if (!list_empty(&async_domains)) -			domain = list_first_entry(&async_domains, typeof(*domain), node); -		spin_unlock_irq(&async_lock); - -		async_synchronize_cookie_domain(next_cookie, domain); -	} while (!list_empty(&async_domains)); -	mutex_unlock(&async_register_mutex); +	async_synchronize_full_domain(NULL);  }  EXPORT_SYMBOL_GPL(async_synchronize_full); @@ -271,51 +255,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full);   */  void async_unregister_domain(struct async_domain *domain)  { -	mutex_lock(&async_register_mutex);  	spin_lock_irq(&async_lock); -	WARN_ON(!domain->registered || !list_empty(&domain->node) || -		!list_empty(&domain->domain)); +	WARN_ON(!domain->registered || !list_empty(&domain->pending));  	domain->registered = 0;  	spin_unlock_irq(&async_lock); -	mutex_unlock(&async_register_mutex);  }  EXPORT_SYMBOL_GPL(async_unregister_domain);  /**   * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @domain: running list to synchronize on + * @domain: the domain to synchronize   *   * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @domain have been done. + * synchronization domain specified by @domain have been done.   
*/  void async_synchronize_full_domain(struct async_domain *domain)  { -	async_synchronize_cookie_domain(next_cookie, domain); +	async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);  }  EXPORT_SYMBOL_GPL(async_synchronize_full_domain);  /**   * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing   * @cookie: async_cookie_t to use as checkpoint - * @running: running list to synchronize on + * @domain: the domain to synchronize (%NULL for all registered domains)   *   * This function waits until all asynchronous function calls for the - * synchronization domain specified by running list @running submitted - * prior to @cookie have been done. + * synchronization domain specified by @domain submitted prior to @cookie + * have been done.   */ -void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)  {  	ktime_t uninitialized_var(starttime), delta, endtime; -	if (!running) -		return; -  	if (initcall_debug && system_state == SYSTEM_BOOTING) {  		printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));  		starttime = ktime_get();  	} -	wait_event(async_done, lowest_in_progress(running) >= cookie); +	wait_event(async_done, lowest_in_progress(domain) >= cookie);  	if (initcall_debug && system_state == SYSTEM_BOOTING) {  		endtime = ktime_get(); @@ -337,6 +315,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);   */  void async_synchronize_cookie(async_cookie_t cookie)  { -	async_synchronize_cookie_domain(cookie, &async_running); +	async_synchronize_cookie_domain(cookie, &async_dfl_domain);  }  EXPORT_SYMBOL_GPL(async_synchronize_cookie); + +/** + * current_is_async - is %current an async worker task? + * + * Returns %true if %current is an async worker task. + */ +bool current_is_async(void) +{ +	struct worker *worker = current_wq_worker(); + +	return worker && worker->current_func == async_run_entry_fn; +} diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4fe52b3b6ef..fb2fb11fbb2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -52,7 +52,7 @@  #include <linux/module.h>  #include <linux/delayacct.h>  #include <linux/cgroupstats.h> -#include <linux/hash.h> +#include <linux/hashtable.h>  #include <linux/namei.h>  #include <linux/pid_namespace.h>  #include <linux/idr.h> @@ -376,22 +376,18 @@ static int css_set_count;   * account cgroups in empty hierarchies.   */  #define CSS_SET_HASH_BITS	7 -#define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS) -static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; +static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); -static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) +static unsigned long css_set_hash(struct cgroup_subsys_state *css[])  {  	int i; -	int index; -	unsigned long tmp = 0UL; +	unsigned long key = 0UL;  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) -		tmp += (unsigned long)css[i]; -	tmp = (tmp >> 16) ^ tmp; +		key += (unsigned long)css[i]; +	key = (key >> 16) ^ key; -	index = hash_long(tmp, CSS_SET_HASH_BITS); - -	return &css_set_table[index]; +	return key;  }  /* We don't maintain the lists running through each css_set to its @@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)  	}  	/* This css_set is dead. 
unlink it and release cgroup refcounts */ -	hlist_del(&cg->hlist); +	hash_del(&cg->hlist);  	css_set_count--;  	list_for_each_entry_safe(link, saved_link, &cg->cg_links, @@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)  		struct cgroup *cgrp = link->cgrp;  		list_del(&link->cg_link_list);  		list_del(&link->cgrp_link_list); + +		/* +		 * We may not be holding cgroup_mutex, and if cgrp->count is +		 * dropped to 0 the cgroup can be destroyed at any time, hence +		 * rcu_read_lock is used to keep it alive. +		 */ +		rcu_read_lock();  		if (atomic_dec_and_test(&cgrp->count) &&  		    notify_on_release(cgrp)) {  			if (taskexit)  				set_bit(CGRP_RELEASABLE, &cgrp->flags);  			check_for_release(cgrp);  		} +		rcu_read_unlock();  		kfree(link);  	} @@ -550,9 +554,9 @@ static struct css_set *find_existing_css_set(  {  	int i;  	struct cgroupfs_root *root = cgrp->root; -	struct hlist_head *hhead;  	struct hlist_node *node;  	struct css_set *cg; +	unsigned long key;  	/*  	 * Build the set of subsystem state objects that we want to see in the @@ -572,8 +576,8 @@ static struct css_set *find_existing_css_set(  		}  	} -	hhead = css_set_hash(template); -	hlist_for_each_entry(cg, node, hhead, hlist) { +	key = css_set_hash(template); +	hash_for_each_possible(css_set_table, cg, node, hlist, key) {  		if (!compare_css_sets(cg, oldcg, cgrp, template))  			continue; @@ -657,8 +661,8 @@ static struct css_set *find_css_set(  	struct list_head tmp_cg_links; -	struct hlist_head *hhead;  	struct cg_cgroup_link *link; +	unsigned long key;  	/* First see if we already have a cgroup group that matches  	 * the desired set */ @@ -704,8 +708,8 @@ static struct css_set *find_css_set(  	css_set_count++;  	/* Add this cgroup group to the hash table */ -	hhead = css_set_hash(res->subsys); -	hlist_add_head(&res->hlist, hhead); +	key = css_set_hash(res->subsys); +	hash_add(css_set_table, &res->hlist, key);  	write_unlock(&css_set_lock); @@ -856,47 +860,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)  	return inode;  } -static void cgroup_diput(struct dentry *dentry, struct inode *inode) +static void cgroup_free_fn(struct work_struct *work)  { -	/* is dentry a directory ? if so, kfree() associated cgroup */ -	if (S_ISDIR(inode->i_mode)) { -		struct cgroup *cgrp = dentry->d_fsdata; -		struct cgroup_subsys *ss; -		BUG_ON(!(cgroup_is_removed(cgrp))); -		/* It's possible for external users to be holding css -		 * reference counts on a cgroup; css_put() needs to -		 * be able to access the cgroup after decrementing -		 * the reference count in order to know if it needs to -		 * queue the cgroup to be handled by the release -		 * agent */ -		synchronize_rcu(); +	struct cgroup *cgrp = container_of(work, struct cgroup, free_work); +	struct cgroup_subsys *ss; -		mutex_lock(&cgroup_mutex); -		/* -		 * Release the subsystem state objects. -		 */ -		for_each_subsys(cgrp->root, ss) -			ss->css_free(cgrp); +	mutex_lock(&cgroup_mutex); +	/* +	 * Release the subsystem state objects. 
+	 */ +	for_each_subsys(cgrp->root, ss) +		ss->css_free(cgrp); -		cgrp->root->number_of_cgroups--; -		mutex_unlock(&cgroup_mutex); +	cgrp->root->number_of_cgroups--; +	mutex_unlock(&cgroup_mutex); -		/* -		 * Drop the active superblock reference that we took when we -		 * created the cgroup -		 */ -		deactivate_super(cgrp->root->sb); +	/* +	 * Drop the active superblock reference that we took when we +	 * created the cgroup +	 */ +	deactivate_super(cgrp->root->sb); -		/* -		 * if we're getting rid of the cgroup, refcount should ensure -		 * that there are no pidlists left. -		 */ -		BUG_ON(!list_empty(&cgrp->pidlists)); +	/* +	 * if we're getting rid of the cgroup, refcount should ensure +	 * that there are no pidlists left. +	 */ +	BUG_ON(!list_empty(&cgrp->pidlists)); -		simple_xattrs_free(&cgrp->xattrs); +	simple_xattrs_free(&cgrp->xattrs); -		ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); -		kfree_rcu(cgrp, rcu_head); +	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); +	kfree(cgrp); +} + +static void cgroup_free_rcu(struct rcu_head *head) +{ +	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); + +	schedule_work(&cgrp->free_work); +} + +static void cgroup_diput(struct dentry *dentry, struct inode *inode) +{ +	/* is dentry a directory ? if so, kfree() associated cgroup */ +	if (S_ISDIR(inode->i_mode)) { +		struct cgroup *cgrp = dentry->d_fsdata; + +		BUG_ON(!(cgroup_is_removed(cgrp))); +		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);  	} else {  		struct cfent *cfe = __d_cfe(dentry);  		struct cgroup *cgrp = dentry->d_parent->d_fsdata; @@ -925,13 +936,17 @@ static void remove_dir(struct dentry *d)  	dput(parent);  } -static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)  {  	struct cfent *cfe;  	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);  	lockdep_assert_held(&cgroup_mutex); +	/* +	 * If we're doing cleanup due to failure of cgroup_create(), +	 * the corresponding @cfe may not exist. 
+	 */  	list_for_each_entry(cfe, &cgrp->files, node) {  		struct dentry *d = cfe->dentry; @@ -944,9 +959,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)  		list_del_init(&cfe->node);  		dput(d); -		return 0; +		break;  	} -	return -ENOENT;  }  /** @@ -1083,7 +1097,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,  		}  	}  	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; -	synchronize_rcu();  	return 0;  } @@ -1393,6 +1406,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)  	INIT_LIST_HEAD(&cgrp->allcg_node);  	INIT_LIST_HEAD(&cgrp->release_list);  	INIT_LIST_HEAD(&cgrp->pidlists); +	INIT_WORK(&cgrp->free_work, cgroup_free_fn);  	mutex_init(&cgrp->pidlist_mutex);  	INIT_LIST_HEAD(&cgrp->event_list);  	spin_lock_init(&cgrp->event_list_lock); @@ -1597,6 +1611,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		struct cgroupfs_root *existing_root;  		const struct cred *cred;  		int i; +		struct hlist_node *node; +		struct css_set *cg;  		BUG_ON(sb->s_root != NULL); @@ -1650,14 +1666,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		/* Link the top cgroup in this hierarchy into all  		 * the css_set objects */  		write_lock(&css_set_lock); -		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { -			struct hlist_head *hhead = &css_set_table[i]; -			struct hlist_node *node; -			struct css_set *cg; - -			hlist_for_each_entry(cg, node, hhead, hlist) -				link_css_set(&tmp_cg_links, cg, root_cgrp); -		} +		hash_for_each(css_set_table, i, node, cg, hlist) +			link_css_set(&tmp_cg_links, cg, root_cgrp);  		write_unlock(&css_set_lock);  		free_cg_links(&tmp_cg_links); @@ -1773,7 +1783,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)  	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),  			   "cgroup_path() called without proper locking"); -	if (!dentry || cgrp == dummytop) { +	if (cgrp == dummytop) {  		/*  		 * Inactive subsystems have no dentry for their root  		 * cgroup @@ -1982,7 +1992,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  			ss->attach(cgrp, &tset);  	} -	synchronize_rcu();  out:  	if (retval) {  		for_each_subsys(root, ss) { @@ -2151,7 +2160,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	/*  	 * step 5: success! and cleanup  	 */ -	synchronize_rcu();  	retval = 0;  out_put_css_set_refs:  	if (retval) { @@ -2769,14 +2777,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,  		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)  			continue; -		if (is_add) +		if (is_add) {  			err = cgroup_add_file(cgrp, subsys, cft); -		else -			err = cgroup_rm_file(cgrp, cft); -		if (err) { -			pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", -				   is_add ? "add" : "remove", cft->name, err); +			if (err) +				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", +					cft->name, err);  			ret = err; +		} else { +			cgroup_rm_file(cgrp, cft);  		}  	}  	return ret; @@ -3017,6 +3025,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,  }  EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); +/** + * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup + * @pos: cgroup of interest + * + * Return the rightmost descendant of @pos.  If there's no descendant, + * @pos is returned.  This can be used during pre-order traversal to skip + * subtree of @pos. 
+ */ +struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) +{ +	struct cgroup *last, *tmp; + +	WARN_ON_ONCE(!rcu_read_lock_held()); + +	do { +		last = pos; +		/* ->prev isn't RCU safe, walk ->next till the end */ +		pos = NULL; +		list_for_each_entry_rcu(tmp, &last->children, sibling) +			pos = tmp; +	} while (pos); + +	return last; +} +EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); +  static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)  {  	struct cgroup *last; @@ -3752,8 +3786,13 @@ static void cgroup_event_remove(struct work_struct *work)  			remove);  	struct cgroup *cgrp = event->cgrp; +	remove_wait_queue(event->wqh, &event->wait); +  	event->cft->unregister_event(cgrp, event->cft, event->eventfd); +	/* Notify userspace the event is going away. */ +	eventfd_signal(event->eventfd, 1); +  	eventfd_ctx_put(event->eventfd);  	kfree(event);  	dput(cgrp->dentry); @@ -3773,15 +3812,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,  	unsigned long flags = (unsigned long)key;  	if (flags & POLLHUP) { -		__remove_wait_queue(event->wqh, &event->wait); -		spin_lock(&cgrp->event_list_lock); -		list_del_init(&event->list); -		spin_unlock(&cgrp->event_list_lock);  		/* -		 * We are in atomic context, but cgroup_event_remove() may -		 * sleep, so we have to call it in workqueue. +		 * If the event has been detached at cgroup removal, we +		 * can simply return knowing the other side will cleanup +		 * for us. +		 * +		 * We can't race against event freeing since the other +		 * side will require wqh->lock via remove_wait_queue(), +		 * which we hold.  		 */ -		schedule_work(&event->remove); +		spin_lock(&cgrp->event_list_lock); +		if (!list_empty(&event->list)) { +			list_del_init(&event->list); +			/* +			 * We are in atomic context, but cgroup_event_remove() +			 * may sleep, so we have to call it in workqueue. +			 */ +			schedule_work(&event->remove); +		} +		spin_unlock(&cgrp->event_list_lock);  	}  	return 0; @@ -3807,6 +3856,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,  				      const char *buffer)  {  	struct cgroup_event *event = NULL; +	struct cgroup *cgrp_cfile;  	unsigned int efd, cfd;  	struct file *efile = NULL;  	struct file *cfile = NULL; @@ -3862,6 +3912,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,  		goto fail;  	} +	/* +	 * The file to be monitored must be in the same cgroup as +	 * cgroup.event_control is. +	 */ +	cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); +	if (cgrp_cfile != cgrp) { +		ret = -EINVAL; +		goto fail; +	} +  	if (!event->cft->register_event || !event->cft->unregister_event) {  		ret = -EINVAL;  		goto fail; @@ -4135,6 +4195,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	init_cgroup_housekeeping(cgrp); +	dentry->d_fsdata = cgrp; +	cgrp->dentry = dentry; +  	cgrp->parent = parent;  	cgrp->root = parent->root;  	cgrp->top_cgroup = parent->top_cgroup; @@ -4172,8 +4235,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	lockdep_assert_held(&dentry->d_inode->i_mutex);  	/* allocation complete, commit to creation */ -	dentry->d_fsdata = cgrp; -	cgrp->dentry = dentry;  	list_add_tail(&cgrp->allcg_node, &root->allcg_list);  	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);  	root->number_of_cgroups++; @@ -4340,20 +4401,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)  	/*  	 * Unregister events and notify userspace.  	 
* Notify userspace about cgroup removing only after rmdir of cgroup -	 * directory to avoid race between userspace and kernelspace. Use -	 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since -	 * cgroup_event_wake() is called with the wait queue head locked, -	 * remove_wait_queue() cannot be called while holding event_list_lock. +	 * directory to avoid race between userspace and kernelspace.  	 */  	spin_lock(&cgrp->event_list_lock); -	list_splice_init(&cgrp->event_list, &tmp_list); -	spin_unlock(&cgrp->event_list_lock); -	list_for_each_entry_safe(event, tmp, &tmp_list, list) { +	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {  		list_del_init(&event->list); -		remove_wait_queue(event->wqh, &event->wait); -		eventfd_signal(event->eventfd, 1);  		schedule_work(&event->remove);  	} +	spin_unlock(&cgrp->event_list_lock);  	return 0;  } @@ -4438,6 +4493,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  {  	struct cgroup_subsys_state *css;  	int i, ret; +	struct hlist_node *node, *tmp; +	struct css_set *cg; +	unsigned long key;  	/* check name and function validity */  	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || @@ -4503,23 +4561,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  	 * this is all done under the css_set_lock.  	 */  	write_lock(&css_set_lock); -	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { -		struct css_set *cg; -		struct hlist_node *node, *tmp; -		struct hlist_head *bucket = &css_set_table[i], *new_bucket; - -		hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { -			/* skip entries that we already rehashed */ -			if (cg->subsys[ss->subsys_id]) -				continue; -			/* remove existing entry */ -			hlist_del(&cg->hlist); -			/* set new value */ -			cg->subsys[ss->subsys_id] = css; -			/* recompute hash and restore entry */ -			new_bucket = css_set_hash(cg->subsys); -			hlist_add_head(&cg->hlist, new_bucket); -		} +	hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) { +		/* skip entries that we already rehashed */ +		if (cg->subsys[ss->subsys_id]) +			continue; +		/* remove existing entry */ +		hash_del(&cg->hlist); +		/* set new value */ +		cg->subsys[ss->subsys_id] = css; +		/* recompute hash and restore entry */ +		key = css_set_hash(cg->subsys); +		hash_add(css_set_table, node, key);  	}  	write_unlock(&css_set_lock); @@ -4551,7 +4603,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);  void cgroup_unload_subsys(struct cgroup_subsys *ss)  {  	struct cg_cgroup_link *link; -	struct hlist_head *hhead;  	BUG_ON(ss->module == NULL); @@ -4585,11 +4636,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)  	write_lock(&css_set_lock);  	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {  		struct css_set *cg = link->cg; +		unsigned long key; -		hlist_del(&cg->hlist); +		hash_del(&cg->hlist);  		cg->subsys[ss->subsys_id] = NULL; -		hhead = css_set_hash(cg->subsys); -		hlist_add_head(&cg->hlist, hhead); +		key = css_set_hash(cg->subsys); +		hash_add(css_set_table, &cg->hlist, key);  	}  	write_unlock(&css_set_lock); @@ -4631,9 +4683,6 @@ int __init cgroup_init_early(void)  	list_add(&init_css_set_link.cg_link_list,  		 &init_css_set.cg_links); -	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) -		INIT_HLIST_HEAD(&css_set_table[i]); -  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  		struct cgroup_subsys *ss = subsys[i]; @@ -4667,7 +4716,7 @@ int __init cgroup_init(void)  {  	int err;  	int i; -	struct hlist_head *hhead; +	unsigned long key;  	err = 
bdi_init(&cgroup_backing_dev_info);  	if (err) @@ -4686,8 +4735,8 @@ int __init cgroup_init(void)  	}  	/* Add init_css_set to the hash table */ -	hhead = css_set_hash(init_css_set.subsys); -	hlist_add_head(&init_css_set.hlist, hhead); +	key = css_set_hash(init_css_set.subsys); +	hash_add(css_set_table, &init_css_set.hlist, key);  	BUG_ON(!init_root_id(&rootnode));  	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); @@ -4982,8 +5031,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)  	}  	task_unlock(tsk); -	if (cg) -		put_css_set_taskexit(cg); +	put_css_set_taskexit(cg);  }  /** diff --git a/kernel/compat.c b/kernel/compat.c index 36700e9e2be..19971d8c729 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,  		 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));  } -asmlinkage long compat_sys_getitimer(int which, -		struct compat_itimerval __user *it) +COMPAT_SYSCALL_DEFINE2(getitimer, int, which, +		struct compat_itimerval __user *, it)  {  	struct itimerval kit;  	int error; @@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,  	return error;  } -asmlinkage long compat_sys_setitimer(int which, -		struct compat_itimerval __user *in, -		struct compat_itimerval __user *out) +COMPAT_SYSCALL_DEFINE3(setitimer, int, which, +		struct compat_itimerval __user *, in, +		struct compat_itimerval __user *, out)  {  	struct itimerval kin, kout;  	int error; @@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)  	memcpy(blocked->sig, &set, sizeof(set));  } -asmlinkage long compat_sys_sigprocmask(int how, -				       compat_old_sigset_t __user *nset, -				       compat_old_sigset_t __user *oset) +COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, +		       compat_old_sigset_t __user *, nset, +		       compat_old_sigset_t __user *, oset)  {  	old_sigset_t old_set, new_set;  	sigset_t new_blocked; @@ -593,7 +593,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,  		else  			ret = put_compat_rusage(&ru, uru);  		if (ret) -			return ret; +			return -EFAULT;  	}  	BUG_ON(info.si_code & __SI_MASK); @@ -971,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,  }  void -sigset_from_compat (sigset_t *set, compat_sigset_t *compat) +sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)  {  	switch (_NSIG_WORDS) {  	case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); @@ -982,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)  }  EXPORT_SYMBOL_GPL(sigset_from_compat); -asmlinkage long -compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, -		struct compat_siginfo __user *uinfo, -		struct compat_timespec __user *uts, compat_size_t sigsetsize) +void +sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) +{ +	switch (_NSIG_WORDS) { +	case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3]; +	case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2]; +	case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1]; +	case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0]; +	} +} + +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, +		struct compat_siginfo __user *, uinfo, +		struct compat_timespec __user *, uts, compat_size_t, sigsetsize)  {  	compat_sigset_t s32;  	sigset_t s; @@ -1013,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,  	}  	
return ret; - -} - -asmlinkage long -compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, -			     struct compat_siginfo __user *uinfo) -{ -	siginfo_t info; - -	if (copy_siginfo_from_user32(&info, uinfo)) -		return -EFAULT; -	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);  }  #ifdef __ARCH_WANT_COMPAT_SYS_TIME @@ -1067,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)  #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ -#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND -asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize) -{ -	sigset_t newset; -	compat_sigset_t newset32; - -	/* XXX: Don't preclude handling different sized sigset_t's.  */ -	if (sigsetsize != sizeof(sigset_t)) -		return -EINVAL; - -	if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) -		return -EFAULT; -	sigset_from_compat(&newset, &newset32); -	return sigsuspend(&newset); -} -#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ -  asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)  {  	struct timex txc; @@ -1222,9 +1203,9 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)  	return 0;  } -#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL -asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, -						 struct compat_timespec __user *interval) +COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, +		       compat_pid_t, pid, +		       struct compat_timespec __user *, interval)  {  	struct timespec t;  	int ret; @@ -1237,7 +1218,6 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,  		return -EFAULT;  	return ret;  } -#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */  /*   * Allocate user-space memory for the duration of a single system call, diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e0e07fd5550..65349f07b87 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,29 +1,41 @@ +/* + * Context tracking: Probe on high level context boundaries such as kernel + * and userspace. This includes syscalls and exceptions entry/exit. + * + * This is used by RCU to remove its dependency on the timer tick while a CPU + * runs in userspace. + * + *  Started by Frederic Weisbecker: + * + * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com> + * + * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton, + * Steven Rostedt, Peter Zijlstra for suggestions and improvements. + * + */ +  #include <linux/context_tracking.h> +#include <linux/kvm_host.h>  #include <linux/rcupdate.h>  #include <linux/sched.h> -#include <linux/percpu.h>  #include <linux/hardirq.h> +#include <linux/export.h> -struct context_tracking { -	/* -	 * When active is false, hooks are not set to -	 * minimize overhead: TIF flags are cleared -	 * and calls to user_enter/exit are ignored. This -	 * may be further optimized using static keys. -	 */ -	bool active; -	enum { -		IN_KERNEL = 0, -		IN_USER, -	} state; -}; - -static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { +DEFINE_PER_CPU(struct context_tracking, context_tracking) = {  #ifdef CONFIG_CONTEXT_TRACKING_FORCE  	.active = true,  #endif  }; +/** + * user_enter - Inform the context tracking that the CPU is going to + *              enter userspace mode. 
+ * + * This function must be called right before we switch from the kernel + * to userspace, when it's guaranteed the remaining kernel instructions + * to execute won't use any RCU read side critical section because this + * function sets RCU in extended quiescent state. + */  void user_enter(void)  {  	unsigned long flags; @@ -39,40 +51,90 @@ void user_enter(void)  	if (in_interrupt())  		return; +	/* Kernel threads aren't supposed to go to userspace */  	WARN_ON_ONCE(!current->mm);  	local_irq_save(flags);  	if (__this_cpu_read(context_tracking.active) &&  	    __this_cpu_read(context_tracking.state) != IN_USER) { -		__this_cpu_write(context_tracking.state, IN_USER); +		/* +		 * At this stage, only low level arch entry code remains and +		 * then we'll run in userspace. We can assume there won't be +		 * any RCU read-side critical section until the next call to +		 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency +		 * on the tick. +		 */ +		vtime_user_enter(current);  		rcu_user_enter(); +		__this_cpu_write(context_tracking.state, IN_USER);  	}  	local_irq_restore(flags);  } + +/** + * user_exit - Inform the context tracking that the CPU is + *             exiting userspace mode and entering the kernel. + * + * This function must be called after we entered the kernel from userspace + * before any use of RCU read side critical section. This potentially include + * any high level kernel code like syscalls, exceptions, signal handling, etc... + * + * This call supports re-entrancy. This way it can be called from any exception + * handler without needing to know if we came from userspace or not. + */  void user_exit(void)  {  	unsigned long flags; -	/* -	 * Some contexts may involve an exception occuring in an irq, -	 * leading to that nesting: -	 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() -	 * This would mess up the dyntick_nesting count though. And rcu_irq_*() -	 * helpers are enough to protect RCU uses inside the exception. So -	 * just return immediately if we detect we are in an IRQ. -	 */  	if (in_interrupt())  		return;  	local_irq_save(flags);  	if (__this_cpu_read(context_tracking.state) == IN_USER) { -		__this_cpu_write(context_tracking.state, IN_KERNEL); +		/* +		 * We are going to run code that may use RCU. Inform +		 * RCU core about that (ie: we may need the tick again). +		 */  		rcu_user_exit(); +		vtime_user_exit(current); +		__this_cpu_write(context_tracking.state, IN_KERNEL);  	}  	local_irq_restore(flags);  } +void guest_enter(void) +{ +	if (vtime_accounting_enabled()) +		vtime_guest_enter(current); +	else +		__guest_enter(); +} +EXPORT_SYMBOL_GPL(guest_enter); + +void guest_exit(void) +{ +	if (vtime_accounting_enabled()) +		vtime_guest_exit(current); +	else +		__guest_exit(); +} +EXPORT_SYMBOL_GPL(guest_exit); + + +/** + * context_tracking_task_switch - context switch the syscall callbacks + * @prev: the task that is being switched out + * @next: the task that is being switched in + * + * The context tracking uses the syscall slow path to implement its user-kernel + * boundaries probes on syscalls. This way it doesn't impact the syscall fast + * path on CPUs that don't do context tracking. + * + * But we need to clear the flag on the previous task because it may later + * migrate to some CPU that doesn't do the context tracking. As such the TIF + * flag may not be desired there. 
+ */  void context_tracking_task_switch(struct task_struct *prev,  			     struct task_struct *next)  { diff --git a/kernel/cpu.c b/kernel/cpu.c index 3046a503242..b5e4ab2d427 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu)  static inline void check_for_tasks(int cpu)  {  	struct task_struct *p; +	cputime_t utime, stime;  	write_lock_irq(&tasklist_lock);  	for_each_process(p) { +		task_cputime(p, &utime, &stime);  		if (task_cpu(p) == cpu && p->state == TASK_RUNNING && -		    (p->utime || p->stime)) +		    (utime || stime))  			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "  				"(state = %ld, flags = %x)\n",  				p->comm, task_pid_nr(p), cpu, @@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param)  		return err;  	cpu_notify(CPU_DYING | param->mod, param->hcpu); +	/* Park the stopper thread */ +	kthread_park(current);  	return 0;  } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7bb63eea6eb..4f9dfe43ecb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,14 +61,6 @@  #include <linux/cgroup.h>  /* - * Workqueue for cpuset related tasks. - * - * Using kevent workqueue may cause deadlock when memory_migrate - * is set. So we create a separate workqueue thread for cpuset. - */ -static struct workqueue_struct *cpuset_wq; - -/*   * Tracks how many cpusets are currently defined in system.   * When there is only one cpuset (the root cpuset) we can   * short circuit some hooks. @@ -95,18 +87,21 @@ struct cpuset {  	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */  	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */ -	struct cpuset *parent;		/* my parent */ -  	struct fmeter fmeter;		/* memory_pressure filter */ +	/* +	 * Tasks are being attached to this cpuset.  Used to prevent +	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). 
+	 */ +	int attach_in_progress; +  	/* partition number for rebuild_sched_domains() */  	int pn;  	/* for custom sched domain */  	int relax_domain_level; -	/* used for walking a cpuset hierarchy */ -	struct list_head stack_list; +	struct work_struct hotplug_work;  };  /* Retrieve the cpuset for a cgroup */ @@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)  			    struct cpuset, css);  } +static inline struct cpuset *parent_cs(const struct cpuset *cs) +{ +	struct cgroup *pcgrp = cs->css.cgroup->parent; + +	if (pcgrp) +		return cgroup_cs(pcgrp); +	return NULL; +} +  #ifdef CONFIG_NUMA  static inline bool task_has_mempolicy(struct task_struct *task)  { @@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)  /* bits in struct cpuset flags field */  typedef enum { +	CS_ONLINE,  	CS_CPU_EXCLUSIVE,  	CS_MEM_EXCLUSIVE,  	CS_MEM_HARDWALL, @@ -147,13 +152,12 @@ typedef enum {  	CS_SPREAD_SLAB,  } cpuset_flagbits_t; -/* the type of hotplug event */ -enum hotplug_event { -	CPUSET_CPU_OFFLINE, -	CPUSET_MEM_OFFLINE, -}; -  /* convenient tests for these bits */ +static inline bool is_cpuset_online(const struct cpuset *cs) +{ +	return test_bit(CS_ONLINE, &cs->flags); +} +  static inline int is_cpu_exclusive(const struct cpuset *cs)  {  	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); @@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)  }  static struct cpuset top_cpuset = { -	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), +	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | +		  (1 << CS_MEM_EXCLUSIVE)),  }; +/** + * cpuset_for_each_child - traverse online children of a cpuset + * @child_cs: loop cursor pointing to the current child + * @pos_cgrp: used for iteration + * @parent_cs: target cpuset to walk children of + * + * Walk @child_cs through the online children of @parent_cs.  Must be used + * with RCU read locked. + */ +#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs)		\ +	cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup)	\ +		if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) + +/** + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants + * @des_cs: loop cursor pointing to the current descendant + * @pos_cgrp: used for iteration + * @root_cs: target cpuset to walk ancestor of + * + * Walk @des_cs through the online descendants of @root_cs.  Must be used + * with RCU read locked.  The caller may modify @pos_cgrp by calling + * cgroup_rightmost_descendant() to skip subtree. + */ +#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs)	\ +	cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ +		if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) +  /* - * There are two global mutexes guarding cpuset structures.  The first - * is the main control groups cgroup_mutex, accessed via - * cgroup_lock()/cgroup_unlock().  The second is the cpuset-specific - * callback_mutex, below. They can nest.  It is ok to first take - * cgroup_mutex, then nest callback_mutex.  We also require taking - * task_lock() when dereferencing a task's cpuset pointer.  See "The - * task_lock() exception", at the end of this comment. + * There are two global mutexes guarding cpuset structures - cpuset_mutex + * and callback_mutex.  The latter may nest inside the former.  We also + * require taking task_lock() when dereferencing a task's cpuset pointer. + * See "The task_lock() exception", at the end of this comment.   
* - * A task must hold both mutexes to modify cpusets.  If a task - * holds cgroup_mutex, then it blocks others wanting that mutex, - * ensuring that it is the only task able to also acquire callback_mutex - * and be able to modify cpusets.  It can perform various checks on - * the cpuset structure first, knowing nothing will change.  It can - * also allocate memory while just holding cgroup_mutex.  While it is - * performing these checks, various callback routines can briefly - * acquire callback_mutex to query cpusets.  Once it is ready to make - * the changes, it takes callback_mutex, blocking everyone else. + * A task must hold both mutexes to modify cpusets.  If a task holds + * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it + * is the only task able to also acquire callback_mutex and be able to + * modify cpusets.  It can perform various checks on the cpuset structure + * first, knowing nothing will change.  It can also allocate memory while + * just holding cpuset_mutex.  While it is performing these checks, various + * callback routines can briefly acquire callback_mutex to query cpusets. + * Once it is ready to make the changes, it takes callback_mutex, blocking + * everyone else.   *   * Calls to the kernel memory allocator can not be made while holding   * callback_mutex, as that would risk double tripping on callback_mutex @@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {   * guidelines for accessing subsystem state in kernel/cgroup.c   */ +static DEFINE_MUTEX(cpuset_mutex);  static DEFINE_MUTEX(callback_mutex);  /* @@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];  static DEFINE_SPINLOCK(cpuset_buffer_lock);  /* + * CPU / memory hotplug is handled asynchronously. + */ +static struct workqueue_struct *cpuset_propagate_hotplug_wq; + +static void cpuset_hotplug_workfn(struct work_struct *work); +static void cpuset_propagate_hotplug_workfn(struct work_struct *work); +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); + +static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); + +/*   * This is ugly, but preserves the userspace API for existing cpuset   * users. If someone tries to mount the "cpuset" filesystem, we   * silently switch it to mount "cgroup" instead @@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,  				  struct cpumask *pmask)  {  	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) -		cs = cs->parent; +		cs = parent_cs(cs);  	if (cs)  		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);  	else @@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)  {  	while (cs && !nodes_intersects(cs->mems_allowed,  					node_states[N_MEMORY])) -		cs = cs->parent; +		cs = parent_cs(cs);  	if (cs)  		nodes_and(*pmask, cs->mems_allowed,  					node_states[N_MEMORY]); @@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)  /*   * update task's spread flag if cpuset's page/slab spread flag is set   * - * Called with callback_mutex/cgroup_mutex held + * Called with callback_mutex/cpuset_mutex held   */  static void cpuset_update_task_spread_flag(struct cpuset *cs,  					struct task_struct *tsk) @@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,   *   * One cpuset is a subset of another if all its allowed CPUs and   * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set.  Call holding cgroup_mutex. 
+ * are only set if the other's are set.  Call holding cpuset_mutex.   */  static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)   * If we replaced the flag and mask values of the current cpuset   * (cur) with those values in the trial cpuset (trial), would   * our various subset and exclusive rules still be valid?  Presumes - * cgroup_mutex held. + * cpuset_mutex held.   *   * 'cur' is the address of an actual, in-use cpuset.  Operations   * such as list traversal that depend on the actual address of the @@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)  {  	struct cgroup *cont;  	struct cpuset *c, *par; +	int ret; + +	rcu_read_lock();  	/* Each of our child cpusets must be a subset of us */ -	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { -		if (!is_cpuset_subset(cgroup_cs(cont), trial)) -			return -EBUSY; -	} +	ret = -EBUSY; +	cpuset_for_each_child(c, cont, cur) +		if (!is_cpuset_subset(c, trial)) +			goto out;  	/* Remaining checks don't apply to root cpuset */ +	ret = 0;  	if (cur == &top_cpuset) -		return 0; +		goto out; -	par = cur->parent; +	par = parent_cs(cur);  	/* We must be a subset of our parent cpuset */ +	ret = -EACCES;  	if (!is_cpuset_subset(trial, par)) -		return -EACCES; +		goto out;  	/*  	 * If either I or some sibling (!= me) is exclusive, we can't  	 * overlap  	 */ -	list_for_each_entry(cont, &par->css.cgroup->children, sibling) { -		c = cgroup_cs(cont); +	ret = -EINVAL; +	cpuset_for_each_child(c, cont, par) {  		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&  		    c != cur &&  		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) -			return -EINVAL; +			goto out;  		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&  		    c != cur &&  		    nodes_intersects(trial->mems_allowed, c->mems_allowed)) -			return -EINVAL; +			goto out;  	} -	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ -	if (cgroup_task_count(cur->css.cgroup)) { -		if (cpumask_empty(trial->cpus_allowed) || -		    nodes_empty(trial->mems_allowed)) { -			return -ENOSPC; -		} -	} +	/* +	 * Cpusets with tasks - existing or newly being attached - can't +	 * have empty cpus_allowed or mems_allowed. 
+	 */ +	ret = -ENOSPC; +	if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && +	    (cpumask_empty(trial->cpus_allowed) || +	     nodes_empty(trial->mems_allowed))) +		goto out; -	return 0; +	ret = 0; +out: +	rcu_read_unlock(); +	return ret;  }  #ifdef CONFIG_SMP @@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)  	return;  } -static void -update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +static void update_domain_attr_tree(struct sched_domain_attr *dattr, +				    struct cpuset *root_cs)  { -	LIST_HEAD(q); - -	list_add(&c->stack_list, &q); -	while (!list_empty(&q)) { -		struct cpuset *cp; -		struct cgroup *cont; -		struct cpuset *child; - -		cp = list_first_entry(&q, struct cpuset, stack_list); -		list_del(q.next); +	struct cpuset *cp; +	struct cgroup *pos_cgrp; -		if (cpumask_empty(cp->cpus_allowed)) +	rcu_read_lock(); +	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { +		/* skip the whole subtree if @cp doesn't have any CPU */ +		if (cpumask_empty(cp->cpus_allowed)) { +			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);  			continue; +		}  		if (is_sched_load_balance(cp))  			update_domain_attr(dattr, cp); - -		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { -			child = cgroup_cs(cont); -			list_add_tail(&child->stack_list, &q); -		}  	} +	rcu_read_unlock();  }  /* @@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)   * domains when operating in the severe memory shortage situations   * that could cause allocation failures below.   * - * Must be called with cgroup_lock held. + * Must be called with cpuset_mutex held.   *   * The three key local variables below are:   *    q  - a linked-list queue of cpuset pointers, used to implement a @@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)  static int generate_sched_domains(cpumask_var_t **domains,  			struct sched_domain_attr **attributes)  { -	LIST_HEAD(q);		/* queue of cpusets to be scanned */  	struct cpuset *cp;	/* scans q */  	struct cpuset **csa;	/* array of all cpuset ptrs */  	int csn;		/* how many cpuset ptrs in csa so far */ @@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,  	struct sched_domain_attr *dattr;  /* attributes for custom domains */  	int ndoms = 0;		/* number of sched domains in result */  	int nslot;		/* next empty doms[] struct cpumask slot */ +	struct cgroup *pos_cgrp;  	doms = NULL;  	dattr = NULL; @@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,  		goto done;  	csn = 0; -	list_add(&top_cpuset.stack_list, &q); -	while (!list_empty(&q)) { -		struct cgroup *cont; -		struct cpuset *child;   /* scans child cpusets of cp */ - -		cp = list_first_entry(&q, struct cpuset, stack_list); -		list_del(q.next); - -		if (cpumask_empty(cp->cpus_allowed)) -			continue; - +	rcu_read_lock(); +	cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {  		/* -		 * All child cpusets contain a subset of the parent's cpus, so -		 * just skip them, and then we call update_domain_attr_tree() -		 * to calc relax_domain_level of the corresponding sched -		 * domain. +		 * Continue traversing beyond @cp iff @cp has some CPUs and +		 * isn't load balancing.  The former is obvious.  
The +		 * latter: All child cpusets contain a subset of the +		 * parent's cpus, so just skip them, and then we call +		 * update_domain_attr_tree() to calc relax_domain_level of +		 * the corresponding sched domain.  		 */ -		if (is_sched_load_balance(cp)) { -			csa[csn++] = cp; +		if (!cpumask_empty(cp->cpus_allowed) && +		    !is_sched_load_balance(cp))  			continue; -		} -		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { -			child = cgroup_cs(cont); -			list_add_tail(&child->stack_list, &q); -		} -  	} +		if (is_sched_load_balance(cp)) +			csa[csn++] = cp; + +		/* skip @cp's subtree */ +		pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); +	} +	rcu_read_unlock();  	for (i = 0; i < csn; i++)  		csa[i]->pn = i; @@ -725,25 +763,25 @@ done:  /*   * Rebuild scheduler domains.   * - * Call with neither cgroup_mutex held nor within get_online_cpus(). - * Takes both cgroup_mutex and get_online_cpus(). + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains.   * - * Cannot be directly called from cpuset code handling changes - * to the cpuset pseudo-filesystem, because it cannot be called - * from code that already holds cgroup_mutex. + * Call with cpuset_mutex held.  Takes get_online_cpus().   */ -static void do_rebuild_sched_domains(struct work_struct *unused) +static void rebuild_sched_domains_locked(void)  {  	struct sched_domain_attr *attr;  	cpumask_var_t *doms;  	int ndoms; +	lockdep_assert_held(&cpuset_mutex);  	get_online_cpus();  	/* Generate domain masks and attrs */ -	cgroup_lock();  	ndoms = generate_sched_domains(&doms, &attr); -	cgroup_unlock();  	/* Have scheduler rebuild the domains */  	partition_sched_domains(ndoms, doms, attr); @@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)  	put_online_cpus();  }  #else /* !CONFIG_SMP */ -static void do_rebuild_sched_domains(struct work_struct *unused) +static void rebuild_sched_domains_locked(void)  {  } @@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,  }  #endif /* CONFIG_SMP */ -static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); - -/* - * Rebuild scheduler domains, asynchronously via workqueue. - * - * If the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset - * which has that flag enabled, or if any cpuset with a non-empty - * 'cpus' is removed, then call this routine to rebuild the - * scheduler's dynamic sched domains. - * - * The rebuild_sched_domains() and partition_sched_domains() - * routines must nest cgroup_lock() inside get_online_cpus(), - * but such cpuset changes as these must nest that locking the - * other way, holding cgroup_lock() for much of the code. - * - * So in order to avoid an ABBA deadlock, the cpuset code handling - * these user changes delegates the actual sched domain rebuilding - * to a separate workqueue thread, which ends up processing the - * above do_rebuild_sched_domains() function. 
- */ -static void async_rebuild_sched_domains(void) -{ -	queue_work(cpuset_wq, &rebuild_sched_domains_work); -} - -/* - * Accomplishes the same scheduler domain rebuild as the above - * async_rebuild_sched_domains(), however it directly calls the - * rebuild routine synchronously rather than calling it via an - * asynchronous work thread. - * - * This can only be called from code that is not holding - * cgroup_mutex (not nested in a cgroup_lock() call.) - */  void rebuild_sched_domains(void)  { -	do_rebuild_sched_domains(NULL); +	mutex_lock(&cpuset_mutex); +	rebuild_sched_domains_locked(); +	mutex_unlock(&cpuset_mutex);  }  /** @@ -808,7 +813,7 @@ void rebuild_sched_domains(void)   * @tsk: task to test   * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner   * - * Call with cgroup_mutex held.  May take callback_mutex during call. + * Call with cpuset_mutex held.  May take callback_mutex during call.   * Called for each task in a cgroup by cgroup_scan_tasks().   * Return nonzero if this tasks's cpus_allowed mask should be changed (in other   * words, if its mask is not equal to its cpuset's mask). @@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,   * cpus_allowed mask needs to be changed.   *   * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point.   */  static void cpuset_change_cpumask(struct task_struct *tsk,  				  struct cgroup_scanner *scan) @@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,   * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed   * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()   * - * Called with cgroup_mutex held + * Called with cpuset_mutex held   *   * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,   * calling callback functions for each. @@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,  	heap_free(&heap);  	if (is_load_balanced) -		async_rebuild_sched_domains(); +		rebuild_sched_domains_locked();  	return 0;  } @@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,   *    Temporarilly set tasks mems_allowed to target nodes of migration,   *    so that the migration code can allocate pages on these nodes.   * - *    Call holding cgroup_mutex, so current's cpuset won't change + *    Call holding cpuset_mutex, so current's cpuset won't change   *    during this call, as manage_mutex holds off any cpuset_attach()   *    calls.  Therefore we don't need to take task_lock around the   *    call to guarantee_online_mems(), as we know no one is changing @@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,  /*   * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy   * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cgroup_mutex held. + * memory_migrate flag is set. Called with cpuset_mutex held.   
*/  static void cpuset_change_nodemask(struct task_struct *p,  				   struct cgroup_scanner *scan) @@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,  	struct cpuset *cs;  	int migrate;  	const nodemask_t *oldmem = scan->data; -	static nodemask_t newmems;	/* protected by cgroup_mutex */ +	static nodemask_t newmems;	/* protected by cpuset_mutex */  	cs = cgroup_cs(scan->cg);  	guarantee_online_mems(cs, &newmems); @@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;   * @oldmem: old mems_allowed of cpuset cs   * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()   * - * Called with cgroup_mutex held + * Called with cpuset_mutex held   * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0   * if @heap != NULL.   */ @@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,  	 * take while holding tasklist_lock.  Forks can happen - the  	 * mpol_dup() cpuset_being_rebound check will catch such forks,  	 * and rebind their vma mempolicies too.  Because we still hold -	 * the global cgroup_mutex, we know that no other rebind effort +	 * the global cpuset_mutex, we know that no other rebind effort  	 * will be contending for the global variable cpuset_being_rebound.  	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()  	 * is idempotent.  Also migrate pages in each mm to new nodes. @@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,   * mempolicies and if the cpuset is marked 'memory_migrate',   * migrate the tasks pages to the new memory.   * - * Call with cgroup_mutex held.  May take callback_mutex during call. + * Call with cpuset_mutex held.  May take callback_mutex during call.   * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,   * lock each such tasks mm->mmap_sem, scan its vma's and rebind   * their mempolicies to the cpusets new mems_allowed. @@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)  		cs->relax_domain_level = val;  		if (!cpumask_empty(cs->cpus_allowed) &&  		    is_sched_load_balance(cs)) -			async_rebuild_sched_domains(); +			rebuild_sched_domains_locked();  	}  	return 0; @@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)   * Called by cgroup_scan_tasks() for each task in a cgroup.   *   * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point.   */  static void cpuset_change_flag(struct task_struct *tsk,  				struct cgroup_scanner *scan) @@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,   * @cs: the cpuset in which each task's spread flags needs to be changed   * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()   * - * Called with cgroup_mutex held + * Called with cpuset_mutex held   *   * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,   * calling callback functions for each. @@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)   * cs:		the cpuset to update   * turning_on: 	whether the flag is being set or cleared   * - * Call with cgroup_mutex held. + * Call with cpuset_mutex held.   
*/  static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,  	mutex_unlock(&callback_mutex);  	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) -		async_rebuild_sched_domains(); +		rebuild_sched_domains_locked();  	if (spread_flag_changed)  		update_tasks_flags(cs, &heap); @@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)  	return val;  } -/* - * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in can_attach, and they must - * persist until attach. - */ -static cpumask_var_t cpus_attach; -static nodemask_t cpuset_attach_nodemask_from; -static nodemask_t cpuset_attach_nodemask_to; - -/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ +/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */  static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  {  	struct cpuset *cs = cgroup_cs(cgrp);  	struct task_struct *task;  	int ret; +	mutex_lock(&cpuset_mutex); + +	ret = -ENOSPC;  	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) -		return -ENOSPC; +		goto out_unlock;  	cgroup_taskset_for_each(task, cgrp, tset) {  		/* @@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  		 * set_cpus_allowed_ptr() on all attached tasks before  		 * cpus_allowed may be changed.  		 */ +		ret = -EINVAL;  		if (task->flags & PF_THREAD_BOUND) -			return -EINVAL; -		if ((ret = security_task_setscheduler(task))) -			return ret; +			goto out_unlock; +		ret = security_task_setscheduler(task); +		if (ret) +			goto out_unlock;  	} -	/* prepare for attach */ -	if (cs == &top_cpuset) -		cpumask_copy(cpus_attach, cpu_possible_mask); -	else -		guarantee_online_cpus(cs, cpus_attach); - -	guarantee_online_mems(cs, &cpuset_attach_nodemask_to); +	/* +	 * Mark attach is in progress.  This makes validate_change() fail +	 * changes which zero cpus/mems_allowed. +	 */ +	cs->attach_in_progress++; +	ret = 0; +out_unlock: +	mutex_unlock(&cpuset_mutex); +	return ret; +} -	return 0; +static void cpuset_cancel_attach(struct cgroup *cgrp, +				 struct cgroup_taskset *tset) +{ +	mutex_lock(&cpuset_mutex); +	cgroup_cs(cgrp)->attach_in_progress--; +	mutex_unlock(&cpuset_mutex);  } +/* + * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach() + * but we can't allocate it dynamically there.  Define it global and + * allocate from cpuset_init(). 
+ */ +static cpumask_var_t cpus_attach; +  static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  { +	/* static bufs protected by cpuset_mutex */ +	static nodemask_t cpuset_attach_nodemask_from; +	static nodemask_t cpuset_attach_nodemask_to;  	struct mm_struct *mm;  	struct task_struct *task;  	struct task_struct *leader = cgroup_taskset_first(tset); @@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  	struct cpuset *cs = cgroup_cs(cgrp);  	struct cpuset *oldcs = cgroup_cs(oldcgrp); +	mutex_lock(&cpuset_mutex); + +	/* prepare for attach */ +	if (cs == &top_cpuset) +		cpumask_copy(cpus_attach, cpu_possible_mask); +	else +		guarantee_online_cpus(cs, cpus_attach); + +	guarantee_online_mems(cs, &cpuset_attach_nodemask_to); +  	cgroup_taskset_for_each(task, cgrp, tset) {  		/*  		 * can_attach beforehand should guarantee that this doesn't @@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  					  &cpuset_attach_nodemask_to);  		mmput(mm);  	} + +	cs->attach_in_progress--; + +	/* +	 * We may have raced with CPU/memory hotunplug.  Trigger hotplug +	 * propagation if @cs doesn't have any CPU or memory.  It will move +	 * the newly added tasks to the nearest parent which can execute. +	 */ +	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) +		schedule_cpuset_propagate_hotplug(cs); + +	mutex_unlock(&cpuset_mutex);  }  /* The various types of files and directories in a cpuset file system */ @@ -1469,12 +1510,13 @@ typedef enum {  static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)  { -	int retval = 0;  	struct cpuset *cs = cgroup_cs(cgrp);  	cpuset_filetype_t type = cft->private; +	int retval = -ENODEV; -	if (!cgroup_lock_live_group(cgrp)) -		return -ENODEV; +	mutex_lock(&cpuset_mutex); +	if (!is_cpuset_online(cs)) +		goto out_unlock;  	switch (type) {  	case FILE_CPU_EXCLUSIVE: @@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)  		retval = -EINVAL;  		break;  	} -	cgroup_unlock(); +out_unlock: +	mutex_unlock(&cpuset_mutex);  	return retval;  }  static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)  { -	int retval = 0;  	struct cpuset *cs = cgroup_cs(cgrp);  	cpuset_filetype_t type = cft->private; +	int retval = -ENODEV; -	if (!cgroup_lock_live_group(cgrp)) -		return -ENODEV; +	mutex_lock(&cpuset_mutex); +	if (!is_cpuset_online(cs)) +		goto out_unlock;  	switch (type) {  	case FILE_SCHED_RELAX_DOMAIN_LEVEL: @@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)  		retval = -EINVAL;  		break;  	} -	cgroup_unlock(); +out_unlock: +	mutex_unlock(&cpuset_mutex);  	return retval;  } @@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)  static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,  				const char *buf)  { -	int retval = 0;  	struct cpuset *cs = cgroup_cs(cgrp);  	struct cpuset *trialcs; +	int retval = -ENODEV; -	if (!cgroup_lock_live_group(cgrp)) -		return -ENODEV; +	/* +	 * CPU or memory hotunplug may leave @cs w/o any execution +	 * resources, in which case the hotplug code asynchronously updates +	 * configuration and transfers all tasks to the nearest ancestor +	 * which can execute. 
+	 * +	 * As writes to "cpus" or "mems" may restore @cs's execution +	 * resources, wait for the previously scheduled operations before +	 * proceeding, so that we don't end up keep removing tasks added +	 * after execution capability is restored. +	 * +	 * Flushing cpuset_hotplug_work is enough to synchronize against +	 * hotplug hanlding; however, cpuset_attach() may schedule +	 * propagation work directly.  Flush the workqueue too. +	 */ +	flush_work(&cpuset_hotplug_work); +	flush_workqueue(cpuset_propagate_hotplug_wq); + +	mutex_lock(&cpuset_mutex); +	if (!is_cpuset_online(cs)) +		goto out_unlock;  	trialcs = alloc_trial_cpuset(cs);  	if (!trialcs) {  		retval = -ENOMEM; -		goto out; +		goto out_unlock;  	}  	switch (cft->private) { @@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,  	}  	free_trial_cpuset(trialcs); -out: -	cgroup_unlock(); +out_unlock: +	mutex_unlock(&cpuset_mutex);  	return retval;  } @@ -1790,15 +1854,12 @@ static struct cftype files[] = {  static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)  { -	struct cgroup *parent_cg = cont->parent; -	struct cgroup *tmp_cg; -	struct cpuset *parent, *cs; +	struct cpuset *cs; -	if (!parent_cg) +	if (!cont->parent)  		return &top_cpuset.css; -	parent = cgroup_cs(parent_cg); -	cs = kmalloc(sizeof(*cs), GFP_KERNEL); +	cs = kzalloc(sizeof(*cs), GFP_KERNEL);  	if (!cs)  		return ERR_PTR(-ENOMEM);  	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { @@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)  		return ERR_PTR(-ENOMEM);  	} -	cs->flags = 0; -	if (is_spread_page(parent)) -		set_bit(CS_SPREAD_PAGE, &cs->flags); -	if (is_spread_slab(parent)) -		set_bit(CS_SPREAD_SLAB, &cs->flags);  	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);  	cpumask_clear(cs->cpus_allowed);  	nodes_clear(cs->mems_allowed);  	fmeter_init(&cs->fmeter); +	INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);  	cs->relax_domain_level = -1; -	cs->parent = parent; +	return &cs->css; +} + +static int cpuset_css_online(struct cgroup *cgrp) +{ +	struct cpuset *cs = cgroup_cs(cgrp); +	struct cpuset *parent = parent_cs(cs); +	struct cpuset *tmp_cs; +	struct cgroup *pos_cg; + +	if (!parent) +		return 0; + +	mutex_lock(&cpuset_mutex); + +	set_bit(CS_ONLINE, &cs->flags); +	if (is_spread_page(parent)) +		set_bit(CS_SPREAD_PAGE, &cs->flags); +	if (is_spread_slab(parent)) +		set_bit(CS_SPREAD_SLAB, &cs->flags); +  	number_of_cpusets++; -	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) -		goto skip_clone; +	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) +		goto out_unlock;  	/*  	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is @@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)  	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive  	 * (and likewise for mems) to the new cgroup.  	 
*/ -	list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { -		struct cpuset *tmp_cs = cgroup_cs(tmp_cg); - -		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) -			goto skip_clone; +	rcu_read_lock(); +	cpuset_for_each_child(tmp_cs, pos_cg, parent) { +		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { +			rcu_read_unlock(); +			goto out_unlock; +		}  	} +	rcu_read_unlock();  	mutex_lock(&callback_mutex);  	cs->mems_allowed = parent->mems_allowed;  	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);  	mutex_unlock(&callback_mutex); -skip_clone: -	return &cs->css; +out_unlock: +	mutex_unlock(&cpuset_mutex); +	return 0; +} + +static void cpuset_css_offline(struct cgroup *cgrp) +{ +	struct cpuset *cs = cgroup_cs(cgrp); + +	mutex_lock(&cpuset_mutex); + +	if (is_sched_load_balance(cs)) +		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + +	number_of_cpusets--; +	clear_bit(CS_ONLINE, &cs->flags); + +	mutex_unlock(&cpuset_mutex);  }  /*   * If the cpuset being removed has its flag 'sched_load_balance'   * enabled, then simulate turning sched_load_balance off, which - * will call async_rebuild_sched_domains(). + * will call rebuild_sched_domains_locked().   */  static void cpuset_css_free(struct cgroup *cont)  {  	struct cpuset *cs = cgroup_cs(cont); -	if (is_sched_load_balance(cs)) -		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - -	number_of_cpusets--;  	free_cpumask_var(cs->cpus_allowed);  	kfree(cs);  } @@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont)  struct cgroup_subsys cpuset_subsys = {  	.name = "cpuset",  	.css_alloc = cpuset_css_alloc, +	.css_online = cpuset_css_online, +	.css_offline = cpuset_css_offline,  	.css_free = cpuset_css_free,  	.can_attach = cpuset_can_attach, +	.cancel_attach = cpuset_cancel_attach,  	.attach = cpuset_attach,  	.subsys_id = cpuset_subsys_id,  	.base_cftypes = files, @@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,  {  	struct cgroup *new_cgroup = scan->data; +	cgroup_lock();  	cgroup_attach_task(new_cgroup, tsk); +	cgroup_unlock();  }  /** @@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,   * @from: cpuset in which the tasks currently reside   * @to: cpuset to which the tasks will be moved   * - * Called with cgroup_mutex held + * Called with cpuset_mutex held   * callback_mutex must not be held, as cpuset_attach() will take it.   *   * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, @@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)   * removing that CPU or node from all cpusets.  If this removes the   * last CPU or node from a cpuset, then move the tasks in the empty   * cpuset to its next-highest non-empty parent. - * - * Called with cgroup_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it.   */  static void remove_tasks_in_empty_cpuset(struct cpuset *cs)  {  	struct cpuset *parent;  	/* -	 * The cgroup's css_sets list is in use if there are tasks -	 * in the cpuset; the list is empty if there are none; -	 * the cs->css.refcnt seems always 0. -	 */ -	if (list_empty(&cs->css.cgroup->css_sets)) -		return; - -	/*  	 * Find its next-highest non-empty parent, (top cpuset  	 * has online cpus, so can't be empty).  	 
*/ -	parent = cs->parent; +	parent = parent_cs(cs);  	while (cpumask_empty(parent->cpus_allowed) ||  			nodes_empty(parent->mems_allowed)) -		parent = parent->parent; +		parent = parent_cs(parent);  	move_member_tasks_to_cpuset(cs, parent);  } -/* - * Helper function to traverse cpusets. - * It can be used to walk the cpuset tree from top to bottom, completing - * one layer before dropping down to the next (thus always processing a - * node before any of its children). +/** + * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset + * @cs: cpuset in interest + * + * Compare @cs's cpu and mem masks against top_cpuset and if some have gone + * offline, update @cs accordingly.  If @cs ends up with no CPU or memory, + * all its tasks are moved to the nearest ancestor with both resources.   */ -static struct cpuset *cpuset_next(struct list_head *queue) +static void cpuset_propagate_hotplug_workfn(struct work_struct *work)  { -	struct cpuset *cp; -	struct cpuset *child;	/* scans child cpusets of cp */ -	struct cgroup *cont; +	static cpumask_t off_cpus; +	static nodemask_t off_mems, tmp_mems; +	struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); +	bool is_empty; -	if (list_empty(queue)) -		return NULL; +	mutex_lock(&cpuset_mutex); + +	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); +	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); -	cp = list_first_entry(queue, struct cpuset, stack_list); -	list_del(queue->next); -	list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { -		child = cgroup_cs(cont); -		list_add_tail(&child->stack_list, queue); +	/* remove offline cpus from @cs */ +	if (!cpumask_empty(&off_cpus)) { +		mutex_lock(&callback_mutex); +		cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); +		mutex_unlock(&callback_mutex); +		update_tasks_cpumask(cs, NULL); +	} + +	/* remove offline mems from @cs */ +	if (!nodes_empty(off_mems)) { +		tmp_mems = cs->mems_allowed; +		mutex_lock(&callback_mutex); +		nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); +		mutex_unlock(&callback_mutex); +		update_tasks_nodemask(cs, &tmp_mems, NULL);  	} -	return cp; +	is_empty = cpumask_empty(cs->cpus_allowed) || +		nodes_empty(cs->mems_allowed); + +	mutex_unlock(&cpuset_mutex); + +	/* +	 * If @cs became empty, move tasks to the nearest ancestor with +	 * execution resources.  This is full cgroup operation which will +	 * also call back into cpuset.  Should be done outside any lock. +	 */ +	if (is_empty) +		remove_tasks_in_empty_cpuset(cs); + +	/* the following may free @cs, should be the last operation */ +	css_put(&cs->css);  } +/** + * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset + * @cs: cpuset of interest + * + * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and + * memory masks according to top_cpuset. + */ +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) +{ +	/* +	 * Pin @cs.  The refcnt will be released when the work item +	 * finishes executing. +	 */ +	if (!css_tryget(&cs->css)) +		return; -/* - * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory - * online/offline) and update the cpusets accordingly. - * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such - * cpuset must be moved to a parent cpuset. +	/* +	 * Queue @cs->hotplug_work.  If already pending, lose the css ref. +	 * cpuset_propagate_hotplug_wq is ordered and propagation will +	 * happen in the order this function is called. 
+	 */ +	if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) +		css_put(&cs->css); +} + +/** + * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset   * - * Called with cgroup_mutex held.  We take callback_mutex to modify - * cpus_allowed and mems_allowed. + * This function is called after either CPU or memory configuration has + * changed and updates cpuset accordingly.  The top_cpuset is always + * synchronized to cpu_active_mask and N_MEMORY, which is necessary in + * order to make cpusets transparent (of no affect) on systems that are + * actively using CPU hotplug but making no active use of cpusets.   * - * This walk processes the tree from top to bottom, completing one layer - * before dropping down to the next.  It always processes a node before - * any of its children. + * Non-root cpusets are only affected by offlining.  If any CPUs or memory + * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all + * descendants.   * - * In the case of memory hot-unplug, it will remove nodes from N_MEMORY - * if all present pages from a node are offlined. + * Note that CPU offlining during suspend is ignored.  We don't modify + * cpusets across suspend/resume cycles at all.   */ -static void -scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) +static void cpuset_hotplug_workfn(struct work_struct *work)  { -	LIST_HEAD(queue); -	struct cpuset *cp;		/* scans cpusets being updated */ -	static nodemask_t oldmems;	/* protected by cgroup_mutex */ +	static cpumask_t new_cpus, tmp_cpus; +	static nodemask_t new_mems, tmp_mems; +	bool cpus_updated, mems_updated; +	bool cpus_offlined, mems_offlined; -	list_add_tail((struct list_head *)&root->stack_list, &queue); +	mutex_lock(&cpuset_mutex); -	switch (event) { -	case CPUSET_CPU_OFFLINE: -		while ((cp = cpuset_next(&queue)) != NULL) { +	/* fetch the available cpus/mems and find out which changed how */ +	cpumask_copy(&new_cpus, cpu_active_mask); +	new_mems = node_states[N_MEMORY]; -			/* Continue past cpusets with all cpus online */ -			if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) -				continue; +	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); +	cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, +				       &new_cpus); -			/* Remove offline cpus from this cpuset. 
*/ -			mutex_lock(&callback_mutex); -			cpumask_and(cp->cpus_allowed, cp->cpus_allowed, -							cpu_active_mask); -			mutex_unlock(&callback_mutex); +	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); +	nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); +	mems_offlined = !nodes_empty(tmp_mems); -			/* Move tasks from the empty cpuset to a parent */ -			if (cpumask_empty(cp->cpus_allowed)) -				remove_tasks_in_empty_cpuset(cp); -			else -				update_tasks_cpumask(cp, NULL); -		} -		break; +	/* synchronize cpus_allowed to cpu_active_mask */ +	if (cpus_updated) { +		mutex_lock(&callback_mutex); +		cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); +		mutex_unlock(&callback_mutex); +		/* we don't mess with cpumasks of tasks in top_cpuset */ +	} -	case CPUSET_MEM_OFFLINE: -		while ((cp = cpuset_next(&queue)) != NULL) { +	/* synchronize mems_allowed to N_MEMORY */ +	if (mems_updated) { +		tmp_mems = top_cpuset.mems_allowed; +		mutex_lock(&callback_mutex); +		top_cpuset.mems_allowed = new_mems; +		mutex_unlock(&callback_mutex); +		update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); +	} -			/* Continue past cpusets with all mems online */ -			if (nodes_subset(cp->mems_allowed, -					node_states[N_MEMORY])) -				continue; +	/* if cpus or mems went down, we need to propagate to descendants */ +	if (cpus_offlined || mems_offlined) { +		struct cpuset *cs; +		struct cgroup *pos_cgrp; -			oldmems = cp->mems_allowed; +		rcu_read_lock(); +		cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) +			schedule_cpuset_propagate_hotplug(cs); +		rcu_read_unlock(); +	} -			/* Remove offline mems from this cpuset. */ -			mutex_lock(&callback_mutex); -			nodes_and(cp->mems_allowed, cp->mems_allowed, -						node_states[N_MEMORY]); -			mutex_unlock(&callback_mutex); +	mutex_unlock(&cpuset_mutex); -			/* Move tasks from the empty cpuset to a parent */ -			if (nodes_empty(cp->mems_allowed)) -				remove_tasks_in_empty_cpuset(cp); -			else -				update_tasks_nodemask(cp, &oldmems, NULL); -		} +	/* wait for propagations to finish */ +	flush_workqueue(cpuset_propagate_hotplug_wq); + +	/* rebuild sched domains if cpus_allowed has changed */ +	if (cpus_updated) { +		struct sched_domain_attr *attr; +		cpumask_var_t *doms; +		int ndoms; + +		mutex_lock(&cpuset_mutex); +		ndoms = generate_sched_domains(&doms, &attr); +		mutex_unlock(&cpuset_mutex); + +		partition_sched_domains(ndoms, doms, attr);  	}  } -/* - * The top_cpuset tracks what CPUs and Memory Nodes are online, - * period.  This is necessary in order to make cpusets transparent - * (of no affect) on systems that are actively using CPU hotplug - * but making no active use of cpusets. - * - * The only exception to this is suspend/resume, where we don't - * modify cpusets at all. - * - * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_active_mask on each CPU hotplug (cpuhp) event. - * - * Called within get_online_cpus().  Needs to call cgroup_lock() - * before calling generate_sched_domains(). - * - * @cpu_online: Indicates whether this is a CPU online event (true) or - * a CPU offline event (false). 
- */  void cpuset_update_active_cpus(bool cpu_online)  { -	struct sched_domain_attr *attr; -	cpumask_var_t *doms; -	int ndoms; - -	cgroup_lock(); -	mutex_lock(&callback_mutex); -	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); -	mutex_unlock(&callback_mutex); - -	if (!cpu_online) -		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); - -	ndoms = generate_sched_domains(&doms, &attr); -	cgroup_unlock(); - -	/* Have scheduler rebuild the domains */ -	partition_sched_domains(ndoms, doms, attr); +	/* +	 * We're inside cpu hotplug critical region which usually nests +	 * inside cgroup synchronization.  Bounce actual hotplug processing +	 * to a work item to avoid reverse locking order. +	 * +	 * We still need to do partition_sched_domains() synchronously; +	 * otherwise, the scheduler will get confused and put tasks to the +	 * dead CPU.  Fall back to the default single domain. +	 * cpuset_hotplug_workfn() will rebuild it as necessary. +	 */ +	partition_sched_domains(1, NULL, NULL); +	schedule_work(&cpuset_hotplug_work);  }  #ifdef CONFIG_MEMORY_HOTPLUG @@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online)  static int cpuset_track_online_nodes(struct notifier_block *self,  				unsigned long action, void *arg)  { -	static nodemask_t oldmems;	/* protected by cgroup_mutex */ - -	cgroup_lock(); -	switch (action) { -	case MEM_ONLINE: -		oldmems = top_cpuset.mems_allowed; -		mutex_lock(&callback_mutex); -		top_cpuset.mems_allowed = node_states[N_MEMORY]; -		mutex_unlock(&callback_mutex); -		update_tasks_nodemask(&top_cpuset, &oldmems, NULL); -		break; -	case MEM_OFFLINE: -		/* -		 * needn't update top_cpuset.mems_allowed explicitly because -		 * scan_cpusets_upon_hotplug() will update it. -		 */ -		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); -		break; -	default: -		break; -	} -	cgroup_unlock(); - +	schedule_work(&cpuset_hotplug_work);  	return NOTIFY_OK;  }  #endif @@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void)  	hotplug_memory_notifier(cpuset_track_online_nodes, 10); -	cpuset_wq = create_singlethread_workqueue("cpuset"); -	BUG_ON(!cpuset_wq); +	cpuset_propagate_hotplug_wq = +		alloc_ordered_workqueue("cpuset_hotplug", 0); +	BUG_ON(!cpuset_propagate_hotplug_wq);  }  /** @@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)   */  static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)  { -	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) -		cs = cs->parent; +	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) +		cs = parent_cs(cs);  	return cs;  } @@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)  }  /** - * cpuset_unlock - release lock on cpuset changes - * - * Undo the lock taken in a previous cpuset_lock() call. - */ - -void cpuset_unlock(void) -{ -	mutex_unlock(&callback_mutex); -} - -/**   * cpuset_mem_spread_node() - On which node to begin search for a file page   * cpuset_slab_spread_node() - On which node to begin search for a slab page   * @@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)  	dentry = task_cs(tsk)->css.cgroup->dentry;  	spin_lock(&cpuset_buffer_lock); -	snprintf(cpuset_name, CPUSET_NAME_LEN, -		 dentry ? 
(const char *)dentry->d_name.name : "/"); + +	if (!dentry) { +		strcpy(cpuset_name, "/"); +	} else { +		spin_lock(&dentry->d_lock); +		strlcpy(cpuset_name, (const char *)dentry->d_name.name, +			CPUSET_NAME_LEN); +		spin_unlock(&dentry->d_lock); +	} +  	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,  			   tsk->mems_allowed);  	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", @@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void)   *  - Used for /proc/<pid>/cpuset.   *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it   *    doesn't really matter if tsk->cpuset changes after we read it, - *    and we take cgroup_mutex, keeping cpuset_attach() from changing it + *    and we take cpuset_mutex, keeping cpuset_attach() from changing it   *    anyway.   */  static int proc_cpuset_show(struct seq_file *m, void *unused_v) @@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)  	if (!tsk)  		goto out_free; -	retval = -EINVAL; -	cgroup_lock(); +	rcu_read_lock();  	css = task_subsys_state(tsk, cpuset_subsys_id);  	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); +	rcu_read_unlock();  	if (retval < 0) -		goto out_unlock; +		goto out_put_task;  	seq_puts(m, buf);  	seq_putc(m, '\n'); -out_unlock: -	cgroup_unlock(); +out_put_task:  	put_task_struct(tsk);  out_free:  	kfree(buf); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 9a61738cefc..c26278fd485 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -29,6 +29,7 @@   */  #include <linux/pid_namespace.h>  #include <linux/clocksource.h> +#include <linux/serial_core.h>  #include <linux/interrupt.h>  #include <linux/spinlock.h>  #include <linux/console.h> diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index ce615e06448..38573f35a5a 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -31,6 +31,7 @@  #include <linux/kernel.h>  #include <linux/kgdb.h>  #include <linux/kdb.h> +#include <linux/serial_core.h>  #include <linux/reboot.h>  #include <linux/uaccess.h>  #include <asm/cacheflush.h> diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4d5f8d5612f..8875254120b 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1970,6 +1970,8 @@ static int kdb_lsmod(int argc, const char **argv)  	kdb_printf("Module                  Size  modstruct     Used by\n");  	list_for_each_entry(mod, kdb_modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		kdb_printf("%-20s%8u  0x%p ", mod->name,  			   mod->core_size, (void *)mod); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 418b3f7053a..d473988c1d0 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)  	unsigned long long t2, t3;  	unsigned long flags;  	struct timespec ts; +	cputime_t utime, stime, stimescaled, utimescaled;  	/* Though tsk->delays accessed later, early exit avoids  	 * unnecessary returning of other data @@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)  		goto done;  	tmp = (s64)d->cpu_run_real_total; -	cputime_to_timespec(tsk->utime + tsk->stime, &ts); +	task_cputime(tsk, &utime, &stime); +	cputime_to_timespec(utime + stime, &ts);  	tmp += timespec_to_ns(&ts);  	d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 
0 : tmp;  	tmp = (s64)d->cpu_scaled_run_real_total; -	cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); +	task_cputime_scaled(tsk, &utimescaled, &stimescaled); +	cputime_to_timespec(utimescaled + stimescaled, &ts);  	tmp += timespec_to_ns(&ts);  	d->cpu_scaled_run_real_total =  		(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; diff --git a/kernel/events/core.c b/kernel/events/core.c index 3b106554b42..ccc457e3635 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -908,6 +908,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)  }  /* + * Initialize event state based on the perf_event_attr::disabled. + */ +static inline void perf_event__state_init(struct perf_event *event) +{ +	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : +					      PERF_EVENT_STATE_INACTIVE; +} + +/*   * Called at perf_event creation and when events are attached/detached from a   * group.   */ @@ -6162,11 +6171,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  	if (task) {  		event->attach_state = PERF_ATTACH_TASK; + +		if (attr->type == PERF_TYPE_TRACEPOINT) +			event->hw.tp_target = task;  #ifdef CONFIG_HAVE_HW_BREAKPOINT  		/*  		 * hw_breakpoint is a bit difficult here..  		 */ -		if (attr->type == PERF_TYPE_BREAKPOINT) +		else if (attr->type == PERF_TYPE_BREAKPOINT)  			event->hw.bp_target = task;  #endif  	} @@ -6179,8 +6191,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  	event->overflow_handler	= overflow_handler;  	event->overflow_handler_context = context; -	if (attr->disabled) -		event->state = PERF_EVENT_STATE_OFF; +	perf_event__state_init(event);  	pmu = NULL; @@ -6609,9 +6620,17 @@ SYSCALL_DEFINE5(perf_event_open,  		mutex_lock(&gctx->mutex);  		perf_remove_from_context(group_leader); + +		/* +		 * Removing from the context ends up with disabled +		 * event. What we want here is event in the initial +		 * startup state, ready to be add into new context. +		 */ +		perf_event__state_init(group_leader);  		list_for_each_entry(sibling, &group_leader->sibling_list,  				    group_entry) {  			perf_remove_from_context(sibling); +			perf_event__state_init(sibling);  			put_ctx(gctx);  		}  		mutex_unlock(&gctx->mutex); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index fe8a916507e..a64f8aeb5c1 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void)   err_alloc:  	for_each_possible_cpu(err_cpu) {  		for (i = 0; i < TYPE_MAX; i++) -			kfree(per_cpu(nr_task_bp_pinned[i], cpu)); +			kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));  		if (err_cpu == cpu)  			break;  	} diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index dea7acfbb07..a567c8c7ef3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -27,6 +27,7 @@  #include <linux/pagemap.h>	/* read_mapping_page */  #include <linux/slab.h>  #include <linux/sched.h> +#include <linux/export.h>  #include <linux/rmap.h>		/* anon_vma_prepare */  #include <linux/mmu_notifier.h>	/* set_pte_at_notify */  #include <linux/swap.h>		/* try_to_free_swap */ @@ -41,58 +42,31 @@  #define MAX_UPROBE_XOL_SLOTS		UINSNS_PER_PAGE  static struct rb_root uprobes_tree = RB_ROOT; - -static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */ - -#define UPROBES_HASH_SZ	13 -  /* - * We need separate register/unregister and mmap/munmap lock hashes because - * of mmap_sem nesting. 
- * - * uprobe_register() needs to install probes on (potentially) all processes - * and thus needs to acquire multiple mmap_sems (consequtively, not - * concurrently), whereas uprobe_mmap() is called while holding mmap_sem - * for the particular process doing the mmap. - * - * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem - * because of lock order against i_mmap_mutex. This means there's a hole in - * the register vma iteration where a mmap() can happen. - * - * Thus uprobe_register() can race with uprobe_mmap() and we can try and - * install a probe where one is already installed. + * allows us to skip the uprobe_mmap if there are no uprobe events active + * at this time.  Probably a fine grained per inode count is better?   */ +#define no_uprobe_events()	RB_EMPTY_ROOT(&uprobes_tree) -/* serialize (un)register */ -static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; - -#define uprobes_hash(v)		(&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) +static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */ +#define UPROBES_HASH_SZ	13  /* serialize uprobe->pending_list */  static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];  #define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])  static struct percpu_rw_semaphore dup_mmap_sem; -/* - * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe - * events active at this time.  Probably a fine grained per inode count is - * better? - */ -static atomic_t uprobe_events = ATOMIC_INIT(0); -  /* Have a copy of original instruction */  #define UPROBE_COPY_INSN	0 -/* Dont run handlers when first register/ last unregister in progress*/ -#define UPROBE_RUN_HANDLER	1  /* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP	2 +#define UPROBE_SKIP_SSTEP	1  struct uprobe {  	struct rb_node		rb_node;	/* node in the rb tree */  	atomic_t		ref; +	struct rw_semaphore	register_rwsem;  	struct rw_semaphore	consumer_rwsem; -	struct mutex		copy_mutex;	/* TODO: kill me and UPROBE_COPY_INSN */  	struct list_head	pending_list;  	struct uprobe_consumer	*consumers;  	struct inode		*inode;		/* Also hold a ref to inode */ @@ -430,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)  	u = __insert_uprobe(uprobe);  	spin_unlock(&uprobes_treelock); -	/* For now assume that the instruction need not be single-stepped */ -	__set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); -  	return u;  } @@ -452,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)  	uprobe->inode = igrab(inode);  	uprobe->offset = offset; +	init_rwsem(&uprobe->register_rwsem);  	init_rwsem(&uprobe->consumer_rwsem); -	mutex_init(&uprobe->copy_mutex); +	/* For now assume that the instruction need not be single-stepped */ +	__set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);  	/* add to uprobes_tree, sorted on inode:offset */  	cur_uprobe = insert_uprobe(uprobe); @@ -463,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)  		kfree(uprobe);  		uprobe = cur_uprobe;  		iput(inode); -	} else { -		atomic_inc(&uprobe_events);  	}  	return uprobe;  } -static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) -{ -	struct uprobe_consumer *uc; - -	if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) -		return; - -	down_read(&uprobe->consumer_rwsem); -	for (uc = uprobe->consumers; uc; uc = uc->next) { -		if (!uc->filter || uc->filter(uc, current)) -			uc->handler(uc, regs); -	} -	up_read(&uprobe->consumer_rwsem); -} - -/* Returns the previous 
consumer */ -static struct uprobe_consumer * -consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)  {  	down_write(&uprobe->consumer_rwsem);  	uc->next = uprobe->consumers;  	uprobe->consumers = uc;  	up_write(&uprobe->consumer_rwsem); - -	return uc->next;  }  /* @@ -588,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,  	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))  		return ret; -	mutex_lock(&uprobe->copy_mutex); +	/* TODO: move this into _register, until then we abuse this sem. */ +	down_write(&uprobe->consumer_rwsem);  	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))  		goto out; @@ -612,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,  	set_bit(UPROBE_COPY_INSN, &uprobe->flags);   out: -	mutex_unlock(&uprobe->copy_mutex); +	up_write(&uprobe->consumer_rwsem); + +	return ret; +} + +static inline bool consumer_filter(struct uprobe_consumer *uc, +				   enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ +	return !uc->filter || uc->filter(uc, ctx, mm); +} + +static bool filter_chain(struct uprobe *uprobe, +			 enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ +	struct uprobe_consumer *uc; +	bool ret = false; + +	down_read(&uprobe->consumer_rwsem); +	for (uc = uprobe->consumers; uc; uc = uc->next) { +		ret = consumer_filter(uc, ctx, mm); +		if (ret) +			break; +	} +	up_read(&uprobe->consumer_rwsem);  	return ret;  } @@ -624,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,  	bool first_uprobe;  	int ret; -	/* -	 * If probe is being deleted, unregister thread could be done with -	 * the vma-rmap-walk through. Adding a probe now can be fatal since -	 * nobody will be able to cleanup. Also we could be from fork or -	 * mremap path, where the probe might have already been inserted. -	 * Hence behave as if probe already existed. -	 */ -	if (!uprobe->consumers) -		return 0; -  	ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);  	if (ret)  		return ret; @@ -658,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,  static int  remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)  { -	/* can happen if uprobe_register() fails */ -	if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) -		return 0; -  	set_bit(MMF_RECALC_UPROBES, &mm->flags);  	return set_orig_insn(&uprobe->arch, mm, vaddr);  } +static inline bool uprobe_is_active(struct uprobe *uprobe) +{ +	return !RB_EMPTY_NODE(&uprobe->rb_node); +}  /*   * There could be threads that have already hit the breakpoint. They   * will recheck the current insn and restart if find_uprobe() fails. 
@@ -673,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad   */  static void delete_uprobe(struct uprobe *uprobe)  { +	if (WARN_ON(!uprobe_is_active(uprobe))) +		return; +  	spin_lock(&uprobes_treelock);  	rb_erase(&uprobe->rb_node, &uprobes_tree);  	spin_unlock(&uprobes_treelock); +	RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */  	iput(uprobe->inode);  	put_uprobe(uprobe); -	atomic_dec(&uprobe_events);  }  struct map_info { @@ -764,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)  	return curr;  } -static int register_for_each_vma(struct uprobe *uprobe, bool is_register) +static int +register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)  { +	bool is_register = !!new;  	struct map_info *info;  	int err = 0; @@ -794,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)  		    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)  			goto unlock; -		if (is_register) -			err = install_breakpoint(uprobe, mm, vma, info->vaddr); -		else -			err |= remove_breakpoint(uprobe, mm, info->vaddr); +		if (is_register) { +			/* consult only the "caller", new consumer. */ +			if (consumer_filter(new, +					UPROBE_FILTER_REGISTER, mm)) +				err = install_breakpoint(uprobe, mm, vma, info->vaddr); +		} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { +			if (!filter_chain(uprobe, +					UPROBE_FILTER_UNREGISTER, mm)) +				err |= remove_breakpoint(uprobe, mm, info->vaddr); +		}   unlock:  		up_write(&mm->mmap_sem); @@ -810,17 +787,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)  	return err;  } -static int __uprobe_register(struct uprobe *uprobe) +static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)  { -	return register_for_each_vma(uprobe, true); +	consumer_add(uprobe, uc); +	return register_for_each_vma(uprobe, uc);  } -static void __uprobe_unregister(struct uprobe *uprobe) +static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)  { -	if (!register_for_each_vma(uprobe, false)) -		delete_uprobe(uprobe); +	int err; + +	if (!consumer_del(uprobe, uc))	/* WARN? */ +		return; +	err = register_for_each_vma(uprobe, NULL);  	/* TODO : cant unregister? schedule a worker thread */ +	if (!uprobe->consumers && !err) +		delete_uprobe(uprobe);  }  /* @@ -845,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *  	struct uprobe *uprobe;  	int ret; -	if (!inode || !uc || uc->next) -		return -EINVAL; - +	/* Racy, just to catch the obvious mistakes */  	if (offset > i_size_read(inode))  		return -EINVAL; -	ret = 0; -	mutex_lock(uprobes_hash(inode)); + retry:  	uprobe = alloc_uprobe(inode, offset); - -	if (!uprobe) { -		ret = -ENOMEM; -	} else if (!consumer_add(uprobe, uc)) { -		ret = __uprobe_register(uprobe); -		if (ret) { -			uprobe->consumers = NULL; -			__uprobe_unregister(uprobe); -		} else { -			set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); -		} +	if (!uprobe) +		return -ENOMEM; +	/* +	 * We can race with uprobe_unregister()->delete_uprobe(). +	 * Check uprobe_is_active() and retry if it is false. 
+	 */ +	down_write(&uprobe->register_rwsem); +	ret = -EAGAIN; +	if (likely(uprobe_is_active(uprobe))) { +		ret = __uprobe_register(uprobe, uc); +		if (ret) +			__uprobe_unregister(uprobe, uc);  	} +	up_write(&uprobe->register_rwsem); +	put_uprobe(uprobe); -	mutex_unlock(uprobes_hash(inode)); -	if (uprobe) -		put_uprobe(uprobe); +	if (unlikely(ret == -EAGAIN)) +		goto retry; +	return ret; +} +EXPORT_SYMBOL_GPL(uprobe_register); + +/* + * uprobe_apply - unregister a already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. + * @uc: consumer which wants to add more or remove some breakpoints + * @add: add or remove the breakpoints + */ +int uprobe_apply(struct inode *inode, loff_t offset, +			struct uprobe_consumer *uc, bool add) +{ +	struct uprobe *uprobe; +	struct uprobe_consumer *con; +	int ret = -ENOENT; + +	uprobe = find_uprobe(inode, offset); +	if (!uprobe) +		return ret; + +	down_write(&uprobe->register_rwsem); +	for (con = uprobe->consumers; con && con != uc ; con = con->next) +		; +	if (con) +		ret = register_for_each_vma(uprobe, add ? uc : NULL); +	up_write(&uprobe->register_rwsem); +	put_uprobe(uprobe);  	return ret;  } @@ -884,25 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume  {  	struct uprobe *uprobe; -	if (!inode || !uc) -		return; -  	uprobe = find_uprobe(inode, offset);  	if (!uprobe)  		return; -	mutex_lock(uprobes_hash(inode)); +	down_write(&uprobe->register_rwsem); +	__uprobe_unregister(uprobe, uc); +	up_write(&uprobe->register_rwsem); +	put_uprobe(uprobe); +} +EXPORT_SYMBOL_GPL(uprobe_unregister); -	if (consumer_del(uprobe, uc)) { -		if (!uprobe->consumers) { -			__uprobe_unregister(uprobe); -			clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); -		} +static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) +{ +	struct vm_area_struct *vma; +	int err = 0; + +	down_read(&mm->mmap_sem); +	for (vma = mm->mmap; vma; vma = vma->vm_next) { +		unsigned long vaddr; +		loff_t offset; + +		if (!valid_vma(vma, false) || +		    vma->vm_file->f_mapping->host != uprobe->inode) +			continue; + +		offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; +		if (uprobe->offset <  offset || +		    uprobe->offset >= offset + vma->vm_end - vma->vm_start) +			continue; + +		vaddr = offset_to_vaddr(vma, uprobe->offset); +		err |= remove_breakpoint(uprobe, mm, vaddr);  	} +	up_read(&mm->mmap_sem); -	mutex_unlock(uprobes_hash(inode)); -	if (uprobe) -		put_uprobe(uprobe); +	return err;  }  static struct rb_node * @@ -979,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma)  	struct uprobe *uprobe, *u;  	struct inode *inode; -	if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) +	if (no_uprobe_events() || !valid_vma(vma, true))  		return 0;  	inode = vma->vm_file->f_mapping->host; @@ -988,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma)  	mutex_lock(uprobes_mmap_hash(inode));  	build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); - +	/* +	 * We can race with uprobe_unregister(), this uprobe can be already +	 * removed. But in this case filter_chain() must return false, all +	 * consumers have gone away. 
+	 */  	list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { -		if (!fatal_signal_pending(current)) { +		if (!fatal_signal_pending(current) && +		    filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {  			unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);  			install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);  		} @@ -1025,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e   */  void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)  { -	if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) +	if (no_uprobe_events() || !valid_vma(vma, false))  		return;  	if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ @@ -1042,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon  /* Slot allocation for XOL */  static int xol_add_vma(struct xol_area *area)  { -	struct mm_struct *mm; -	int ret; - -	area->page = alloc_page(GFP_HIGHUSER); -	if (!area->page) -		return -ENOMEM; - -	ret = -EALREADY; -	mm = current->mm; +	struct mm_struct *mm = current->mm; +	int ret = -EALREADY;  	down_write(&mm->mmap_sem);  	if (mm->uprobes_state.xol_area)  		goto fail;  	ret = -ENOMEM; -  	/* Try to map as high as possible, this is only a hint. */  	area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);  	if (area->vaddr & ~PAGE_MASK) { @@ -1073,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area)  	smp_wmb();	/* pairs with get_xol_area() */  	mm->uprobes_state.xol_area = area;  	ret = 0; - -fail: + fail:  	up_write(&mm->mmap_sem); -	if (ret) -		__free_page(area->page);  	return ret;  } -static struct xol_area *get_xol_area(struct mm_struct *mm) -{ -	struct xol_area *area; - -	area = mm->uprobes_state.xol_area; -	smp_read_barrier_depends();	/* pairs with wmb in xol_add_vma() */ - -	return area; -} -  /* - * xol_alloc_area - Allocate process's xol_area. - * This area will be used for storing instructions for execution out of - * line. + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line.   *   * Returns the allocated area or NULL.   */ -static struct xol_area *xol_alloc_area(void) +static struct xol_area *get_xol_area(void)  { +	struct mm_struct *mm = current->mm;  	struct xol_area *area; +	area = mm->uprobes_state.xol_area; +	if (area) +		goto ret; +  	area = kzalloc(sizeof(*area), GFP_KERNEL);  	if (unlikely(!area)) -		return NULL; +		goto out;  	area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); -  	if (!area->bitmap) -		goto fail; +		goto free_area; + +	area->page = alloc_page(GFP_HIGHUSER); +	if (!area->page) +		goto free_bitmap;  	init_waitqueue_head(&area->wq);  	if (!xol_add_vma(area))  		return area; -fail: +	__free_page(area->page); + free_bitmap:  	kfree(area->bitmap); + free_area:  	kfree(area); - -	return get_xol_area(current->mm); + out: +	area = mm->uprobes_state.xol_area; + ret: +	smp_read_barrier_depends();     /* pairs with wmb in xol_add_vma() */ +	return area;  }  /* @@ -1186,33 +1210,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)  }  /* - * xol_get_insn_slot - If was not allocated a slot, then - * allocate a slot. + * xol_get_insn_slot - allocate a slot for xol.   * Returns the allocated slot address or 0.   
*/ -static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) +static unsigned long xol_get_insn_slot(struct uprobe *uprobe)  {  	struct xol_area *area;  	unsigned long offset; +	unsigned long xol_vaddr;  	void *vaddr; -	area = get_xol_area(current->mm); -	if (!area) { -		area = xol_alloc_area(); -		if (!area) -			return 0; -	} -	current->utask->xol_vaddr = xol_take_insn_slot(area); +	area = get_xol_area(); +	if (!area) +		return 0; -	/* -	 * Initialize the slot if xol_vaddr points to valid -	 * instruction slot. -	 */ -	if (unlikely(!current->utask->xol_vaddr)) +	xol_vaddr = xol_take_insn_slot(area); +	if (unlikely(!xol_vaddr))  		return 0; -	current->utask->vaddr = slot_addr; -	offset = current->utask->xol_vaddr & ~PAGE_MASK; +	/* Initialize the slot */ +	offset = xol_vaddr & ~PAGE_MASK;  	vaddr = kmap_atomic(area->page);  	memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);  	kunmap_atomic(vaddr); @@ -1222,7 +1239,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot  	 */  	flush_dcache_page(area->page); -	return current->utask->xol_vaddr; +	return xol_vaddr;  }  /* @@ -1240,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)  		return;  	slot_addr = tsk->utask->xol_vaddr; - -	if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) +	if (unlikely(!slot_addr))  		return;  	area = tsk->mm->uprobes_state.xol_area; @@ -1303,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t)  }  /* - * Allocate a uprobe_task object for the task. - * Called when the thread hits a breakpoint for the first time. + * Allocate a uprobe_task object for the task if if necessary. + * Called when the thread hits a breakpoint.   *   * Returns:   * - pointer to new uprobe_task on success   * - NULL otherwise   */ -static struct uprobe_task *add_utask(void) +static struct uprobe_task *get_utask(void)  { -	struct uprobe_task *utask; - -	utask = kzalloc(sizeof *utask, GFP_KERNEL); -	if (unlikely(!utask)) -		return NULL; - -	current->utask = utask; -	return utask; +	if (!current->utask) +		current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); +	return current->utask;  }  /* Prepare to single-step probed instruction out of line. */  static int -pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) +pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)  { -	if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) -		return 0; +	struct uprobe_task *utask; +	unsigned long xol_vaddr; +	int err; + +	utask = get_utask(); +	if (!utask) +		return -ENOMEM; + +	xol_vaddr = xol_get_insn_slot(uprobe); +	if (!xol_vaddr) +		return -ENOMEM; + +	utask->xol_vaddr = xol_vaddr; +	utask->vaddr = bp_vaddr; + +	err = arch_uprobe_pre_xol(&uprobe->arch, regs); +	if (unlikely(err)) { +		xol_free_insn_slot(current); +		return err; +	} -	return -EFAULT; +	utask->active_uprobe = uprobe; +	utask->state = UTASK_SSTEP; +	return 0;  }  /* @@ -1391,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)  		 * This is not strictly accurate, we can race with  		 * uprobe_unregister() and see the already removed  		 * uprobe if delete_uprobe() was not yet called. +		 * Or this uprobe can be filtered out.  		 
*/  		if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))  			return; @@ -1452,13 +1484,33 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)  	return uprobe;  } +static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) +{ +	struct uprobe_consumer *uc; +	int remove = UPROBE_HANDLER_REMOVE; + +	down_read(&uprobe->register_rwsem); +	for (uc = uprobe->consumers; uc; uc = uc->next) { +		int rc = uc->handler(uc, regs); + +		WARN(rc & ~UPROBE_HANDLER_MASK, +			"bad rc=0x%x from %pf()\n", rc, uc->handler); +		remove &= rc; +	} + +	if (remove && uprobe->consumers) { +		WARN_ON(!uprobe_is_active(uprobe)); +		unapply_uprobe(uprobe, current->mm); +	} +	up_read(&uprobe->register_rwsem); +} +  /*   * Run handler and ask thread to singlestep.   * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.   */  static void handle_swbp(struct pt_regs *regs)  { -	struct uprobe_task *utask;  	struct uprobe *uprobe;  	unsigned long bp_vaddr;  	int uninitialized_var(is_swbp); @@ -1483,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs)  		}  		return;  	} + +	/* change it in advance for ->handler() and restart */ +	instruction_pointer_set(regs, bp_vaddr); +  	/*  	 * TODO: move copy_insn/etc into _register and remove this hack.  	 * After we hit the bp, _unregister + _register can install the @@ -1490,32 +1546,16 @@ static void handle_swbp(struct pt_regs *regs)  	 */  	smp_rmb(); /* pairs with wmb() in install_breakpoint() */  	if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) -		goto restart; - -	utask = current->utask; -	if (!utask) { -		utask = add_utask(); -		/* Cannot allocate; re-execute the instruction. */ -		if (!utask) -			goto restart; -	} +		goto out;  	handler_chain(uprobe, regs);  	if (can_skip_sstep(uprobe, regs))  		goto out; -	if (!pre_ssout(uprobe, regs, bp_vaddr)) { -		utask->active_uprobe = uprobe; -		utask->state = UTASK_SSTEP; +	if (!pre_ssout(uprobe, regs, bp_vaddr))  		return; -	} -restart: -	/* -	 * cannot singlestep; cannot skip instruction; -	 * re-execute the instruction. -	 */ -	instruction_pointer_set(regs, bp_vaddr); +	/* can_skip_sstep() succeeded, or restart if can't singlestep */  out:  	put_uprobe(uprobe);  } @@ -1609,10 +1649,8 @@ static int __init init_uprobes(void)  {  	int i; -	for (i = 0; i < UPROBES_HASH_SZ; i++) { -		mutex_init(&uprobes_mutex[i]); +	for (i = 0; i < UPROBES_HASH_SZ; i++)  		mutex_init(&uprobes_mmap_mutex[i]); -	}  	if (percpu_init_rwsem(&dup_mmap_sem))  		return -ENOMEM; diff --git a/kernel/exit.c b/kernel/exit.c index b4df2193721..7dd20408707 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk)  	bool group_dead = thread_group_leader(tsk);  	struct sighand_struct *sighand;  	struct tty_struct *uninitialized_var(tty); +	cputime_t utime, stime;  	sighand = rcu_dereference_check(tsk->sighand,  					lockdep_tasklist_lock_is_held()); @@ -123,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk)  		 * We won't ever get here for the group leader, since it  		 * will have been the last reference on the signal_struct.  		 
*/ -		sig->utime += tsk->utime; -		sig->stime += tsk->stime; -		sig->gtime += tsk->gtime; +		task_cputime(tsk, &utime, &stime); +		sig->utime += utime; +		sig->stime += stime; +		sig->gtime += task_gtime(tsk);  		sig->min_flt += tsk->min_flt;  		sig->maj_flt += tsk->maj_flt;  		sig->nvcsw += tsk->nvcsw; @@ -1092,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)  		sig = p->signal;  		psig->cutime += tgutime + sig->cutime;  		psig->cstime += tgstime + sig->cstime; -		psig->cgtime += p->gtime + sig->gtime + sig->cgtime; +		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;  		psig->cmin_flt +=  			p->min_flt + sig->min_flt + sig->cmin_flt;  		psig->cmaj_flt += diff --git a/kernel/fork.c b/kernel/fork.c index 4ff724f81f2..8f62b2a0f12 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1233,6 +1233,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,  #ifndef CONFIG_VIRT_CPU_ACCOUNTING  	p->prev_cputime.utime = p->prev_cputime.stime = 0;  #endif +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +	seqlock_init(&p->vtime_seqlock); +	p->vtime_snap = 0; +	p->vtime_snap_whence = VTIME_SLEEPING; +#endif +  #if defined(SPLIT_RSS_COUNTING)  	memset(&p->rss_stat, 0, sizeof(p->rss_stat));  #endif diff --git a/kernel/futex.c b/kernel/futex.c index 19eb089ca00..fbc07a29ec5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -60,6 +60,7 @@  #include <linux/pid.h>  #include <linux/nsproxy.h>  #include <linux/ptrace.h> +#include <linux/sched/rt.h>  #include <asm/futex.h> @@ -2471,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,  	if (!futex_cmpxchg_enabled)  		return -ENOSYS; -	WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); -  	rcu_read_lock();  	ret = -ESRCH; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83e368b005f..f9f44fd4d34 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -11,6 +11,7 @@  #include <linux/nsproxy.h>  #include <linux/futex.h>  #include <linux/ptrace.h> +#include <linux/syscalls.h>  #include <asm/uaccess.h> @@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr)  	}  } -asmlinkage long -compat_sys_set_robust_list(struct compat_robust_list_head __user *head, -			   compat_size_t len) +COMPAT_SYSCALL_DEFINE2(set_robust_list, +		struct compat_robust_list_head __user *, head, +		compat_size_t, len)  {  	if (!futex_cmpxchg_enabled)  		return -ENOSYS; @@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,  	return 0;  } -asmlinkage long -compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, -			   compat_size_t __user *len_ptr) +COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, +			compat_uptr_t __user *, head_ptr, +			compat_size_t __user *, len_ptr)  {  	struct compat_robust_list_head __user *head;  	unsigned long ret; @@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,  	if (!futex_cmpxchg_enabled)  		return -ENOSYS; -	WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); -  	rcu_read_lock();  	ret = -ESRCH; @@ -172,9 +171,9 @@ err_unlock:  	return ret;  } -asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, -		struct compat_timespec __user *utime, u32 __user *uaddr2, -		u32 val3) +COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, +		struct compat_timespec __user *, utime, u32 __user *, uaddr2, +		u32, val3)  {  	struct timespec ts;  	ktime_t t, *tp = NULL; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 
a92028196cc..d4da55d1fb6 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL  config GCOV_PROFILE_ALL  	bool "Profile entire Kernel"  	depends on GCOV_KERNEL -	depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE +	depends on SUPERH || S390 || X86 || PPC || MICROBLAZE  	default n  	---help---  	This options activates profiling for the entire kernel. diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6db7a5ed52b..cc47812d3fe 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -44,6 +44,8 @@  #include <linux/err.h>  #include <linux/debugobjects.h>  #include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h>  #include <linux/timer.h>  #include <asm/uaccess.h> @@ -640,21 +642,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)   * and expiry check is done in the hrtimer_interrupt or in the softirq.   */  static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, -					    struct hrtimer_clock_base *base, -					    int wakeup) +					    struct hrtimer_clock_base *base)  { -	if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { -		if (wakeup) { -			raw_spin_unlock(&base->cpu_base->lock); -			raise_softirq_irqoff(HRTIMER_SOFTIRQ); -			raw_spin_lock(&base->cpu_base->lock); -		} else -			__raise_softirq_irqoff(HRTIMER_SOFTIRQ); - -		return 1; -	} - -	return 0; +	return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);  }  static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) @@ -735,8 +725,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }  static inline void  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }  static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, -					    struct hrtimer_clock_base *base, -					    int wakeup) +					    struct hrtimer_clock_base *base)  {  	return 0;  } @@ -995,8 +984,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,  	 *  	 * XXX send_remote_softirq() ?  	 */ -	if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) -		hrtimer_enqueue_reprogram(timer, new_base, wakeup); +	if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) +		&& hrtimer_enqueue_reprogram(timer, new_base)) { +		if (wakeup) { +			/* +			 * We need to drop cpu_base->lock to avoid a +			 * lock ordering issue vs. rq->lock. 
+			 */ +			raw_spin_unlock(&new_base->cpu_base->lock); +			raise_softirq_irqoff(HRTIMER_SOFTIRQ); +			local_irq_restore(flags); +			return ret; +		} else { +			__raise_softirq_irqoff(HRTIMER_SOFTIRQ); +		} +	}  	unlock_hrtimer_base(timer, &flags); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3aca9f29d30..cbd97ce0b00 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)  EXPORT_SYMBOL(irq_set_handler_data);  /** - *	irq_set_msi_desc - set MSI descriptor data for an irq - *	@irq:	Interrupt number - *	@entry:	Pointer to MSI descriptor data + *	irq_set_msi_desc_off - set MSI descriptor data for an irq at offset + *	@irq_base:	Interrupt number base + *	@irq_offset:	Interrupt number offset + *	@entry:		Pointer to MSI descriptor data   * - *	Set the MSI descriptor entry for an irq + *	Set the MSI descriptor entry for an irq at offset   */ -int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) +int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, +			 struct msi_desc *entry)  {  	unsigned long flags; -	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); +	struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);  	if (!desc)  		return -EINVAL;  	desc->irq_data.msi_desc = entry; -	if (entry) -		entry->irq = irq; +	if (entry && !irq_offset) +		entry->irq = irq_base;  	irq_put_desc_unlock(desc, flags);  	return 0;  }  /** + *	irq_set_msi_desc - set MSI descriptor data for an irq + *	@irq:	Interrupt number + *	@entry:	Pointer to MSI descriptor data + * + *	Set the MSI descriptor entry for an irq + */ +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) +{ +	return irq_set_msi_desc_off(irq, 0, entry); +} + +/**   *	irq_set_chip_data - set irq chip data for an irq   *	@irq:	Interrupt number   *	@data:	Pointer to chip specific data diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e49a288fa47..fa17855ca65 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -16,6 +16,7 @@  #include <linux/interrupt.h>  #include <linux/slab.h>  #include <linux/sched.h> +#include <linux/sched/rt.h>  #include <linux/task_work.h>  #include "internals.h" @@ -1524,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)  out:  	irq_put_desc_unlock(desc, flags);  } +EXPORT_SYMBOL_GPL(enable_percpu_irq);  void disable_percpu_irq(unsigned int irq)  { @@ -1537,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq)  	irq_percpu_disable(desc, cpu);  	irq_put_desc_unlock(desc, flags);  } +EXPORT_SYMBOL_GPL(disable_percpu_irq);  /*   * Internal function to unregister a percpu irqaction. diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 611cd6003c4..7b5f012bde9 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)  	/*  	 * All handlers must agree on IRQF_SHARED, so we test just the -	 * first. Check for action->next as well. +	 * first.  	 
*/  	action = desc->action;  	if (!action || !(action->flags & IRQF_SHARED) || -	    (action->flags & __IRQF_TIMER) || -	    (action->handler(irq, action->dev_id) == IRQ_HANDLED) || -	    !action->next) +	    (action->flags & __IRQF_TIMER))  		goto out;  	/* Already running on another processor */ @@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)  	do {  		if (handle_irq_event(desc) == IRQ_HANDLED)  			ret = IRQ_HANDLED; +		/* Make sure that there is still a valid action */  		action = desc->action;  	} while ((desc->istate & IRQS_PENDING) && action);  	desc->istate &= ~IRQS_POLL_INPROGRESS; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 1588e3b2871..55fcce6065c 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -12,37 +12,36 @@  #include <linux/percpu.h>  #include <linux/hardirq.h>  #include <linux/irqflags.h> +#include <linux/sched.h> +#include <linux/tick.h> +#include <linux/cpu.h> +#include <linux/notifier.h>  #include <asm/processor.h> -/* - * An entry can be in one of four states: - * - * free	     NULL, 0 -> {claimed}       : free to be used - * claimed   NULL, 3 -> {pending}       : claimed to be enqueued - * pending   next, 3 -> {busy}          : queued, pending callback - * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed - */ - -#define IRQ_WORK_PENDING	1UL -#define IRQ_WORK_BUSY		2UL -#define IRQ_WORK_FLAGS		3UL  static DEFINE_PER_CPU(struct llist_head, irq_work_list); +static DEFINE_PER_CPU(int, irq_work_raised);  /*   * Claim the entry so that no one else will poke at it.   */  static bool irq_work_claim(struct irq_work *work)  { -	unsigned long flags, nflags; +	unsigned long flags, oflags, nflags; +	/* +	 * Start with our best wish as a premise but only trust any +	 * flag value after cmpxchg() result. +	 */ +	flags = work->flags & ~IRQ_WORK_PENDING;  	for (;;) { -		flags = work->flags; -		if (flags & IRQ_WORK_PENDING) -			return false;  		nflags = flags | IRQ_WORK_FLAGS; -		if (cmpxchg(&work->flags, flags, nflags) == flags) +		oflags = cmpxchg(&work->flags, flags, nflags); +		if (oflags == flags)  			break; +		if (oflags & IRQ_WORK_PENDING) +			return false; +		flags = oflags;  		cpu_relax();  	} @@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void)  }  /* - * Queue the entry and raise the IPI if needed. + * Enqueue the irq_work @entry unless it's already pending + * somewhere. + * + * Can be re-enqueued while the callback is still in progress.   */ -static void __irq_work_queue(struct irq_work *work) +void irq_work_queue(struct irq_work *work)  { -	bool empty; +	/* Only queue if not already pending */ +	if (!irq_work_claim(work)) +		return; +	/* Queue the entry and raise the IPI if needed. */  	preempt_disable(); -	empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); -	/* The list was empty, raise self-interrupt to start processing. */ -	if (empty) -		arch_irq_work_raise(); +	llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); + +	/* +	 * If the work is not "lazy" or the tick is stopped, raise the irq +	 * work interrupt (if supported by the arch), otherwise, just wait +	 * for the next tick. +	 */ +	if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) { +		if (!this_cpu_cmpxchg(irq_work_raised, 0, 1)) +			arch_irq_work_raise(); +	}  	preempt_enable();  } +EXPORT_SYMBOL_GPL(irq_work_queue); -/* - * Enqueue the irq_work @entry, returns true on success, failure when the - * @entry was already enqueued by someone else. 
- * - * Can be re-enqueued while the callback is still in progress. - */ -bool irq_work_queue(struct irq_work *work) +bool irq_work_needs_cpu(void)  { -	if (!irq_work_claim(work)) { -		/* -		 * Already enqueued, can't do! -		 */ +	struct llist_head *this_list; + +	this_list = &__get_cpu_var(irq_work_list); +	if (llist_empty(this_list))  		return false; -	} -	__irq_work_queue(work); +	/* All work should have been flushed before going offline */ +	WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); +  	return true;  } -EXPORT_SYMBOL_GPL(irq_work_queue); -/* - * Run the irq_work entries on this cpu. Requires to be ran from hardirq - * context with local IRQs disabled. - */ -void irq_work_run(void) +static void __irq_work_run(void)  { +	unsigned long flags;  	struct irq_work *work;  	struct llist_head *this_list;  	struct llist_node *llnode; + +	/* +	 * Reset the "raised" state right before we check the list because +	 * an NMI may enqueue after we find the list empty from the runner. +	 */ +	__this_cpu_write(irq_work_raised, 0); +	barrier(); +  	this_list = &__get_cpu_var(irq_work_list);  	if (llist_empty(this_list))  		return; -	BUG_ON(!in_irq());  	BUG_ON(!irqs_disabled());  	llnode = llist_del_all(this_list); @@ -119,16 +130,31 @@ void irq_work_run(void)  		/*  		 * Clear the PENDING bit, after this point the @work  		 * can be re-used. +		 * Make it immediately visible so that other CPUs trying +		 * to claim that work don't rely on us to handle their data +		 * while we are in the middle of the func.  		 */ -		work->flags = IRQ_WORK_BUSY; +		flags = work->flags & ~IRQ_WORK_PENDING; +		xchg(&work->flags, flags); +  		work->func(work);  		/*  		 * Clear the BUSY bit and return to the free state if  		 * no-one else claimed it meanwhile.  		 */ -		(void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); +		(void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);  	}  } + +/* + * Run the irq_work entries on this cpu. Requires to be ran from hardirq + * context with local IRQs disabled. 
+ */ +void irq_work_run(void) +{ +	BUG_ON(!in_irq()); +	__irq_work_run(); +}  EXPORT_SYMBOL_GPL(irq_work_run);  /* @@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work)  		cpu_relax();  }  EXPORT_SYMBOL_GPL(irq_work_sync); + +#ifdef CONFIG_HOTPLUG_CPU +static int irq_work_cpu_notify(struct notifier_block *self, +			       unsigned long action, void *hcpu) +{ +	long cpu = (long)hcpu; + +	switch (action) { +	case CPU_DYING: +		/* Called from stop_machine */ +		if (WARN_ON_ONCE(cpu != smp_processor_id())) +			break; +		__irq_work_run(); +		break; +	default: +		break; +	} +	return NOTIFY_OK; +} + +static struct notifier_block cpu_notify; + +static __init int irq_work_init_cpu_notifier(void) +{ +	cpu_notify.notifier_call = irq_work_cpu_notify; +	cpu_notify.priority = 0; +	register_cpu_notifier(&cpu_notify); +	return 0; +} +device_initcall(irq_work_init_cpu_notifier); + +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/kexec.c b/kernel/kexec.c index 5e4bd7864c5..2436ffcec91 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -54,6 +54,12 @@ struct resource crashk_res = {  	.end   = 0,  	.flags = IORESOURCE_BUSY | IORESOURCE_MEM  }; +struct resource crashk_low_res = { +	.name  = "Crash kernel low", +	.start = 0, +	.end   = 0, +	.flags = IORESOURCE_BUSY | IORESOURCE_MEM +};  int kexec_should_crash(struct task_struct *p)  { @@ -1369,10 +1375,11 @@ static int __init parse_crashkernel_simple(char 		*cmdline,   * That function is the entry point for command line parsing and should be   * called from the arch-specific code.   */ -int __init parse_crashkernel(char 		 *cmdline, +static int __init __parse_crashkernel(char *cmdline,  			     unsigned long long system_ram,  			     unsigned long long *crash_size, -			     unsigned long long *crash_base) +			     unsigned long long *crash_base, +				const char *name)  {  	char 	*p = cmdline, *ck_cmdline = NULL;  	char	*first_colon, *first_space; @@ -1382,16 +1389,16 @@ int __init parse_crashkernel(char 		 *cmdline,  	*crash_base = 0;  	/* find crashkernel and use the last one if there are more */ -	p = strstr(p, "crashkernel="); +	p = strstr(p, name);  	while (p) {  		ck_cmdline = p; -		p = strstr(p+1, "crashkernel="); +		p = strstr(p+1, name);  	}  	if (!ck_cmdline)  		return -EINVAL; -	ck_cmdline += 12; /* strlen("crashkernel=") */ +	ck_cmdline += strlen(name);  	/*  	 * if the commandline contains a ':', then that's the extended @@ -1409,6 +1416,23 @@ int __init parse_crashkernel(char 		 *cmdline,  	return 0;  } +int __init parse_crashkernel(char *cmdline, +			     unsigned long long system_ram, +			     unsigned long long *crash_size, +			     unsigned long long *crash_base) +{ +	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, +					"crashkernel="); +} + +int __init parse_crashkernel_low(char *cmdline, +			     unsigned long long system_ram, +			     unsigned long long *crash_size, +			     unsigned long long *crash_base) +{ +	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, +					"crashkernel_low="); +}  static void update_vmcoreinfo_note(void)  { diff --git a/kernel/kmod.c b/kernel/kmod.c index 0023a87e8de..56dd34976d7 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -38,6 +38,7 @@  #include <linux/suspend.h>  #include <linux/rwsem.h>  #include <linux/ptrace.h> +#include <linux/async.h>  #include <asm/uaccess.h>  #include <trace/events/module.h> @@ -130,6 +131,14 @@ int __request_module(bool wait, const char *fmt, ...)  
#define MAX_KMOD_CONCURRENT 50	/* Completely arbitrary value - KAO */  	static int kmod_loop_msg; +	/* +	 * We don't allow synchronous module loading from async.  Module +	 * init may invoke async_synchronize_full() which will end up +	 * waiting for this task which already is waiting for the module +	 * loading to complete, leading to a deadlock. +	 */ +	WARN_ON_ONCE(wait && current_is_async()); +  	va_start(args, fmt);  	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);  	va_end(args); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 098f396aa40..550294d58a0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -471,7 +471,6 @@ static LIST_HEAD(unoptimizing_list);  static void kprobe_optimizer(struct work_struct *work);  static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); -static DECLARE_COMPLETION(optimizer_comp);  #define OPTIMIZE_DELAY 5  /* @@ -552,8 +551,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)  /* Start optimizer after OPTIMIZE_DELAY passed */  static __kprobes void kick_kprobe_optimizer(void)  { -	if (!delayed_work_pending(&optimizing_work)) -		schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); +	schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);  }  /* Kprobe jump optimizer */ @@ -592,16 +590,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)  	/* Step 5: Kick optimizer again if needed */  	if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))  		kick_kprobe_optimizer(); -	else -		/* Wake up all waiters */ -		complete_all(&optimizer_comp);  }  /* Wait for completing optimization and unoptimization */  static __kprobes void wait_for_kprobe_optimizer(void)  { -	if (delayed_work_pending(&optimizing_work)) -		wait_for_completion(&optimizer_comp); +	mutex_lock(&kprobe_mutex); + +	while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { +		mutex_unlock(&kprobe_mutex); + +		/* this will also make optimizing_work execute immmediately */ +		flush_delayed_work(&optimizing_work); +		/* @optimizing_work might not have been queued yet, relax */ +		cpu_relax(); + +		mutex_lock(&kprobe_mutex); +	} + +	mutex_unlock(&kprobe_mutex);  }  /* Optimize kprobe if p is ready to be optimized */ @@ -919,7 +926,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)  }  #endif /* CONFIG_OPTPROBES */ -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE  static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {  	.func = kprobe_ftrace_handler,  	.flags = FTRACE_OPS_FL_SAVE_REGS, @@ -964,7 +971,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)  			   (unsigned long)p->addr, 1, 0);  	WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);  } -#else	/* !KPROBES_CAN_USE_FTRACE */ +#else	/* !CONFIG_KPROBES_ON_FTRACE */  #define prepare_kprobe(p)	arch_prepare_kprobe(p)  #define arm_kprobe_ftrace(p)	do {} while (0)  #define disarm_kprobe_ftrace(p)	do {} while (0) @@ -1414,12 +1421,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,  	 */  	ftrace_addr = ftrace_location((unsigned long)p->addr);  	if (ftrace_addr) { -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE  		/* Given address is not on the instruction boundary */  		if ((unsigned long)p->addr != ftrace_addr)  			return -EILSEQ;  		p->flags |= KPROBE_FLAG_FTRACE; -#else	/* !KPROBES_CAN_USE_FTRACE */ +#else	/* !CONFIG_KPROBES_ON_FTRACE */  		return -EINVAL;  #endif  	} diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 
7981e5b2350..8a0efac4f99 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  #endif  	if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {  		debug_locks_off(); -		printk("BUG: MAX_LOCK_DEPTH too low!\n"); +		printk("BUG: MAX_LOCK_DEPTH too low, depth: %i  max: %lu!\n", +		       curr->lockdep_depth, MAX_LOCK_DEPTH);  		printk("turning off the locking correctness validator.\n"); + +		lockdep_print_held_locks(current); +		debug_show_all_locks();  		dump_stack(); +  		return 0;  	} @@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  }  static int -print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, +print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,  			   unsigned long ip)  {  	if (!debug_locks_off()) @@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,  		return 0;  	if (curr->lockdep_depth <= 0) -		return print_unlock_inbalance_bug(curr, lock, ip); +		return print_unlock_imbalance_bug(curr, lock, ip);  	return 1;  } @@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,  			goto found_it;  		prev_hlock = hlock;  	} -	return print_unlock_inbalance_bug(curr, lock, ip); +	return print_unlock_imbalance_bug(curr, lock, ip);  found_it:  	lockdep_init_map(lock, name, key, 0); @@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr,  			goto found_it;  		prev_hlock = hlock;  	} -	return print_unlock_inbalance_bug(curr, lock, ip); +	return print_unlock_imbalance_bug(curr, lock, ip);  found_it:  	if (hlock->instance == lock) diff --git a/kernel/module.c b/kernel/module.c index 950076eb327..0925c9a7197 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -188,6 +188,7 @@ struct load_info {     ongoing or failed initialization etc. */  static inline int strong_try_module_get(struct module *mod)  { +	BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);  	if (mod && mod->state == MODULE_STATE_COMING)  		return -EBUSY;  	if (try_module_get(mod)) @@ -196,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod)  		return -ENOENT;  } -static inline void add_taint_module(struct module *mod, unsigned flag) +static inline void add_taint_module(struct module *mod, unsigned flag, +				    enum lockdep_ok lockdep_ok)  { -	add_taint(flag); +	add_taint(flag, lockdep_ok);  	mod->taints |= (1U << flag);  } @@ -343,6 +345,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,  #endif  		}; +		if (mod->state == MODULE_STATE_UNFORMED) +			continue; +  		if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))  			return true;  	} @@ -450,16 +455,24 @@ const struct kernel_symbol *find_symbol(const char *name,  EXPORT_SYMBOL_GPL(find_symbol);  /* Search for module by name: must hold module_mutex. 
*/ -struct module *find_module(const char *name) +static struct module *find_module_all(const char *name, +				      bool even_unformed)  {  	struct module *mod;  	list_for_each_entry(mod, &modules, list) { +		if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) +			continue;  		if (strcmp(mod->name, name) == 0)  			return mod;  	}  	return NULL;  } + +struct module *find_module(const char *name) +{ +	return find_module_all(name, false); +}  EXPORT_SYMBOL_GPL(find_module);  #ifdef CONFIG_SMP @@ -525,6 +538,8 @@ bool is_module_percpu_address(unsigned long addr)  	preempt_disable();  	list_for_each_entry_rcu(mod, &modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		if (!mod->percpu_size)  			continue;  		for_each_possible_cpu(cpu) { @@ -713,7 +728,7 @@ static inline int try_force_unload(unsigned int flags)  {  	int ret = (flags & O_TRUNC);  	if (ret) -		add_taint(TAINT_FORCED_RMMOD); +		add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);  	return ret;  }  #else @@ -1048,6 +1063,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,  	case MODULE_STATE_GOING:  		state = "going";  		break; +	default: +		BUG();  	}  	return sprintf(buffer, "%s\n", state);  } @@ -1122,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason)  	if (!test_taint(TAINT_FORCED_MODULE))  		printk(KERN_WARNING "%s: %s: kernel tainted.\n",  		       mod->name, reason); -	add_taint_module(mod, TAINT_FORCED_MODULE); +	add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);  	return 0;  #else  	return -ENOEXEC; @@ -1786,6 +1803,8 @@ void set_all_modules_text_rw(void)  	mutex_lock(&module_mutex);  	list_for_each_entry_rcu(mod, &modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		if ((mod->module_core) && (mod->core_text_size)) {  			set_page_attributes(mod->module_core,  						mod->module_core + mod->core_text_size, @@ -1807,6 +1826,8 @@ void set_all_modules_text_ro(void)  	mutex_lock(&module_mutex);  	list_for_each_entry_rcu(mod, &modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		if ((mod->module_core) && (mod->core_text_size)) {  			set_page_attributes(mod->module_core,  						mod->module_core + mod->core_text_size, @@ -2127,7 +2148,8 @@ static void set_license(struct module *mod, const char *license)  		if (!test_taint(TAINT_PROPRIETARY_MODULE))  			printk(KERN_WARNING "%s: module license '%s' taints "  				"kernel.\n", mod->name, license); -		add_taint_module(mod, TAINT_PROPRIETARY_MODULE); +		add_taint_module(mod, TAINT_PROPRIETARY_MODULE, +				 LOCKDEP_NOW_UNRELIABLE);  	}  } @@ -2527,6 +2549,13 @@ static int copy_module_from_fd(int fd, struct load_info *info)  		err = -EFBIG;  		goto out;  	} + +	/* Don't hand 0 to vmalloc, it whines. 
*/ +	if (stat.size == 0) { +		err = -EINVAL; +		goto out; +	} +  	info->hdr = vmalloc(stat.size);  	if (!info->hdr) {  		err = -ENOMEM; @@ -2673,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)  	}  	if (!get_modinfo(info, "intree")) -		add_taint_module(mod, TAINT_OOT_MODULE); +		add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);  	if (get_modinfo(info, "staging")) { -		add_taint_module(mod, TAINT_CRAP); +		add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);  		printk(KERN_WARNING "%s: module is from the staging directory,"  		       " the quality is unknown, you have been warned.\n",  		       mod->name); @@ -2842,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod)  	 * using GPL-only symbols it needs.  	 */  	if (strcmp(mod->name, "ndiswrapper") == 0) -		add_taint(TAINT_PROPRIETARY_MODULE); +		add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);  	/* driverloader was caught wrongly pretending to be under GPL */  	if (strcmp(mod->name, "driverloader") == 0) -		add_taint_module(mod, TAINT_PROPRIETARY_MODULE); +		add_taint_module(mod, TAINT_PROPRIETARY_MODULE, +				 LOCKDEP_NOW_UNRELIABLE);  	/* lve claims to be GPL but upstream won't provide source */  	if (strcmp(mod->name, "lve") == 0) -		add_taint_module(mod, TAINT_PROPRIETARY_MODULE); +		add_taint_module(mod, TAINT_PROPRIETARY_MODULE, +				 LOCKDEP_NOW_UNRELIABLE);  #ifdef CONFIG_MODVERSIONS  	if ((mod->num_syms && !mod->crcs) @@ -2990,8 +3021,9 @@ static bool finished_loading(const char *name)  	bool ret;  	mutex_lock(&module_mutex); -	mod = find_module(name); -	ret = !mod || mod->state != MODULE_STATE_COMING; +	mod = find_module_all(name, true); +	ret = !mod || mod->state == MODULE_STATE_LIVE +		|| mod->state == MODULE_STATE_GOING;  	mutex_unlock(&module_mutex);  	return ret; @@ -3113,12 +3145,72 @@ static int may_init_module(void)  	return 0;  } +/* + * We try to place it in the list now to make sure it's unique before + * we dedicate too many resources.  In particular, temporary percpu + * memory exhaustion. + */ +static int add_unformed_module(struct module *mod) +{ +	int err; +	struct module *old; + +	mod->state = MODULE_STATE_UNFORMED; + +again: +	mutex_lock(&module_mutex); +	if ((old = find_module_all(mod->name, true)) != NULL) { +		if (old->state == MODULE_STATE_COMING +		    || old->state == MODULE_STATE_UNFORMED) { +			/* Wait in case it fails to load. */ +			mutex_unlock(&module_mutex); +			err = wait_event_interruptible(module_wq, +					       finished_loading(mod->name)); +			if (err) +				goto out_unlocked; +			goto again; +		} +		err = -EEXIST; +		goto out; +	} +	list_add_rcu(&mod->list, &modules); +	err = 0; + +out: +	mutex_unlock(&module_mutex); +out_unlocked: +	return err; +} + +static int complete_formation(struct module *mod, struct load_info *info) +{ +	int err; + +	mutex_lock(&module_mutex); + +	/* Find duplicate symbols (must be called under lock). */ +	err = verify_export_symbols(mod); +	if (err < 0) +		goto out; + +	/* This relies on module_mutex for list integrity. */ +	module_bug_finalize(info->hdr, info->sechdrs, mod); + +	/* Mark state as coming so strong_try_module_get() ignores us, +	 * but kallsyms etc. can see us. */ +	mod->state = MODULE_STATE_COMING; + +out: +	mutex_unlock(&module_mutex); +	return err; +} +  /* Allocate and load the module: note that size of section 0 is always     zero, and we rely on this for optional sections. 
*/  static int load_module(struct load_info *info, const char __user *uargs,  		       int flags)  { -	struct module *mod, *old; +	struct module *mod;  	long err;  	err = module_sig_check(info); @@ -3136,16 +3228,26 @@ static int load_module(struct load_info *info, const char __user *uargs,  		goto free_copy;  	} +	/* Reserve our place in the list. */ +	err = add_unformed_module(mod); +	if (err) +		goto free_module; +  #ifdef CONFIG_MODULE_SIG  	mod->sig_ok = info->sig_ok; -	if (!mod->sig_ok) -		add_taint_module(mod, TAINT_FORCED_MODULE); +	if (!mod->sig_ok) { +		printk_once(KERN_NOTICE +			    "%s: module verification failed: signature and/or" +			    " required key missing - tainting kernel\n", +			    mod->name); +		add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); +	}  #endif  	/* Now module is in final location, initialize linked lists, etc. */  	err = module_unload_init(mod);  	if (err) -		goto free_module; +		goto unlink_mod;  	/* Now we've got everything in the final locations, we can  	 * find optional sections. */ @@ -3180,54 +3282,23 @@ static int load_module(struct load_info *info, const char __user *uargs,  		goto free_arch_cleanup;  	} -	/* Mark state as coming so strong_try_module_get() ignores us. */ -	mod->state = MODULE_STATE_COMING; - -	/* Now sew it into the lists so we can get lockdep and oops -	 * info during argument parsing.  No one should access us, since -	 * strong_try_module_get() will fail. -	 * lockdep/oops can run asynchronous, so use the RCU list insertion -	 * function to insert in a way safe to concurrent readers. -	 * The mutex protects against concurrent writers. -	 */ -again: -	mutex_lock(&module_mutex); -	if ((old = find_module(mod->name)) != NULL) { -		if (old->state == MODULE_STATE_COMING) { -			/* Wait in case it fails to load. */ -			mutex_unlock(&module_mutex); -			err = wait_event_interruptible(module_wq, -					       finished_loading(mod->name)); -			if (err) -				goto free_arch_cleanup; -			goto again; -		} -		err = -EEXIST; -		goto unlock; -	} - -	/* This has to be done once we're sure module name is unique. */  	dynamic_debug_setup(info->debug, info->num_debug); -	/* Find duplicate symbols */ -	err = verify_export_symbols(mod); -	if (err < 0) -		goto ddebug; - -	module_bug_finalize(info->hdr, info->sechdrs, mod); -	list_add_rcu(&mod->list, &modules); -	mutex_unlock(&module_mutex); +	/* Finally it's fully formed, ready to start executing. */ +	err = complete_formation(mod, info); +	if (err) +		goto ddebug_cleanup;  	/* Module is ready to execute: parsing args may do that. */  	err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,  			 -32768, 32767, &ddebug_dyndbg_module_param_cb);  	if (err < 0) -		goto unlink; +		goto bug_cleanup;  	/* Link in to syfs. */  	err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);  	if (err < 0) -		goto unlink; +		goto bug_cleanup;  	/* Get rid of temporary copy. */  	free_copy(info); @@ -3237,16 +3308,13 @@ again:  	return do_init_module(mod); - unlink: + bug_cleanup: +	/* module_bug_cleanup needs module_mutex protection */  	mutex_lock(&module_mutex); -	/* Unlink carefully: kallsyms could be walking list. 
*/ -	list_del_rcu(&mod->list);  	module_bug_cleanup(mod); -	wake_up_all(&module_wq); - ddebug: -	dynamic_debug_remove(info->debug); - unlock:  	mutex_unlock(&module_mutex); + ddebug_cleanup: +	dynamic_debug_remove(info->debug);  	synchronize_sched();  	kfree(mod->args);   free_arch_cleanup: @@ -3255,6 +3323,12 @@ again:  	free_modinfo(mod);   free_unload:  	module_unload_free(mod); + unlink_mod: +	mutex_lock(&module_mutex); +	/* Unlink carefully: kallsyms could be walking list. */ +	list_del_rcu(&mod->list); +	wake_up_all(&module_wq); +	mutex_unlock(&module_mutex);   free_module:  	module_deallocate(mod, info);   free_copy: @@ -3377,6 +3451,8 @@ const char *module_address_lookup(unsigned long addr,  	preempt_disable();  	list_for_each_entry_rcu(mod, &modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		if (within_module_init(addr, mod) ||  		    within_module_core(addr, mod)) {  			if (modname) @@ -3400,6 +3476,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)  	preempt_disable();  	list_for_each_entry_rcu(mod, &modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		if (within_module_init(addr, mod) ||  		    within_module_core(addr, mod)) {  			const char *sym; @@ -3424,6 +3502,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,  	preempt_disable();  	list_for_each_entry_rcu(mod, &modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		if (within_module_init(addr, mod) ||  		    within_module_core(addr, mod)) {  			const char *sym; @@ -3451,6 +3531,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,  	preempt_disable();  	list_for_each_entry_rcu(mod, &modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		if (symnum < mod->num_symtab) {  			*value = mod->symtab[symnum].st_value;  			*type = mod->symtab[symnum].st_info; @@ -3493,9 +3575,12 @@ unsigned long module_kallsyms_lookup_name(const char *name)  			ret = mod_find_symname(mod, colon+1);  		*colon = ':';  	} else { -		list_for_each_entry_rcu(mod, &modules, list) +		list_for_each_entry_rcu(mod, &modules, list) { +			if (mod->state == MODULE_STATE_UNFORMED) +				continue;  			if ((ret = mod_find_symname(mod, name)) != 0)  				break; +		}  	}  	preempt_enable();  	return ret; @@ -3510,6 +3595,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,  	int ret;  	list_for_each_entry(mod, &modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		for (i = 0; i < mod->num_symtab; i++) {  			ret = fn(data, mod->strtab + mod->symtab[i].st_name,  				 mod, mod->symtab[i].st_value); @@ -3525,6 +3612,7 @@ static char *module_flags(struct module *mod, char *buf)  {  	int bx = 0; +	BUG_ON(mod->state == MODULE_STATE_UNFORMED);  	if (mod->taints ||  	    mod->state == MODULE_STATE_GOING ||  	    mod->state == MODULE_STATE_COMING) { @@ -3566,6 +3654,10 @@ static int m_show(struct seq_file *m, void *p)  	struct module *mod = list_entry(p, struct module, list);  	char buf[8]; +	/* We always ignore unformed modules. 
*/ +	if (mod->state == MODULE_STATE_UNFORMED) +		return 0; +  	seq_printf(m, "%s %u",  		   mod->name, mod->init_size + mod->core_size);  	print_unload_info(m, mod); @@ -3626,6 +3718,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)  	preempt_disable();  	list_for_each_entry_rcu(mod, &modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		if (mod->num_exentries == 0)  			continue; @@ -3674,10 +3768,13 @@ struct module *__module_address(unsigned long addr)  	if (addr < module_addr_min || addr > module_addr_max)  		return NULL; -	list_for_each_entry_rcu(mod, &modules, list) +	list_for_each_entry_rcu(mod, &modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		if (within_module_core(addr, mod)  		    || within_module_init(addr, mod))  			return mod; +	}  	return NULL;  }  EXPORT_SYMBOL_GPL(__module_address); @@ -3730,8 +3827,11 @@ void print_modules(void)  	printk(KERN_DEFAULT "Modules linked in:");  	/* Most callers should already have preempt disabled, but make sure */  	preempt_disable(); -	list_for_each_entry_rcu(mod, &modules, list) +	list_for_each_entry_rcu(mod, &modules, list) { +		if (mod->state == MODULE_STATE_UNFORMED) +			continue;  		printk(" %s%s", mod->name, module_flags(mod, buf)); +	}  	preempt_enable();  	if (last_unloaded_module[0])  		printk(" [last unloaded: %s]", last_unloaded_module); diff --git a/kernel/mutex.c b/kernel/mutex.c index a307cc9c952..52f23011b6e 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -19,6 +19,7 @@   */  #include <linux/mutex.h>  #include <linux/sched.h> +#include <linux/sched/rt.h>  #include <linux/export.h>  #include <linux/spinlock.h>  #include <linux/interrupt.h> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index c057104bf05..afc0456f227 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)  		goto out;  	} -	new_ns = create_new_namespaces(flags, tsk, -				       task_cred_xxx(tsk, user_ns), tsk->fs); +	new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);  	if (IS_ERR(new_ns)) {  		err = PTR_ERR(new_ns);  		goto out; diff --git a/kernel/panic.c b/kernel/panic.c index e1b2822fff9..7c57cc9eee2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -259,26 +259,19 @@ unsigned long get_taint(void)  	return tainted_mask;  } -void add_taint(unsigned flag) +/** + * add_taint: add a taint flag if not already set. + * @flag: one of the TAINT_* constants. + * @lockdep_ok: whether lock debugging is still OK. + * + * If something bad has gone wrong, you'll want @lockdebug_ok = false, but for + * some notewortht-but-not-corrupting cases, it can be set to true. + */ +void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)  { -	/* -	 * Can't trust the integrity of the kernel anymore. -	 * We don't call directly debug_locks_off() because the issue -	 * is not necessarily serious enough to set oops_in_progress to 1 -	 * Also we want to keep up lockdep for staging/out-of-tree -	 * development and post-warning case. 
-	 */ -	switch (flag) { -	case TAINT_CRAP: -	case TAINT_OOT_MODULE: -	case TAINT_WARN: -	case TAINT_FIRMWARE_WORKAROUND: -		break; - -	default: -		if (__debug_locks_off()) -			printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); -	} +	if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) +		printk(KERN_WARNING +		       "Disabling lock debugging due to kernel taint\n");  	set_bit(flag, &tainted_mask);  } @@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller,  	print_modules();  	dump_stack();  	print_oops_end_marker(); -	add_taint(taint); +	/* Just a warning, don't kill lockdep. */ +	add_taint(taint, LOCKDEP_STILL_OK);  }  void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) diff --git a/kernel/pid.c b/kernel/pid.c index de9af600006..f2c6a682509 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -331,7 +331,7 @@ out:  	return pid;  out_unlock: -	spin_unlock(&pidmap_lock); +	spin_unlock_irq(&pidmap_lock);  out_free:  	while (++i <= ns->level)  		free_pidmap(pid->numbers + i); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index a278cad1d5d..8fd709c9bb5 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -155,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer,  static inline cputime_t prof_ticks(struct task_struct *p)  { -	return p->utime + p->stime; +	cputime_t utime, stime; + +	task_cputime(p, &utime, &stime); + +	return utime + stime;  }  static inline cputime_t virt_ticks(struct task_struct *p)  { -	return p->utime; +	cputime_t utime; + +	task_cputime(p, &utime, NULL); + +	return utime;  }  static int @@ -471,18 +479,23 @@ static void cleanup_timers(struct list_head *head,   */  void posix_cpu_timers_exit(struct task_struct *tsk)  { +	cputime_t utime, stime; +  	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,  						sizeof(unsigned long long)); +	task_cputime(tsk, &utime, &stime);  	cleanup_timers(tsk->cpu_timers, -		       tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); +		       utime, stime, tsk->se.sum_exec_runtime);  }  void posix_cpu_timers_exit_group(struct task_struct *tsk)  {  	struct signal_struct *const sig = tsk->signal; +	cputime_t utime, stime; +	task_cputime(tsk, &utime, &stime);  	cleanup_timers(tsk->signal->cpu_timers, -		       tsk->utime + sig->utime, tsk->stime + sig->stime, +		       utime + sig->utime, stime + sig->stime,  		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);  } @@ -1226,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample,  static inline int fastpath_timer_check(struct task_struct *tsk)  {  	struct signal_struct *sig; +	cputime_t utime, stime; + +	task_cputime(tsk, &utime, &stime);  	if (!task_cputime_zero(&tsk->cputime_expires)) {  		struct task_cputime task_sample = { -			.utime = tsk->utime, -			.stime = tsk->stime, +			.utime = utime, +			.stime = stime,  			.sum_exec_runtime = tsk->se.sum_exec_runtime  		}; @@ -1401,8 +1417,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,  		while (!signal_pending(current)) {  			if (timer.it.cpu.expires.sched == 0) {  				/* -				 * Our timer fired and was reset. +				 * Our timer fired and was reset, below +				 * deletion can not fail.  				 */ +				posix_cpu_timer_del(&timer);  				spin_unlock_irq(&timer.it_lock);  				return 0;  			} @@ -1420,9 +1438,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,  		 * We were interrupted by a signal.  		 
*/  		sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); -		posix_cpu_timer_set(&timer, 0, &zero_it, it); +		error = posix_cpu_timer_set(&timer, 0, &zero_it, it); +		if (!error) { +			/* +			 * Timer is now unarmed, deletion can not fail. +			 */ +			posix_cpu_timer_del(&timer); +		}  		spin_unlock_irq(&timer.it_lock); +		while (error == TIMER_RETRY) { +			/* +			 * We need to handle case when timer was or is in the +			 * middle of firing. In other cases we already freed +			 * resources. +			 */ +			spin_lock_irq(&timer.it_lock); +			error = posix_cpu_timer_del(&timer); +			spin_unlock_irq(&timer.it_lock); +		} +  		if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {  			/*  			 * It actually did fire already. diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 69185ae6b70..7edfe4b901e 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -639,6 +639,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)  {  	struct k_itimer *timr; +	/* +	 * timer_t could be any type >= int and we want to make sure any +	 * @timer_id outside positive int range fails lookup. +	 */ +	if ((unsigned long long)timer_id > INT_MAX) +		return NULL; +  	rcu_read_lock();  	timr = idr_find(&posix_timers_id, (int)timer_id);  	if (timr) { @@ -997,7 +1004,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,  	err = kc->clock_adj(which_clock, &ktx); -	if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) +	if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))  		return -EFAULT;  	return err; diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index ca304046d9e..c6422ffeda9 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend);  void queue_up_suspend_work(void)  { -	if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) +	if (autosleep_state > PM_SUSPEND_ON)  		queue_work(autosleep_wq, &suspend_work);  } diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c16f9167de..d77663bfede 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,  static suspend_state_t decode_state(const char *buf, size_t n)  {  #ifdef CONFIG_SUSPEND -	suspend_state_t state = PM_SUSPEND_STANDBY; +	suspend_state_t state = PM_SUSPEND_MIN;  	const char * const *s;  #endif  	char *p; @@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match);  #endif /* CONFIG_PM_TRACE */ +#ifdef CONFIG_FREEZER +static ssize_t pm_freeze_timeout_show(struct kobject *kobj, +				      struct kobj_attribute *attr, char *buf) +{ +	return sprintf(buf, "%u\n", freeze_timeout_msecs); +} + +static ssize_t pm_freeze_timeout_store(struct kobject *kobj, +				       struct kobj_attribute *attr, +				       const char *buf, size_t n) +{ +	unsigned long val; + +	if (kstrtoul(buf, 10, &val)) +		return -EINVAL; + +	freeze_timeout_msecs = val; +	return n; +} + +power_attr(pm_freeze_timeout); + +#endif	/* CONFIG_FREEZER*/ +  static struct attribute * g[] = {  	&state_attr.attr,  #ifdef CONFIG_PM_TRACE @@ -576,6 +600,9 @@ static struct attribute * g[] = {  	&pm_print_times_attr.attr,  #endif  #endif +#ifdef CONFIG_FREEZER +	&pm_freeze_timeout_attr.attr, +#endif  	NULL,  }; diff --git a/kernel/power/process.c b/kernel/power/process.c index d5a258b60c6..98088e0e71e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -21,7 +21,7 @@  /*    * Timeout for stopping processes   */ -#define 
TIMEOUT	(20 * HZ) +unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;  static int try_to_freeze_tasks(bool user_only)  { @@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only)  	do_gettimeofday(&start); -	end_time = jiffies + TIMEOUT; +	end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);  	if (!user_only)  		freeze_workqueues_begin(); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9322ff7eaad..587dddeebf1 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req,  		return;  	} -	if (delayed_work_pending(&req->work)) -		cancel_delayed_work_sync(&req->work); +	cancel_delayed_work_sync(&req->work);  	if (new_value != req->node.prio)  		pm_qos_update_target( @@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,  		 "%s called for unknown object.", __func__))  		return; -	if (delayed_work_pending(&req->work)) -		cancel_delayed_work_sync(&req->work); +	cancel_delayed_work_sync(&req->work);  	if (new_value != req->node.prio)  		pm_qos_update_target( @@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)  		return;  	} -	if (delayed_work_pending(&req->work)) -		cancel_delayed_work_sync(&req->work); +	cancel_delayed_work_sync(&req->work);  	pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,  			     &req->node, PM_QOS_REMOVE_REQ, diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c8b7446b27d..d4feda084a3 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,12 +30,38 @@  #include "power.h"  const char *const pm_states[PM_SUSPEND_MAX] = { +	[PM_SUSPEND_FREEZE]	= "freeze",  	[PM_SUSPEND_STANDBY]	= "standby",  	[PM_SUSPEND_MEM]	= "mem",  };  static const struct platform_suspend_ops *suspend_ops; +static bool need_suspend_ops(suspend_state_t state) +{ +	return !!(state > PM_SUSPEND_FREEZE); +} + +static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); +static bool suspend_freeze_wake; + +static void freeze_begin(void) +{ +	suspend_freeze_wake = false; +} + +static void freeze_enter(void) +{ +	wait_event(suspend_freeze_wait_head, suspend_freeze_wake); +} + +void freeze_wake(void) +{ +	suspend_freeze_wake = true; +	wake_up(&suspend_freeze_wait_head); +} +EXPORT_SYMBOL_GPL(freeze_wake); +  /**   * suspend_set_ops - Set the global suspend method table.   * @ops: Suspend operations to use. @@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);  bool valid_state(suspend_state_t state)  { +	if (state == PM_SUSPEND_FREEZE) +		return true;  	/* -	 * All states need lowlevel support and need to be valid to the lowlevel +	 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel +	 * support and need to be valid to the lowlevel  	 * implementation, no valid callback implies that none are valid.  	 */  	return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); @@ -89,11 +118,11 @@ static int suspend_test(int level)   * hibernation).  Run suspend notifiers, allocate the "suspend" console and   * freeze processes.   
*/ -static int suspend_prepare(void) +static int suspend_prepare(suspend_state_t state)  {  	int error; -	if (!suspend_ops || !suspend_ops->enter) +	if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))  		return -EPERM;  	pm_prepare_console(); @@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  {  	int error; -	if (suspend_ops->prepare) { +	if (need_suspend_ops(state) && suspend_ops->prepare) {  		error = suspend_ops->prepare();  		if (error)  			goto Platform_finish; @@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  		goto Platform_finish;  	} -	if (suspend_ops->prepare_late) { +	if (need_suspend_ops(state) && suspend_ops->prepare_late) {  		error = suspend_ops->prepare_late();  		if (error)  			goto Platform_wake;  	} +	/* +	 * PM_SUSPEND_FREEZE equals +	 * frozen processes + suspended devices + idle processors. +	 * Thus we should invoke freeze_enter() soon after +	 * all the devices are suspended. +	 */ +	if (state == PM_SUSPEND_FREEZE) { +		freeze_enter(); +		goto Platform_wake; +	} +  	if (suspend_test(TEST_PLATFORM))  		goto Platform_wake; @@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  	enable_nonboot_cpus();   Platform_wake: -	if (suspend_ops->wake) +	if (need_suspend_ops(state) && suspend_ops->wake)  		suspend_ops->wake();  	dpm_resume_start(PMSG_RESUME);   Platform_finish: -	if (suspend_ops->finish) +	if (need_suspend_ops(state) && suspend_ops->finish)  		suspend_ops->finish();  	return error; @@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state)  	int error;  	bool wakeup = false; -	if (!suspend_ops) +	if (need_suspend_ops(state) && !suspend_ops)  		return -ENOSYS;  	trace_machine_suspend(state); -	if (suspend_ops->begin) { +	if (need_suspend_ops(state) && suspend_ops->begin) {  		error = suspend_ops->begin(state);  		if (error)  			goto Close; @@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state)  	do {  		error = suspend_enter(state, &wakeup); -	} while (!error && !wakeup +	} while (!error && !wakeup && need_suspend_ops(state)  		&& suspend_ops->suspend_again && suspend_ops->suspend_again());   Resume_devices: @@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state)  	ftrace_start();  	resume_console();   Close: -	if (suspend_ops->end) +	if (need_suspend_ops(state) && suspend_ops->end)  		suspend_ops->end();  	trace_machine_suspend(PWR_EVENT_EXIT);  	return error;   Recover_platform: -	if (suspend_ops->recover) +	if (need_suspend_ops(state) && suspend_ops->recover)  		suspend_ops->recover();  	goto Resume_devices;  } @@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state)  	if (!mutex_trylock(&pm_mutex))  		return -EBUSY; +	if (state == PM_SUSPEND_FREEZE) +		freeze_begin(); +  	printk(KERN_INFO "PM: Syncing filesystems ... 
");  	sys_sync();  	printk("done.\n");  	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); -	error = suspend_prepare(); +	error = suspend_prepare(state);  	if (error)  		goto Unlock; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 25596e450ac..9b2a1d58558 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)  	rtc_set_alarm(rtc, &alm);  } -static int __init has_wakealarm(struct device *dev, void *name_ptr) +static int __init has_wakealarm(struct device *dev, const void *data)  {  	struct rtc_device *candidate = to_rtc_device(dev); @@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)  	if (!device_may_wakeup(candidate->dev.parent))  		return 0; -	*(const char **)name_ptr = dev_name(dev);  	return 1;  } @@ -159,8 +158,8 @@ static int __init test_suspend(void)  	static char		warn_no_rtc[] __initdata =  		KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; -	char			*pony = NULL;  	struct rtc_device	*rtc = NULL; +	struct device		*dev;  	/* PM is initialized by now; is that state testable? */  	if (test_state == PM_SUSPEND_ON) @@ -171,9 +170,9 @@ static int __init test_suspend(void)  	}  	/* RTCs have initialized by now too ... can we use one? */ -	class_find_device(rtc_class, NULL, &pony, has_wakealarm); -	if (pony) -		rtc = rtc_class_open(pony); +	dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); +	if (dev) +		rtc = rtc_class_open(dev_name(dev));  	if (!rtc) {  		printk(warn_no_rtc);  		goto done; diff --git a/kernel/printk.c b/kernel/printk.c index 357f714ddd4..0b31715f335 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -42,6 +42,7 @@  #include <linux/notifier.h>  #include <linux/rculist.h>  #include <linux/poll.h> +#include <linux/irq_work.h>  #include <asm/uaccess.h> @@ -1967,30 +1968,32 @@ int is_console_locked(void)  static DEFINE_PER_CPU(int, printk_pending);  static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); -void printk_tick(void) +static void wake_up_klogd_work_func(struct irq_work *irq_work)  { -	if (__this_cpu_read(printk_pending)) { -		int pending = __this_cpu_xchg(printk_pending, 0); -		if (pending & PRINTK_PENDING_SCHED) { -			char *buf = __get_cpu_var(printk_sched_buf); -			printk(KERN_WARNING "[sched_delayed] %s", buf); -		} -		if (pending & PRINTK_PENDING_WAKEUP) -			wake_up_interruptible(&log_wait); +	int pending = __this_cpu_xchg(printk_pending, 0); + +	if (pending & PRINTK_PENDING_SCHED) { +		char *buf = __get_cpu_var(printk_sched_buf); +		printk(KERN_WARNING "[sched_delayed] %s", buf);  	} -} -int printk_needs_cpu(int cpu) -{ -	if (cpu_is_offline(cpu)) -		printk_tick(); -	return __this_cpu_read(printk_pending); +	if (pending & PRINTK_PENDING_WAKEUP) +		wake_up_interruptible(&log_wait);  } +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { +	.func = wake_up_klogd_work_func, +	.flags = IRQ_WORK_LAZY, +}; +  void wake_up_klogd(void)  { -	if (waitqueue_active(&log_wait)) +	preempt_disable(); +	if (waitqueue_active(&log_wait)) {  		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); +		irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); +	} +	preempt_enable();  }  static void console_cont_flush(char *text, size_t size) @@ -2471,6 +2474,7 @@ int printk_sched(const char *fmt, ...)  	
va_end(args);  	__this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); +	irq_work_queue(&__get_cpu_var(wake_up_klogd_work));  	local_irq_restore(flags);  	return r; diff --git a/kernel/profile.c b/kernel/profile.c index 1f391819c42..dc3384ee874 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -37,9 +37,6 @@ struct profile_hit {  #define NR_PROFILE_HIT		(PAGE_SIZE/sizeof(struct profile_hit))  #define NR_PROFILE_GRP		(NR_PROFILE_HIT/PROFILE_GRPSZ) -/* Oprofile timer tick hook */ -static int (*timer_hook)(struct pt_regs *) __read_mostly; -  static atomic_t *prof_buffer;  static unsigned long prof_len, prof_shift; @@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)  }  EXPORT_SYMBOL_GPL(profile_event_unregister); -int register_timer_hook(int (*hook)(struct pt_regs *)) -{ -	if (timer_hook) -		return -EBUSY; -	timer_hook = hook; -	return 0; -} -EXPORT_SYMBOL_GPL(register_timer_hook); - -void unregister_timer_hook(int (*hook)(struct pt_regs *)) -{ -	WARN_ON(hook != timer_hook); -	timer_hook = NULL; -	/* make sure all CPUs see the NULL hook */ -	synchronize_sched();  /* Allow ongoing interrupts to complete. */ -} -EXPORT_SYMBOL_GPL(unregister_timer_hook); - -  #ifdef CONFIG_SMP  /*   * Each cpu has a pair of open-addressed hashtables for pending @@ -436,8 +414,6 @@ void profile_tick(int type)  {  	struct pt_regs *regs = get_irq_regs(); -	if (type == CPU_PROFILING && timer_hook) -		timer_hook(regs);  	if (!user_mode(regs) && prof_cpu_mask != NULL &&  	    cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))  		profile_hit(type, (void *)profile_pc(regs)); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 612a5612685..acbd28424d8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -117,11 +117,45 @@ void __ptrace_unlink(struct task_struct *child)  	 * TASK_KILLABLE sleeps.  	 */  	if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) -		signal_wake_up(child, task_is_traced(child)); +		ptrace_signal_wake_up(child, true);  	spin_unlock(&child->sighand->siglock);  } +/* Ensure that nothing can wake it up, even SIGKILL */ +static bool ptrace_freeze_traced(struct task_struct *task) +{ +	bool ret = false; + +	/* Lockless, nobody but us can set this flag */ +	if (task->jobctl & JOBCTL_LISTENING) +		return ret; + +	spin_lock_irq(&task->sighand->siglock); +	if (task_is_traced(task) && !__fatal_signal_pending(task)) { +		task->state = __TASK_TRACED; +		ret = true; +	} +	spin_unlock_irq(&task->sighand->siglock); + +	return ret; +} + +static void ptrace_unfreeze_traced(struct task_struct *task) +{ +	if (task->state != __TASK_TRACED) +		return; + +	WARN_ON(!task->ptrace || task->parent != current); + +	spin_lock_irq(&task->sighand->siglock); +	if (__fatal_signal_pending(task)) +		wake_up_state(task, __TASK_TRACED); +	else +		task->state = TASK_TRACED; +	spin_unlock_irq(&task->sighand->siglock); +} +  /**   * ptrace_check_attach - check whether ptracee is ready for ptrace operation   * @child: ptracee to check for @@ -151,24 +185,29 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)  	 * be changed by us so it's not changing right after this.  	 */  	read_lock(&tasklist_lock); -	if ((child->ptrace & PT_PTRACED) && child->parent == current) { +	if (child->ptrace && child->parent == current) { +		WARN_ON(child->state == __TASK_TRACED);  		/*  		 * child->sighand can't be NULL, release_task()  		 * does ptrace_unlink() before __exit_signal().  		 
*/ -		spin_lock_irq(&child->sighand->siglock); -		WARN_ON_ONCE(task_is_stopped(child)); -		if (ignore_state || (task_is_traced(child) && -				     !(child->jobctl & JOBCTL_LISTENING))) +		if (ignore_state || ptrace_freeze_traced(child))  			ret = 0; -		spin_unlock_irq(&child->sighand->siglock);  	}  	read_unlock(&tasklist_lock); -	if (!ret && !ignore_state) -		ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; +	if (!ret && !ignore_state) { +		if (!wait_task_inactive(child, __TASK_TRACED)) { +			/* +			 * This can only happen if may_ptrace_stop() fails and +			 * ptrace_stop() changes ->state back to TASK_RUNNING, +			 * so we should not worry about leaking __TASK_TRACED. +			 */ +			WARN_ON(child->state == __TASK_TRACED); +			ret = -ESRCH; +		} +	} -	/* All systems go.. */  	return ret;  } @@ -317,7 +356,7 @@ static int ptrace_attach(struct task_struct *task, long request,  	 */  	if (task_is_stopped(task) &&  	    task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) -		signal_wake_up(task, 1); +		signal_wake_up_state(task, __TASK_STOPPED);  	spin_unlock(&task->sighand->siglock); @@ -673,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,  					     kiov->iov_len, kiov->iov_base);  } +/* + * This is declared in linux/regset.h and defined in machine-dependent + * code.  We put the export here, near the primary machine-neutral use, + * to ensure no machine forgets it. + */ +EXPORT_SYMBOL_GPL(task_user_regset_view);  #endif  int ptrace_request(struct task_struct *child, long request, @@ -737,7 +782,7 @@ int ptrace_request(struct task_struct *child, long request,  		 * tracee into STOP.  		 */  		if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) -			signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); +			ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);  		unlock_task_sighand(child, &flags);  		ret = 0; @@ -763,7 +808,7 @@ int ptrace_request(struct task_struct *child, long request,  			 * start of this trap and now.  Trigger re-trap.  			 
*/  			if (child->jobctl & JOBCTL_TRAP_NOTIFY) -				signal_wake_up(child, true); +				ptrace_signal_wake_up(child, true);  			ret = 0;  		}  		unlock_task_sighand(child, &flags); @@ -900,6 +945,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,  		goto out_put_task_struct;  	ret = arch_ptrace(child, request, addr, data); +	if (ret || request != PTRACE_DETACH) +		ptrace_unfreeze_traced(child);   out_put_task_struct:  	put_task_struct(child); @@ -1039,8 +1086,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,  	ret = ptrace_check_attach(child, request == PTRACE_KILL ||  				  request == PTRACE_INTERRUPT); -	if (!ret) +	if (!ret) {  		ret = compat_arch_ptrace(child, request, addr, data); +		if (ret || request != PTRACE_DETACH) +			ptrace_unfreeze_traced(child); +	}   out_put_task_struct:  	put_task_struct(child); diff --git a/kernel/rcu.h b/kernel/rcu.h index 20dfba576c2..7f8e7590e3e 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)  extern int rcu_expedited; +#ifdef CONFIG_RCU_STALL_COMMON + +extern int rcu_cpu_stall_suppress; +int rcu_jiffies_till_stall_check(void); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ +  #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a2cf76177b4..48ab70384a4 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -404,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr);  #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */  #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) -void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) +void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, +			       unsigned long secs, +			       unsigned long c_old, unsigned long c)  { -	trace_rcu_torture_read(rcutorturename, rhp); +	trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);  }  EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);  #else -#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ +	do { } while (0)  #endif + +#ifdef CONFIG_RCU_STALL_COMMON + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA	       (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA	       0 +#endif + +int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; + +module_param(rcu_cpu_stall_suppress, int, 0644); +module_param(rcu_cpu_stall_timeout, int, 0644); + +int rcu_jiffies_till_stall_check(void) +{ +	int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + +	/* +	 * Limit check must be consistent with the Kconfig limits +	 * for CONFIG_RCU_CPU_STALL_TIMEOUT. 
+	 */ +	if (till_stall_check < 3) { +		ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; +		till_stall_check = 3; +	} else if (till_stall_check > 300) { +		ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; +		till_stall_check = 300; +	} +	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} + +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ +	rcu_cpu_stall_suppress = 1; +	return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { +	.notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ +	atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); +	return 0; +} +early_initcall(check_cpu_stall_init); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e7dce58f9c2..a0714a51b6d 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head,  		       void (*func)(struct rcu_head *rcu),  		       struct rcu_ctrlblk *rcp); -#include "rcutiny_plugin.h" -  static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; +#include "rcutiny_plugin.h" +  /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */  static void rcu_idle_enter_common(long long newval)  { @@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);   * interrupts don't count, we must be running at the first interrupt   * level.   */ -int rcu_is_cpu_rrupt_from_idle(void) +static int rcu_is_cpu_rrupt_from_idle(void)  {  	return rcu_dynticks_nesting <= 1;  } @@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void)   */  static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)  { +	reset_cpu_stall_ticks(rcp);  	if (rcp->rcucblist != NULL &&  	    rcp->donetail != rcp->curtail) {  		rcp->donetail = rcp->curtail; @@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu)   */  void rcu_check_callbacks(int cpu, int user)  { +	check_cpu_stalls();  	if (user || rcu_is_cpu_rrupt_from_idle())  		rcu_sched_qs(cpu);  	else if (!in_softirq()) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f85016a2309..8a233002fae 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -33,6 +33,9 @@ struct rcu_ctrlblk {  	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */  	struct rcu_head **curtail;	/* ->next pointer of last CB. */  	RCU_TRACE(long qlen);		/* Number of pending CBs. */ +	RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ +	RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ +	RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */  	RCU_TRACE(char *name);		/* Name of RCU type. 
*/  }; @@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly;  EXPORT_SYMBOL_GPL(rcu_scheduler_active);  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#ifdef CONFIG_RCU_TRACE + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ +	unsigned long j; +	unsigned long js; + +	if (rcu_cpu_stall_suppress) +		return; +	rcp->ticks_this_gp++; +	j = jiffies; +	js = rcp->jiffies_stall; +	if (*rcp->curtail && ULONG_CMP_GE(j, js)) { +		pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", +		       rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, +		       jiffies - rcp->gp_start, rcp->qlen); +		dump_stack(); +	} +	if (*rcp->curtail && ULONG_CMP_GE(j, js)) +		rcp->jiffies_stall = jiffies + +			3 * rcu_jiffies_till_stall_check() + 3; +	else if (ULONG_CMP_GE(j, js)) +		rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void check_cpu_stall_preempt(void); + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) +{ +#ifdef CONFIG_RCU_TRACE +	rcp->ticks_this_gp = 0; +	rcp->gp_start = jiffies; +	rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +#endif /* #ifdef CONFIG_RCU_TRACE */ +} + +static void check_cpu_stalls(void) +{ +	RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); +	RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); +	RCU_TRACE(check_cpu_stall_preempt()); +} +  #ifdef CONFIG_TINY_PREEMPT_RCU  #include <linux/delay.h> @@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void)  		/* Official start of GP. */  		rcu_preempt_ctrlblk.gpnum++;  		RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); +		reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);  		/* Any blocked RCU readers block new GP. */  		if (rcu_preempt_blocked_readers_any()) @@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney");  MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");  MODULE_LICENSE("GPL"); +static void check_cpu_stall_preempt(void) +{ +#ifdef CONFIG_TINY_PREEMPT_RCU +	check_cpu_stall(&rcu_preempt_ctrlblk.rcb); +#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ +} +  #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 31dea01c85f..e1f3a8c9672 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -46,6 +46,7 @@  #include <linux/stat.h>  #include <linux/srcu.h>  #include <linux/slab.h> +#include <linux/trace_clock.h>  #include <asm/byteorder.h>  MODULE_LICENSE("GPL"); @@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");  #define rcu_can_boost() 0  #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#ifdef CONFIG_RCU_TRACE +static u64 notrace rcu_trace_clock_local(void) +{ +	u64 ts = trace_clock_local(); +	unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); +	return ts; +} +#else /* #ifdef CONFIG_RCU_TRACE */ +static u64 notrace rcu_trace_clock_local(void) +{ +	return 0ULL; +} +#endif /* #else #ifdef CONFIG_RCU_TRACE */ +  static unsigned long shutdown_time;	/* jiffies to system shutdown. */  static unsigned long boost_starttime;	/* jiffies of next boost test start. */  DEFINE_MUTEX(boost_mutex);		/* protect setting boost_starttime */ @@ -845,7 +860,7 @@ static int rcu_torture_boost(void *arg)  		/* Wait for the next test interval. 
*/  		oldstarttime = boost_starttime;  		while (ULONG_CMP_LT(jiffies, oldstarttime)) { -			schedule_timeout_uninterruptible(1); +			schedule_timeout_interruptible(oldstarttime - jiffies);  			rcu_stutter_wait("rcu_torture_boost");  			if (kthread_should_stop() ||  			    fullstop != FULLSTOP_DONTSTOP) @@ -1028,7 +1043,6 @@ void rcutorture_trace_dump(void)  		return;  	if (atomic_xchg(&beenhere, 1) != 0)  		return; -	do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);  	ftrace_dump(DUMP_ALL);  } @@ -1042,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused)  {  	int idx;  	int completed; +	int completed_end;  	static DEFINE_RCU_RANDOM(rand);  	static DEFINE_SPINLOCK(rand_lock);  	struct rcu_torture *p;  	int pipe_count; +	unsigned long long ts;  	idx = cur_ops->readlock();  	completed = cur_ops->completed(); +	ts = rcu_trace_clock_local();  	p = rcu_dereference_check(rcu_torture_current,  				  rcu_read_lock_bh_held() ||  				  rcu_read_lock_sched_held() || @@ -1058,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused)  		cur_ops->readunlock(idx);  		return;  	} -	do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);  	if (p->rtort_mbtest == 0)  		atomic_inc(&n_rcu_torture_mberror);  	spin_lock(&rand_lock); @@ -1071,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused)  		/* Should not happen, but... */  		pipe_count = RCU_TORTURE_PIPE_LEN;  	} -	if (pipe_count > 1) +	completed_end = cur_ops->completed(); +	if (pipe_count > 1) { +		do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, +					  completed, completed_end);  		rcutorture_trace_dump(); +	}  	__this_cpu_inc(rcu_torture_count[pipe_count]); -	completed = cur_ops->completed() - completed; +	completed = completed_end - completed;  	if (completed > RCU_TORTURE_PIPE_LEN) {  		/* Should not happen, but... */  		completed = RCU_TORTURE_PIPE_LEN; @@ -1094,11 +1114,13 @@ static int  rcu_torture_reader(void *arg)  {  	int completed; +	int completed_end;  	int idx;  	DEFINE_RCU_RANDOM(rand);  	struct rcu_torture *p;  	int pipe_count;  	struct timer_list t; +	unsigned long long ts;  	VERBOSE_PRINTK_STRING("rcu_torture_reader task started");  	set_user_nice(current, 19); @@ -1112,6 +1134,7 @@ rcu_torture_reader(void *arg)  		}  		idx = cur_ops->readlock();  		completed = cur_ops->completed(); +		ts = rcu_trace_clock_local();  		p = rcu_dereference_check(rcu_torture_current,  					  rcu_read_lock_bh_held() ||  					  rcu_read_lock_sched_held() || @@ -1122,7 +1145,6 @@ rcu_torture_reader(void *arg)  			schedule_timeout_interruptible(HZ);  			continue;  		} -		do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);  		if (p->rtort_mbtest == 0)  			atomic_inc(&n_rcu_torture_mberror);  		cur_ops->read_delay(&rand); @@ -1132,10 +1154,14 @@ rcu_torture_reader(void *arg)  			/* Should not happen, but... */  			pipe_count = RCU_TORTURE_PIPE_LEN;  		} -		if (pipe_count > 1) +		completed_end = cur_ops->completed(); +		if (pipe_count > 1) { +			do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, +						  ts, completed, completed_end);  			rcutorture_trace_dump(); +		}  		__this_cpu_inc(rcu_torture_count[pipe_count]); -		completed = cur_ops->completed() - completed; +		completed = completed_end - completed;  		if (completed > RCU_TORTURE_PIPE_LEN) {  			/* Should not happen, but... 
*/  			completed = RCU_TORTURE_PIPE_LEN; @@ -1301,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void)  				set_cpus_allowed_ptr(reader_tasks[i],  						     shuffle_tmp_mask);  	} -  	if (fakewriter_tasks) {  		for (i = 0; i < nfakewriters; i++)  			if (fakewriter_tasks[i])  				set_cpus_allowed_ptr(fakewriter_tasks[i],  						     shuffle_tmp_mask);  	} -  	if (writer_task)  		set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); -  	if (stats_task)  		set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); +	if (stutter_task) +		set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); +	if (fqs_task) +		set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); +	if (shutdown_task) +		set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); +#ifdef CONFIG_HOTPLUG_CPU +	if (onoff_task) +		set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +	if (stall_task) +		set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); +	if (barrier_cbs_tasks) +		for (i = 0; i < n_barrier_cbs; i++) +			if (barrier_cbs_tasks[i]) +				set_cpus_allowed_ptr(barrier_cbs_tasks[i], +						     shuffle_tmp_mask); +	if (barrier_task) +		set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);  	if (rcu_idle_cpu == -1)  		rcu_idle_cpu = num_online_cpus() - 1; @@ -1749,7 +1791,7 @@ static int rcu_torture_barrier_init(void)  	barrier_cbs_wq =  		kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),  			GFP_KERNEL); -	if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) +	if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)  		return -ENOMEM;  	for (i = 0; i < n_barrier_cbs; i++) {  		init_waitqueue_head(&barrier_cbs_wq[i]); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e441b77b614..5b8ad827fd8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */   * The rcu_scheduler_active variable transitions from zero to one just   * before the first task is spawned.  So when this variable is zero, RCU   * can assume that there is but one task, allowing RCU to (for example) - * optimized synchronize_sched() to a simple barrier().  When this variable + * optimize synchronize_sched() to a simple barrier().  When this variable   * is one, RCU must actually do all the hard work required to detect real   * grace periods.  This variable is also used to suppress boot-time false   * positives from lockdep-RCU error checking. @@ -217,12 +217,6 @@ module_param(blimit, long, 0444);  module_param(qhimark, long, 0444);  module_param(qlowmark, long, 0444); -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; - -module_param(rcu_cpu_stall_suppress, int, 0644); -module_param(rcu_cpu_stall_timeout, int, 0644); -  static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;  static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; @@ -305,17 +299,27 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)  }  /* - * Does the current CPU require a yet-as-unscheduled grace period? + * Does the current CPU require a not-yet-started grace period? + * The caller must have disabled interrupts to prevent races with + * normal callback registry.   
*/  static int  cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)  { -	struct rcu_head **ntp; +	int i; -	ntp = rdp->nxttail[RCU_DONE_TAIL + -			   (ACCESS_ONCE(rsp->completed) != rdp->completed)]; -	return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && -	       !rcu_gp_in_progress(rsp); +	if (rcu_gp_in_progress(rsp)) +		return 0;  /* No, a grace period is already in progress. */ +	if (!rdp->nxttail[RCU_NEXT_TAIL]) +		return 0;  /* No, this is a no-CBs (or offline) CPU. */ +	if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) +		return 1;  /* Yes, this CPU has newly registered callbacks. */ +	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) +		if (rdp->nxttail[i - 1] != rdp->nxttail[i] && +		    ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), +				 rdp->nxtcompleted[i])) +			return 1;  /* Yes, CBs for future grace period. */ +	return 0; /* No grace period needed. */  }  /* @@ -336,7 +340,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)  static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,  				bool user)  { -	trace_rcu_dyntick("Start", oldval, 0); +	trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);  	if (!user && !is_idle_task(current)) {  		struct task_struct *idle = idle_task(smp_processor_id()); @@ -727,7 +731,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);   * interrupt from idle, return true.  The caller must have at least   * disabled preemption.   */ -int rcu_is_cpu_rrupt_from_idle(void) +static int rcu_is_cpu_rrupt_from_idle(void)  {  	return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;  } @@ -793,28 +797,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)  	return 0;  } -static int jiffies_till_stall_check(void) -{ -	int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); - -	/* -	 * Limit check must be consistent with the Kconfig limits -	 * for CONFIG_RCU_CPU_STALL_TIMEOUT. -	 */ -	if (till_stall_check < 3) { -		ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; -		till_stall_check = 3; -	} else if (till_stall_check > 300) { -		ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; -		till_stall_check = 300; -	} -	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} -  static void record_gp_stall_check_time(struct rcu_state *rsp)  {  	rsp->gp_start = jiffies; -	rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); +	rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();  }  /* @@ -857,7 +843,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  		raw_spin_unlock_irqrestore(&rnp->lock, flags);  		return;  	} -	rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; +	rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;  	raw_spin_unlock_irqrestore(&rnp->lock, flags);  	/* @@ -935,7 +921,7 @@ static void print_cpu_stall(struct rcu_state *rsp)  	raw_spin_lock_irqsave(&rnp->lock, flags);  	if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))  		rsp->jiffies_stall = jiffies + -				     3 * jiffies_till_stall_check() + 3; +				     3 * rcu_jiffies_till_stall_check() + 3;  	raw_spin_unlock_irqrestore(&rnp->lock, flags);  	set_need_resched();  /* kick ourselves to get things going. 
*/ @@ -966,12 +952,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)  	}  } -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ -	rcu_cpu_stall_suppress = 1; -	return NOTIFY_DONE; -} -  /**   * rcu_cpu_stall_reset - prevent further stall warnings in current grace period   * @@ -989,15 +969,6 @@ void rcu_cpu_stall_reset(void)  		rsp->jiffies_stall = jiffies + ULONG_MAX / 2;  } -static struct notifier_block rcu_panic_block = { -	.notifier_call = rcu_panic, -}; - -static void __init check_cpu_stall_init(void) -{ -	atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); -} -  /*   * Update CPU-local rcu_data state to record the newly noticed grace period.   * This is used both when we started the grace period and when we notice @@ -1071,6 +1042,145 @@ static void init_callback_list(struct rcu_data *rdp)  }  /* + * Determine the value that ->completed will have at the end of the + * next subsequent grace period.  This is used to tag callbacks so that + * a CPU can invoke callbacks in a timely fashion even if that CPU has + * been dyntick-idle for an extended period with callbacks under the + * influence of RCU_FAST_NO_HZ. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static unsigned long rcu_cbs_completed(struct rcu_state *rsp, +				       struct rcu_node *rnp) +{ +	/* +	 * If RCU is idle, we just wait for the next grace period. +	 * But we can only be sure that RCU is idle if we are looking +	 * at the root rcu_node structure -- otherwise, a new grace +	 * period might have started, but just not yet gotten around +	 * to initializing the current non-root rcu_node structure. +	 */ +	if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) +		return rnp->completed + 1; + +	/* +	 * Otherwise, wait for a possible partial grace period and +	 * then the subsequent full grace period. +	 */ +	return rnp->completed + 2; +} + +/* + * If there is room, assign a ->completed number to any callbacks on + * this CPU that have not already been assigned.  Also accelerate any + * callbacks that were previously assigned a ->completed number that has + * since proven to be too conservative, which can happen if callbacks get + * assigned a ->completed number while RCU is idle, but with reference to + * a non-root rcu_node structure.  This function is idempotent, so it does + * not hurt to call it repeatedly. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +			       struct rcu_data *rdp) +{ +	unsigned long c; +	int i; + +	/* If the CPU has no callbacks, nothing to do. */ +	if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) +		return; + +	/* +	 * Starting from the sublist containing the callbacks most +	 * recently assigned a ->completed number and working down, find the +	 * first sublist that is not assignable to an upcoming grace period. +	 * Such a sublist has something in it (first two tests) and has +	 * a ->completed number assigned that will complete sooner than +	 * the ->completed number for newly arrived callbacks (last test). +	 * +	 * The key point is that any later sublist can be assigned the +	 * same ->completed number as the newly arrived callbacks, which +	 * means that the callbacks in any of these later sublist can be +	 * grouped into a single sublist, whether or not they have already +	 * been assigned a ->completed number. 
+	 */ +	c = rcu_cbs_completed(rsp, rnp); +	for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) +		if (rdp->nxttail[i] != rdp->nxttail[i - 1] && +		    !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) +			break; + +	/* +	 * If there are no sublist for unassigned callbacks, leave. +	 * At the same time, advance "i" one sublist, so that "i" will +	 * index into the sublist where all the remaining callbacks should +	 * be grouped into. +	 */ +	if (++i >= RCU_NEXT_TAIL) +		return; + +	/* +	 * Assign all subsequent callbacks' ->completed number to the next +	 * full grace period and group them all in the sublist initially +	 * indexed by "i". +	 */ +	for (; i <= RCU_NEXT_TAIL; i++) { +		rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; +		rdp->nxtcompleted[i] = c; +	} + +	/* Trace depending on how much we were able to accelerate. */ +	if (!*rdp->nxttail[RCU_WAIT_TAIL]) +		trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); +	else +		trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); +} + +/* + * Move any callbacks whose grace period has completed to the + * RCU_DONE_TAIL sublist, then compact the remaining sublists and + * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL + * sublist.  This function is idempotent, so it does not hurt to + * invoke it repeatedly.  As long as it is not invoked -too- often... + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +			    struct rcu_data *rdp) +{ +	int i, j; + +	/* If the CPU has no callbacks, nothing to do. */ +	if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) +		return; + +	/* +	 * Find all callbacks whose ->completed numbers indicate that they +	 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. +	 */ +	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { +		if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) +			break; +		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; +	} +	/* Clean up any sublist tail pointers that were misordered above. */ +	for (j = RCU_WAIT_TAIL; j < i; j++) +		rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; + +	/* Copy down callbacks to fill in empty sublists. */ +	for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { +		if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) +			break; +		rdp->nxttail[j] = rdp->nxttail[i]; +		rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; +	} + +	/* Classify any remaining callbacks. */ +	rcu_accelerate_cbs(rsp, rnp, rdp); +} + +/*   * Advance this CPU's callbacks, but only if the current grace period   * has ended.  This may be called only from the CPU to whom the rdp   * belongs.  In addition, the corresponding leaf rcu_node structure's @@ -1080,12 +1190,15 @@ static void  __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)  {  	/* Did another grace period end? */ -	if (rdp->completed != rnp->completed) { +	if (rdp->completed == rnp->completed) { -		/* Advance callbacks.  No harm if list empty. */ -		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; -		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; -		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; +		/* No, so just accelerate recent callbacks. */ +		rcu_accelerate_cbs(rsp, rnp, rdp); + +	} else { + +		/* Advance callbacks. */ +		rcu_advance_cbs(rsp, rnp, rdp);  		/* Remember that we saw this grace-period completion. 
*/  		rdp->completed = rnp->completed; @@ -1392,17 +1505,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)  	/*  	 * Because there is no grace period in progress right now,  	 * any callbacks we have up to this point will be satisfied -	 * by the next grace period.  So promote all callbacks to be -	 * handled after the end of the next grace period.  If the -	 * CPU is not yet aware of the end of the previous grace period, -	 * we need to allow for the callback advancement that will -	 * occur when it does become aware.  Deadlock prevents us from -	 * making it aware at this point: We cannot acquire a leaf -	 * rcu_node ->lock while holding the root rcu_node ->lock. +	 * by the next grace period.  So this is a good place to +	 * assign a grace period number to recently posted callbacks.  	 */ -	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; -	if (rdp->completed == rsp->completed) -		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; +	rcu_accelerate_cbs(rsp, rnp, rdp);  	rsp->gp_flags = RCU_GP_FLAG_INIT;  	raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ @@ -1527,7 +1633,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)  		 * This GP can't end until cpu checks in, so all of our  		 * callbacks can be processed during the next GP.  		 */ -		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; +		rcu_accelerate_cbs(rsp, rnp, rdp);  		rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */  	} @@ -1779,7 +1885,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  	long bl, count, count_lazy;  	int i; -	/* If no callbacks are ready, just return.*/ +	/* If no callbacks are ready, just return. */  	if (!cpu_has_callbacks_ready_to_invoke(rdp)) {  		trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);  		trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), @@ -2008,19 +2114,19 @@ __rcu_process_callbacks(struct rcu_state *rsp)  	WARN_ON_ONCE(rdp->beenonline == 0); -	/* -	 * Advance callbacks in response to end of earlier grace -	 * period that some other CPU ended. -	 */ +	/* Handle the end of a grace period that some other CPU ended.  */  	rcu_process_gp_end(rsp, rdp);  	/* Update RCU state based on any recent quiescent states. */  	rcu_check_quiescent_state(rsp, rdp);  	/* Does this CPU require a not-yet-started grace period? */ +	local_irq_save(flags);  	if (cpu_needs_another_gp(rsp, rdp)) { -		raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); +		raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */  		rcu_start_gp(rsp, flags);  /* releases above lock */ +	} else { +		local_irq_restore(flags);  	}  	/* If there are callbacks ready, invoke them. */ @@ -2719,9 +2825,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)  	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);  	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);  	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); -#ifdef CONFIG_RCU_USER_QS -	WARN_ON_ONCE(rdp->dynticks->in_user); -#endif  	rdp->cpu = cpu;  	rdp->rsp = rsp;  	rcu_boot_init_nocb_percpu_data(rdp); @@ -2938,6 +3041,10 @@ static void __init rcu_init_one(struct rcu_state *rsp,  	BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */ +	/* Silence gcc 4.8 warning about array index out of range. */ +	if (rcu_num_lvls > RCU_NUM_LVLS) +		panic("rcu_init_one: rcu_num_lvls overflow"); +  	/* Initialize the level-tracking arrays. 
*/  	for (i = 0; i < rcu_num_lvls; i++) @@ -3074,7 +3181,6 @@ void __init rcu_init(void)  	cpu_notifier(rcu_cpu_notify, 0);  	for_each_online_cpu(cpu)  		rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); -	check_cpu_stall_init();  }  #include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4b69291b093..c896b5045d9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -102,10 +102,6 @@ struct rcu_dynticks {  				    /* idle-period nonlazy_posted snapshot. */  	int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */  #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -#ifdef CONFIG_RCU_USER_QS -	bool ignore_user_qs;	    /* Treat userspace as extended QS or not */ -	bool in_user;		    /* Is the CPU in userland from RCU POV? */ -#endif  };  /* RCU's kthread states for tracing. */ @@ -282,6 +278,8 @@ struct rcu_data {  	 */  	struct rcu_head *nxtlist;  	struct rcu_head **nxttail[RCU_NEXT_SIZE]; +	unsigned long	nxtcompleted[RCU_NEXT_SIZE]; +					/* grace periods for sublists. */  	long		qlen_lazy;	/* # of lazy queued callbacks */  	long		qlen;		/* # of queued callbacks, incl lazy */  	long		qlen_last_fqs_check; @@ -343,11 +341,6 @@ struct rcu_data {  #define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */ -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA	       (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA	       0 -#endif  #define RCU_STALL_RAT_DELAY		2	/* Allow other CPUs time */  						/*  to take at least one */  						/*  scheduling clock irq */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f6e5ec2932b..c1cc7e17ff9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -40,8 +40,7 @@  #ifdef CONFIG_RCU_NOCB_CPU  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */  static bool have_rcu_nocb_mask;	    /* Was rcu_nocb_mask allocated? */ -static bool rcu_nocb_poll;	    /* Offload kthread are to poll. */ -module_param(rcu_nocb_poll, bool, 0444); +static bool __read_mostly rcu_nocb_poll;    /* Offload kthread are to poll. */  static char __initdata nocb_buf[NR_CPUS * 5];  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ @@ -2159,6 +2158,13 @@ static int __init rcu_nocb_setup(char *str)  }  __setup("rcu_nocbs=", rcu_nocb_setup); +static int __init parse_rcu_nocb_poll(char *arg) +{ +	rcu_nocb_poll = 1; +	return 0; +} +early_param("rcu_nocb_poll", parse_rcu_nocb_poll); +  /* Is the specified CPU a no-CPUs CPU? */  static bool is_nocb_cpu(int cpu)  { @@ -2366,10 +2372,11 @@ static int rcu_nocb_kthread(void *arg)  	for (;;) {  		/* If not polling, wait for next batch of callbacks. 
*/  		if (!rcu_nocb_poll) -			wait_event(rdp->nocb_wq, rdp->nocb_head); +			wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);  		list = ACCESS_ONCE(rdp->nocb_head);  		if (!list) {  			schedule_timeout_interruptible(1); +			flush_signals(current);  			continue;  		} diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 16502d3a71c..13b243a323f 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -17,6 +17,7 @@   * See rt.c in preempt-rt for proper credits and further information   */  #include <linux/sched.h> +#include <linux/sched/rt.h>  #include <linux/delay.h>  #include <linux/export.h>  #include <linux/spinlock.h> diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 98ec4947546..7890b10084a 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -10,6 +10,7 @@  #include <linux/kthread.h>  #include <linux/export.h>  #include <linux/sched.h> +#include <linux/sched/rt.h>  #include <linux/spinlock.h>  #include <linux/timer.h>  #include <linux/freezer.h> diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a242e691c99..1e09308bf2a 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -13,6 +13,7 @@  #include <linux/spinlock.h>  #include <linux/export.h>  #include <linux/sched.h> +#include <linux/sched/rt.h>  #include <linux/timer.h>  #include "rtmutex_common.h" diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a..64de5f8b0c9 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)  	ag->tg->rt_se = NULL;  	ag->tg->rt_rq = NULL;  #endif +	sched_offline_group(ag->tg);  	sched_destroy_group(ag->tg);  } @@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)  	if (IS_ERR(tg))  		goto out_free; +	sched_online_group(tg, &root_task_group); +  	kref_init(&ag->kref);  	init_rwsem(&ag->lock);  	ag->id = atomic_inc_return(&autogroup_seq_nr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb..2b5243176ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -83,7 +83,7 @@  #endif  #include "sched.h" -#include "../workqueue_sched.h" +#include "../workqueue_internal.h"  #include "../smpboot.h"  #define CREATE_TRACE_POINTS @@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);   */  static int select_fallback_rq(int cpu, struct task_struct *p)  { -	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); +	int nid = cpu_to_node(cpu); +	const struct cpumask *nodemask = NULL;  	enum { cpuset, possible, fail } state = cpuset;  	int dest_cpu; -	/* Look for allowed, online CPU in same node. */ -	for_each_cpu(dest_cpu, nodemask) { -		if (!cpu_online(dest_cpu)) -			continue; -		if (!cpu_active(dest_cpu)) -			continue; -		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) -			return dest_cpu; +	/* +	 * If the node that the cpu is on has been offlined, cpu_to_node() +	 * will return -1. There is no cpu on the node, and we should +	 * select the cpu on the other node. +	 */ +	if (nid != -1) { +		nodemask = cpumask_of_node(nid); + +		/* Look for allowed, online CPU in same node. 
*/ +		for_each_cpu(dest_cpu, nodemask) { +			if (!cpu_online(dest_cpu)) +				continue; +			if (!cpu_active(dest_cpu)) +				continue; +			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) +				return dest_cpu; +		}  	}  	for (;;) { @@ -1523,7 +1533,8 @@ out:   */  int wake_up_process(struct task_struct *p)  { -	return try_to_wake_up(p, TASK_ALL, 0); +	WARN_ON(task_is_stopped_or_traced(p)); +	return try_to_wake_up(p, TASK_NORMAL, 0);  }  EXPORT_SYMBOL(wake_up_process); @@ -1968,11 +1979,10 @@ context_switch(struct rq *rq, struct task_struct *prev,  }  /* - * nr_running, nr_uninterruptible and nr_context_switches: + * nr_running and nr_context_switches:   *   * externally visible scheduler statistics: current number of runnable - * threads, current number of uninterruptible-sleeping threads, total - * number of context switches performed since bootup. + * threads, total number of context switches performed since bootup.   */  unsigned long nr_running(void)  { @@ -1984,23 +1994,6 @@ unsigned long nr_running(void)  	return sum;  } -unsigned long nr_uninterruptible(void) -{ -	unsigned long i, sum = 0; - -	for_each_possible_cpu(i) -		sum += cpu_rq(i)->nr_uninterruptible; - -	/* -	 * Since we read the counters lockless, it might be slightly -	 * inaccurate. Do not allow it to go below zero though: -	 */ -	if (unlikely((long)sum < 0)) -		sum = 0; - -	return sum; -} -  unsigned long long nr_context_switches(void)  {  	int i; @@ -2785,7 +2778,7 @@ static noinline void __schedule_bug(struct task_struct *prev)  	if (irqs_disabled())  		print_irqtrace_events(prev);  	dump_stack(); -	add_taint(TAINT_WARN); +	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);  }  /* @@ -4363,20 +4356,32 @@ EXPORT_SYMBOL(yield);   * It's the caller's job to ensure that the target task struct   * can't go away on us before we can do any checks.   * - * Returns true if we indeed boosted the target task. + * Returns: + *	true (>0) if we indeed boosted the target task. + *	false (0) if we failed to boost the target. + *	-ESRCH if there's no task to yield to.   */  bool __sched yield_to(struct task_struct *p, bool preempt)  {  	struct task_struct *curr = current;  	struct rq *rq, *p_rq;  	unsigned long flags; -	bool yielded = 0; +	int yielded = 0;  	local_irq_save(flags);  	rq = this_rq();  again:  	p_rq = task_rq(p); +	/* +	 * If we're the only runnable task on the rq and target rq also +	 * has only one task, there's absolutely no point in yielding. 
+	 */ +	if (rq->nr_running == 1 && p_rq->nr_running == 1) { +		yielded = -ESRCH; +		goto out_irq; +	} +  	double_rq_lock(rq, p_rq);  	while (task_rq(p) != p_rq) {  		double_rq_unlock(rq, p_rq); @@ -4384,13 +4389,13 @@ again:  	}  	if (!curr->sched_class->yield_to_task) -		goto out; +		goto out_unlock;  	if (curr->sched_class != p->sched_class) -		goto out; +		goto out_unlock;  	if (task_running(p_rq, p) || p->state) -		goto out; +		goto out_unlock;  	yielded = curr->sched_class->yield_to_task(rq, p, preempt);  	if (yielded) { @@ -4403,11 +4408,12 @@ again:  			resched_task(p_rq->curr);  	} -out: +out_unlock:  	double_rq_unlock(rq, p_rq); +out_irq:  	local_irq_restore(flags); -	if (yielded) +	if (yielded > 0)  		schedule();  	return yielded; @@ -4666,6 +4672,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)  	 */  	idle->sched_class = &idle_sched_class;  	ftrace_graph_init_idle_task(idle, cpu); +	vtime_init_idle(idle);  #if defined(CONFIG_SMP)  	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);  #endif @@ -7159,7 +7166,6 @@ static void free_sched_group(struct task_group *tg)  struct task_group *sched_create_group(struct task_group *parent)  {  	struct task_group *tg; -	unsigned long flags;  	tg = kzalloc(sizeof(*tg), GFP_KERNEL);  	if (!tg) @@ -7171,6 +7177,17 @@ struct task_group *sched_create_group(struct task_group *parent)  	if (!alloc_rt_sched_group(tg, parent))  		goto err; +	return tg; + +err: +	free_sched_group(tg); +	return ERR_PTR(-ENOMEM); +} + +void sched_online_group(struct task_group *tg, struct task_group *parent) +{ +	unsigned long flags; +  	spin_lock_irqsave(&task_group_lock, flags);  	list_add_rcu(&tg->list, &task_groups); @@ -7180,12 +7197,6 @@ struct task_group *sched_create_group(struct task_group *parent)  	INIT_LIST_HEAD(&tg->children);  	list_add_rcu(&tg->siblings, &parent->children);  	spin_unlock_irqrestore(&task_group_lock, flags); - -	return tg; - -err: -	free_sched_group(tg); -	return ERR_PTR(-ENOMEM);  }  /* rcu callback to free various structures associated with a task group */ @@ -7198,6 +7209,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)  /* Destroy runqueue etc associated with a task group */  void sched_destroy_group(struct task_group *tg)  { +	/* wait for possible concurrent references to cfs_rqs complete */ +	call_rcu(&tg->rcu, free_sched_group_rcu); +} + +void sched_offline_group(struct task_group *tg) +{  	unsigned long flags;  	int i; @@ -7209,9 +7226,6 @@ void sched_destroy_group(struct task_group *tg)  	list_del_rcu(&tg->list);  	list_del_rcu(&tg->siblings);  	spin_unlock_irqrestore(&task_group_lock, flags); - -	/* wait for possible concurrent references to cfs_rqs complete */ -	call_rcu(&tg->rcu, free_sched_group_rcu);  }  /* change task's runqueue when it moves between groups. @@ -7507,6 +7521,25 @@ static int sched_rt_global_constraints(void)  }  #endif /* CONFIG_RT_GROUP_SCHED */ +int sched_rr_handler(struct ctl_table *table, int write, +		void __user *buffer, size_t *lenp, +		loff_t *ppos) +{ +	int ret; +	static DEFINE_MUTEX(mutex); + +	mutex_lock(&mutex); +	ret = proc_dointvec(table, write, buffer, lenp, ppos); +	/* make sure that internally we keep jiffies */ +	/* also, writing zero resets timeslice to default */ +	if (!ret && write) { +		sched_rr_timeslice = sched_rr_timeslice <= 0 ? 
+			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); +	} +	mutex_unlock(&mutex); +	return ret; +} +  int sched_rt_handler(struct ctl_table *table, int write,  		void __user *buffer, size_t *lenp,  		loff_t *ppos) @@ -7563,6 +7596,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)  	return &tg->css;  } +static int cpu_cgroup_css_online(struct cgroup *cgrp) +{ +	struct task_group *tg = cgroup_tg(cgrp); +	struct task_group *parent; + +	if (!cgrp->parent) +		return 0; + +	parent = cgroup_tg(cgrp->parent); +	sched_online_group(tg, parent); +	return 0; +} +  static void cpu_cgroup_css_free(struct cgroup *cgrp)  {  	struct task_group *tg = cgroup_tg(cgrp); @@ -7570,6 +7616,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)  	sched_destroy_group(tg);  } +static void cpu_cgroup_css_offline(struct cgroup *cgrp) +{ +	struct task_group *tg = cgroup_tg(cgrp); + +	sched_offline_group(tg); +} +  static int cpu_cgroup_can_attach(struct cgroup *cgrp,  				 struct cgroup_taskset *tset)  { @@ -7925,6 +7978,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {  	.name		= "cpu",  	.css_alloc	= cpu_cgroup_css_alloc,  	.css_free	= cpu_cgroup_css_free, +	.css_online	= cpu_cgroup_css_online, +	.css_offline	= cpu_cgroup_css_offline,  	.can_attach	= cpu_cgroup_can_attach,  	.attach		= cpu_cgroup_attach,  	.exit		= cpu_cgroup_exit, diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 23aa789c53e..1095e878a46 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -28,6 +28,8 @@   */  #include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/sched/rt.h>  #include "cpupri.h"  /* Convert between a 140 based task->prio, and our 102 based cpupri */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 293b202fcf7..ed12cbb135f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -3,6 +3,7 @@  #include <linux/tsacct_kern.h>  #include <linux/kernel_stat.h>  #include <linux/static_key.h> +#include <linux/context_tracking.h>  #include "sched.h" @@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,  	task_group_account_field(p, index, (__force u64) cputime);  	/* Account for user time used */ -	acct_update_integrals(p); +	acct_account_cputime(p);  }  /* @@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,  	task_group_account_field(p, index, (__force u64) cputime);  	/* Account for system time used */ -	acct_update_integrals(p); +	acct_account_cputime(p);  }  /* @@ -295,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void)  void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)  {  	struct signal_struct *sig = tsk->signal; +	cputime_t utime, stime;  	struct task_struct *t;  	times->utime = sig->utime; @@ -308,16 +310,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)  	t = tsk;  	do { -		times->utime += t->utime; -		times->stime += t->stime; +		task_cputime(tsk, &utime, &stime); +		times->utime += utime; +		times->stime += stime;  		times->sum_exec_runtime += task_sched_runtime(t);  	} while_each_thread(tsk, t);  out:  	rcu_read_unlock();  } -#ifndef CONFIG_VIRT_CPU_ACCOUNTING -  #ifdef CONFIG_IRQ_TIME_ACCOUNTING  /*   * Account a tick to a process and cpustat @@ -382,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks)  		irqtime_account_process_tick(current, 0, rq);  }  #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static void irqtime_account_idle_ticks(int ticks) {} -static void 
irqtime_account_process_tick(struct task_struct *p, int user_tick, +static inline void irqtime_account_idle_ticks(int ticks) {} +static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,  						struct rq *rq) {}  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE  /*   * Account a single tick of cpu time.   * @p: the process that the cpu time gets accounted to @@ -397,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick)  	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);  	struct rq *rq = this_rq(); +	if (vtime_accounting_enabled()) +		return; +  	if (sched_clock_irqtime) {  		irqtime_account_process_tick(p, user_tick, rq);  		return; @@ -438,8 +443,7 @@ void account_idle_ticks(unsigned long ticks)  	account_idle_time(jiffies_to_cputime(ticks));  } - -#endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */  /*   * Use precise platform statistics if available: @@ -461,25 +465,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime  	*st = cputime.stime;  } -void vtime_account_system_irqsafe(struct task_struct *tsk) -{ -	unsigned long flags; - -	local_irq_save(flags); -	vtime_account_system(tsk); -	local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); -  #ifndef __ARCH_HAS_VTIME_TASK_SWITCH  void vtime_task_switch(struct task_struct *prev)  { +	if (!vtime_accounting_enabled()) +		return; +  	if (is_idle_task(prev))  		vtime_account_idle(prev);  	else  		vtime_account_system(prev); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE  	vtime_account_user(prev); +#endif  	arch_vtime_task_switch(prev);  }  #endif @@ -493,27 +492,40 @@ void vtime_task_switch(struct task_struct *prev)   * vtime_account().   */  #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account(struct task_struct *tsk) +void vtime_account_irq_enter(struct task_struct *tsk)  { -	if (in_interrupt() || !is_idle_task(tsk)) -		vtime_account_system(tsk); -	else -		vtime_account_idle(tsk); +	if (!vtime_accounting_enabled()) +		return; + +	if (!in_interrupt()) { +		/* +		 * If we interrupted user, context_tracking_in_user() +		 * is 1 because the context tracking don't hook +		 * on irq entry/exit. This way we know if +		 * we need to flush user time on kernel entry. 
+		 */ +		if (context_tracking_in_user()) { +			vtime_account_user(tsk); +			return; +		} + +		if (is_idle_task(tsk)) { +			vtime_account_idle(tsk); +			return; +		} +	} +	vtime_account_system(tsk);  } -EXPORT_SYMBOL_GPL(vtime_account); +EXPORT_SYMBOL_GPL(vtime_account_irq_enter);  #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -#else - -#ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs) -#endif +#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ -static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)  {  	u64 temp = (__force u64) rtime; -	temp *= (__force u64) utime; +	temp *= (__force u64) stime;  	if (sizeof(cputime_t) == 4)  		temp = div_u64(temp, (__force u32) total); @@ -531,10 +543,10 @@ static void cputime_adjust(struct task_cputime *curr,  			   struct cputime *prev,  			   cputime_t *ut, cputime_t *st)  { -	cputime_t rtime, utime, total; +	cputime_t rtime, stime, total; -	utime = curr->utime; -	total = utime + curr->stime; +	stime = curr->stime; +	total = stime + curr->utime;  	/*  	 * Tick based cputime accounting depend on random scheduling @@ -549,17 +561,17 @@ static void cputime_adjust(struct task_cputime *curr,  	rtime = nsecs_to_cputime(curr->sum_exec_runtime);  	if (total) -		utime = scale_utime(utime, rtime, total); +		stime = scale_stime(stime, rtime, total);  	else -		utime = rtime; +		stime = rtime;  	/*  	 * If the tick based count grows faster than the scheduler one,  	 * the result of the scaling may go backward.  	 * Let's enforce monotonicity.  	 */ -	prev->utime = max(prev->utime, utime); -	prev->stime = max(prev->stime, rtime - prev->utime); +	prev->stime = max(prev->stime, stime); +	prev->utime = max(prev->utime, rtime - prev->stime);  	*ut = prev->utime;  	*st = prev->stime; @@ -568,11 +580,10 @@ static void cputime_adjust(struct task_cputime *curr,  void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)  {  	struct task_cputime cputime = { -		.utime = p->utime, -		.stime = p->stime,  		.sum_exec_runtime = p->se.sum_exec_runtime,  	}; +	task_cputime(p, &cputime.utime, &cputime.stime);  	cputime_adjust(&cputime, &p->prev_cputime, ut, st);  } @@ -586,4 +597,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime  	thread_group_cputime(p, &cputime);  	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);  } -#endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +static unsigned long long vtime_delta(struct task_struct *tsk) +{ +	unsigned long long clock; + +	clock = local_clock(); +	if (clock < tsk->vtime_snap) +		return 0; + +	return clock - tsk->vtime_snap; +} + +static cputime_t get_vtime_delta(struct task_struct *tsk) +{ +	unsigned long long delta = vtime_delta(tsk); + +	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); +	tsk->vtime_snap += delta; + +	/* CHECKME: always safe to convert nsecs to cputime? 
*/ +	return nsecs_to_cputime(delta); +} + +static void __vtime_account_system(struct task_struct *tsk) +{ +	cputime_t delta_cpu = get_vtime_delta(tsk); + +	account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); +} + +void vtime_account_system(struct task_struct *tsk) +{ +	if (!vtime_accounting_enabled()) +		return; + +	write_seqlock(&tsk->vtime_seqlock); +	__vtime_account_system(tsk); +	write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_irq_exit(struct task_struct *tsk) +{ +	if (!vtime_accounting_enabled()) +		return; + +	write_seqlock(&tsk->vtime_seqlock); +	if (context_tracking_in_user()) +		tsk->vtime_snap_whence = VTIME_USER; +	__vtime_account_system(tsk); +	write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_user(struct task_struct *tsk) +{ +	cputime_t delta_cpu; + +	if (!vtime_accounting_enabled()) +		return; + +	delta_cpu = get_vtime_delta(tsk); + +	write_seqlock(&tsk->vtime_seqlock); +	tsk->vtime_snap_whence = VTIME_SYS; +	account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); +	write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_user_enter(struct task_struct *tsk) +{ +	if (!vtime_accounting_enabled()) +		return; + +	write_seqlock(&tsk->vtime_seqlock); +	tsk->vtime_snap_whence = VTIME_USER; +	__vtime_account_system(tsk); +	write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_enter(struct task_struct *tsk) +{ +	write_seqlock(&tsk->vtime_seqlock); +	__vtime_account_system(tsk); +	current->flags |= PF_VCPU; +	write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_exit(struct task_struct *tsk) +{ +	write_seqlock(&tsk->vtime_seqlock); +	__vtime_account_system(tsk); +	current->flags &= ~PF_VCPU; +	write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_idle(struct task_struct *tsk) +{ +	cputime_t delta_cpu = get_vtime_delta(tsk); + +	account_idle_time(delta_cpu); +} + +bool vtime_accounting_enabled(void) +{ +	return context_tracking_active(); +} + +void arch_vtime_task_switch(struct task_struct *prev) +{ +	write_seqlock(&prev->vtime_seqlock); +	prev->vtime_snap_whence = VTIME_SLEEPING; +	write_sequnlock(&prev->vtime_seqlock); + +	write_seqlock(¤t->vtime_seqlock); +	current->vtime_snap_whence = VTIME_SYS; +	current->vtime_snap = sched_clock(); +	write_sequnlock(¤t->vtime_seqlock); +} + +void vtime_init_idle(struct task_struct *t) +{ +	unsigned long flags; + +	write_seqlock_irqsave(&t->vtime_seqlock, flags); +	t->vtime_snap_whence = VTIME_SYS; +	t->vtime_snap = sched_clock(); +	write_sequnlock_irqrestore(&t->vtime_seqlock, flags); +} + +cputime_t task_gtime(struct task_struct *t) +{ +	unsigned int seq; +	cputime_t gtime; + +	do { +		seq = read_seqbegin(&t->vtime_seqlock); + +		gtime = t->gtime; +		if (t->flags & PF_VCPU) +			gtime += vtime_delta(t); + +	} while (read_seqretry(&t->vtime_seqlock, seq)); + +	return gtime; +} + +/* + * Fetch cputime raw values from fields of task_struct and + * add up the pending nohz execution time since the last + * cputime snapshot. 
+ */ +static void +fetch_task_cputime(struct task_struct *t, +		   cputime_t *u_dst, cputime_t *s_dst, +		   cputime_t *u_src, cputime_t *s_src, +		   cputime_t *udelta, cputime_t *sdelta) +{ +	unsigned int seq; +	unsigned long long delta; + +	do { +		*udelta = 0; +		*sdelta = 0; + +		seq = read_seqbegin(&t->vtime_seqlock); + +		if (u_dst) +			*u_dst = *u_src; +		if (s_dst) +			*s_dst = *s_src; + +		/* Task is sleeping, nothing to add */ +		if (t->vtime_snap_whence == VTIME_SLEEPING || +		    is_idle_task(t)) +			continue; + +		delta = vtime_delta(t); + +		/* +		 * Task runs either in user or kernel space, add pending nohz time to +		 * the right place. +		 */ +		if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { +			*udelta = delta; +		} else { +			if (t->vtime_snap_whence == VTIME_SYS) +				*sdelta = delta; +		} +	} while (read_seqretry(&t->vtime_seqlock, seq)); +} + + +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) +{ +	cputime_t udelta, sdelta; + +	fetch_task_cputime(t, utime, stime, &t->utime, +			   &t->stime, &udelta, &sdelta); +	if (utime) +		*utime += udelta; +	if (stime) +		*stime += sdelta; +} + +void task_cputime_scaled(struct task_struct *t, +			 cputime_t *utimescaled, cputime_t *stimescaled) +{ +	cputime_t udelta, sdelta; + +	fetch_task_cputime(t, utimescaled, stimescaled, +			   &t->utimescaled, &t->stimescaled, &udelta, &sdelta); +	if (utimescaled) +		*utimescaled += cputime_to_scaled(udelta); +	if (stimescaled) +		*stimescaled += cputime_to_scaled(sdelta); +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2cd3c1b4e58..75024a67352 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)  	if (autogroup_path(tg, group_path, PATH_MAX))  		return group_path; -	/* -	 * May be NULL if the underlying cgroup isn't fully-created yet -	 */ -	if (!tg->css.cgroup) { -		group_path[0] = '\0'; -		return group_path; -	}  	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);  	return group_path;  } @@ -222,8 +215,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)  			cfs_rq->runnable_load_avg);  	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",  			cfs_rq->blocked_load_avg); -	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_avg", -			atomic64_read(&cfs_rq->tg->load_avg)); +	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_avg", +			(unsigned long long)atomic64_read(&cfs_rq->tg->load_avg));  	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",  			cfs_rq->tg_load_contrib);  	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib", @@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu)  	{  		unsigned int freq = cpu_khz ? 
: 1; -		SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", +		SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",  			   cpu, freq / 1000, (freq % 1000));  	}  #else -	SEQ_printf(m, "\ncpu#%d\n", cpu); +	SEQ_printf(m, "cpu#%d\n", cpu);  #endif  #define P(x)								\ @@ -330,6 +323,7 @@ do {									\  	print_rq(m, rq, cpu);  	rcu_read_unlock();  	spin_unlock_irqrestore(&sched_debug_lock, flags); +	SEQ_printf(m, "\n");  }  static const char *sched_tunable_scaling_names[] = { @@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = {  	"linear"  }; -static int sched_debug_show(struct seq_file *m, void *v) +static void sched_debug_header(struct seq_file *m)  {  	u64 ktime, sched_clk, cpu_clk;  	unsigned long flags; -	int cpu;  	local_irq_save(flags);  	ktime = ktime_to_ns(ktime_get()); @@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v)  #undef PN  #undef P -	SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", +	SEQ_printf(m, "  .%-40s: %d (%s)\n", +		"sysctl_sched_tunable_scaling",  		sysctl_sched_tunable_scaling,  		sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); +	SEQ_printf(m, "\n"); +} -	for_each_online_cpu(cpu) -		print_cpu(m, cpu); +static int sched_debug_show(struct seq_file *m, void *v) +{ +	int cpu = (unsigned long)(v - 2); -	SEQ_printf(m, "\n"); +	if (cpu != -1) +		print_cpu(m, cpu); +	else +		sched_debug_header(m);  	return 0;  }  void sysrq_sched_debug_show(void)  { -	sched_debug_show(NULL, NULL); +	int cpu; + +	sched_debug_header(NULL); +	for_each_online_cpu(cpu) +		print_cpu(NULL, cpu); + +} + +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *sched_debug_start(struct seq_file *file, loff_t *offset) +{ +	unsigned long n = *offset; + +	if (n == 0) +		return (void *) 1; + +	n--; + +	if (n > 0) +		n = cpumask_next(n - 1, cpu_online_mask); +	else +		n = cpumask_first(cpu_online_mask); + +	*offset = n + 1; + +	if (n < nr_cpu_ids) +		return (void *)(unsigned long)(n + 2); +	return NULL; +} + +static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset) +{ +	(*offset)++; +	return sched_debug_start(file, offset); +} + +static void sched_debug_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations sched_debug_sops = { +	.start = sched_debug_start, +	.next = sched_debug_next, +	.stop = sched_debug_stop, +	.show = sched_debug_show, +}; + +static int sched_debug_release(struct inode *inode, struct file *file) +{ +	seq_release(inode, file); + +	return 0;  }  static int sched_debug_open(struct inode *inode, struct file *filp)  { -	return single_open(filp, sched_debug_show, NULL); +	int ret = 0; + +	ret = seq_open(filp, &sched_debug_sops); + +	return ret;  }  static const struct file_operations sched_debug_fops = {  	.open		= sched_debug_open,  	.read		= seq_read,  	.llseek		= seq_lseek, -	.release	= single_release, +	.release	= sched_debug_release,  };  static int __init init_sched_debug_procfs(void) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5eea8707234..7a33e5986fc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1680,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)  	}  	/* ensure we never gain time by being placed backwards. 
*/ -	vruntime = max_vruntime(se->vruntime, vruntime); - -	se->vruntime = vruntime; +	se->vruntime = max_vruntime(se->vruntime, vruntime);  }  static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -2663,7 +2661,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)  	hrtimer_cancel(&cfs_b->slack_timer);  } -static void unthrottle_offline_cfs_rqs(struct rq *rq) +static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)  {  	struct cfs_rq *cfs_rq; @@ -3254,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)   */  static int select_idle_sibling(struct task_struct *p, int target)  { -	int cpu = smp_processor_id(); -	int prev_cpu = task_cpu(p);  	struct sched_domain *sd;  	struct sched_group *sg; -	int i; +	int i = task_cpu(p); -	/* -	 * If the task is going to be woken-up on this cpu and if it is -	 * already idle, then it is the right target. -	 */ -	if (target == cpu && idle_cpu(cpu)) -		return cpu; +	if (idle_cpu(target)) +		return target;  	/* -	 * If the task is going to be woken-up on the cpu where it previously -	 * ran and if it is currently idle, then it the right target. +	 * If the prevous cpu is cache affine and idle, don't be stupid.  	 */ -	if (target == prev_cpu && idle_cpu(prev_cpu)) -		return prev_cpu; +	if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) +		return i;  	/*  	 * Otherwise, iterate the domains and find an elegible idle cpu. @@ -3286,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target)  				goto next;  			for_each_cpu(i, sched_group_cpus(sg)) { -				if (!idle_cpu(i)) +				if (i == target || !idle_cpu(i))  					goto next;  			} @@ -6101,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task  	 * idle runqueue:  	 */  	if (rq->cfs.load.weight) -		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); +		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));  	return rr_interval;  } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 418feb01344..127a2c4cf4a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -7,6 +7,8 @@  #include <linux/slab.h> +int sched_rr_timeslice = RR_TIMESLICE; +  static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);  struct rt_bandwidth def_rt_bandwidth; @@ -566,7 +568,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)  static int do_balance_runtime(struct rt_rq *rt_rq)  {  	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); -	struct root_domain *rd = cpu_rq(smp_processor_id())->rd; +	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;  	int i, weight, more = 0;  	u64 rt_period; @@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq)  		return;  	delta_exec = rq->clock_task - curr->se.exec_start; -	if (unlikely((s64)delta_exec < 0)) -		delta_exec = 0; +	if (unlikely((s64)delta_exec <= 0)) +		return;  	schedstat_set(curr->se.statistics.exec_max,  		      max(curr->se.statistics.exec_max, delta_exec)); @@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)  static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)  {  	if (!task_running(rq, p) && -	    (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && -	    (p->nr_cpus_allowed > 1)) +	    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))  		return 1;  	return 0;  } @@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)  	 * we may need to handle the pulling of RT tasks  	 * now.  	 
*/ -	if (p->on_rq && !rq->rt.rt_nr_running) -		pull_rt_task(rq); +	if (!p->on_rq || rq->rt.rt_nr_running) +		return; + +	if (pull_rt_task(rq)) +		resched_task(rq->curr);  }  void init_sched_rt_class(void) @@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)  	if (soft != RLIM_INFINITY) {  		unsigned long next; -		p->rt.timeout++; +		if (p->rt.watchdog_stamp != jiffies) { +			p->rt.timeout++; +			p->rt.watchdog_stamp = jiffies; +		} +  		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);  		if (p->rt.timeout > next)  			p->cputime_expires.sched_exp = p->se.sum_exec_runtime; @@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)  	if (--p->rt.time_slice)  		return; -	p->rt.time_slice = RR_TIMESLICE; +	p->rt.time_slice = sched_rr_timeslice;  	/*  	 * Requeue to the end of queue if we (and all of our ancestors) are the @@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)  	 * Time slice is 0 for SCHED_FIFO tasks  	 */  	if (task->policy == SCHED_RR) -		return RR_TIMESLICE; +		return sched_rr_timeslice;  	else  		return 0;  } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fc886441436..cc03cfdf469 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,5 +1,7 @@  #include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h>  #include <linux/mutex.h>  #include <linux/spinlock.h>  #include <linux/stop_machine.h> diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 903ffa9e887..e036eda1a9c 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)  	if (mask_str == NULL)  		return -ENOMEM; -	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); -	seq_printf(seq, "timestamp %lu\n", jiffies); -	for_each_online_cpu(cpu) { -		struct rq *rq = cpu_rq(cpu); +	if (v == (void *)1) { +		seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); +		seq_printf(seq, "timestamp %lu\n", jiffies); +	} else { +		struct rq *rq;  #ifdef CONFIG_SMP  		struct sched_domain *sd;  		int dcount = 0;  #endif +		cpu = (unsigned long)(v - 2); +		rq = cpu_rq(cpu);  		/* runqueue-specific stats */  		seq_printf(seq, @@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)  	return 0;  } -static int schedstat_open(struct inode *inode, struct file *file) +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. 
+ */ +static void *schedstat_start(struct seq_file *file, loff_t *offset)  { -	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); -	char *buf = kmalloc(size, GFP_KERNEL); -	struct seq_file *m; -	int res; +	unsigned long n = *offset; -	if (!buf) -		return -ENOMEM; -	res = single_open(file, show_schedstat, NULL); -	if (!res) { -		m = file->private_data; -		m->buf = buf; -		m->size = size; -	} else -		kfree(buf); -	return res; +	if (n == 0) +		return (void *) 1; + +	n--; + +	if (n > 0) +		n = cpumask_next(n - 1, cpu_online_mask); +	else +		n = cpumask_first(cpu_online_mask); + +	*offset = n + 1; + +	if (n < nr_cpu_ids) +		return (void *)(unsigned long)(n + 2); +	return NULL; +} + +static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) +{ +	(*offset)++; +	return schedstat_start(file, offset); +} + +static void schedstat_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations schedstat_sops = { +	.start = schedstat_start, +	.next  = schedstat_next, +	.stop  = schedstat_stop, +	.show  = show_schedstat, +}; + +static int schedstat_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &schedstat_sops);  } +static int schedstat_release(struct inode *inode, struct file *file) +{ +	return 0; +}; +  static const struct file_operations proc_schedstat_operations = {  	.open    = schedstat_open,  	.read    = seq_read,  	.llseek  = seq_lseek, -	.release = single_release, +	.release = schedstat_release,  };  static int __init proc_schedstat_init(void) diff --git a/kernel/signal.c b/kernel/signal.c index 53cd5c4d117..2a7ae296318 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -680,23 +680,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)   * No need to set need_resched since signal event passing   * goes through ->blocked   */ -void signal_wake_up(struct task_struct *t, int resume) +void signal_wake_up_state(struct task_struct *t, unsigned int state)  { -	unsigned int mask; -  	set_tsk_thread_flag(t, TIF_SIGPENDING); -  	/* -	 * For SIGKILL, we want to wake it up in the stopped/traced/killable +	 * TASK_WAKEKILL also means wake it up in the stopped/traced/killable  	 * case. We don't check t->state here because there is a race with it  	 * executing another processor and just now entering stopped state.  	 * By using wake_up_state, we ensure the process will wake up and  	 * handle its death signal.  	 
*/ -	mask = TASK_INTERRUPTIBLE; -	if (resume) -		mask |= TASK_WAKEKILL; -	if (!wake_up_state(t, mask)) +	if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))  		kick_process(t);  } @@ -844,7 +838,7 @@ static void ptrace_trap_notify(struct task_struct *t)  	assert_spin_locked(&t->sighand->siglock);  	task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); -	signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); +	ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);  }  /* @@ -1638,6 +1632,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)  	unsigned long flags;  	struct sighand_struct *psig;  	bool autoreap = false; +	cputime_t utime, stime;  	BUG_ON(sig == -1); @@ -1675,8 +1670,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)  				       task_uid(tsk));  	rcu_read_unlock(); -	info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); -	info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); +	task_cputime(tsk, &utime, &stime); +	info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime); +	info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);  	info.si_status = tsk->exit_code & 0x7f;  	if (tsk->exit_code & 0x80) @@ -1740,6 +1736,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,  	unsigned long flags;  	struct task_struct *parent;  	struct sighand_struct *sighand; +	cputime_t utime, stime;  	if (for_ptracer) {  		parent = tsk->parent; @@ -1758,8 +1755,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,  	info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));  	rcu_read_unlock(); -	info.si_utime = cputime_to_clock_t(tsk->utime); -	info.si_stime = cputime_to_clock_t(tsk->stime); +	task_cputime(tsk, &utime, &stime); +	info.si_utime = cputime_to_clock_t(utime); +	info.si_stime = cputime_to_clock_t(stime);   	info.si_code = why;   	switch (why) { @@ -1800,6 +1798,10 @@ static inline int may_ptrace_stop(void)  	 * If SIGKILL was already sent before the caller unlocked  	 * ->siglock we must see ->core_state != NULL. Otherwise it  	 * is safe to enter schedule(). +	 * +	 * This is almost outdated, a task with the pending SIGKILL can't +	 * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported +	 * after SIGKILL was already dequeued.  	 */  	if (unlikely(current->mm->core_state) &&  	    unlikely(current->mm == current->parent->mm)) @@ -1925,6 +1927,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)  		if (gstop_done)  			do_notify_parent_cldstop(current, false, why); +		/* tasklist protects us from ptrace_freeze_traced() */  		__set_current_state(TASK_RUNNING);  		if (clear_code)  			current->exit_code = 0; @@ -2396,6 +2399,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,  	tracehook_signal_handler(sig, info, ka, regs, stepping);  } +void signal_setup_done(int failed, struct ksignal *ksig, int stepping) +{ +	if (failed) +		force_sigsegv(ksig->sig, current); +	else +		signal_delivered(ksig->sig, &ksig->info, &ksig->ka, +			signal_pt_regs(), stepping); +} +  /*   * It could be that complete_signal() picked us to notify about the   * group-wide signal. 
Other threads should be notified now to take @@ -2613,28 +2625,58 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,  	return 0;  } -long do_sigpending(void __user *set, unsigned long sigsetsize) +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, +		compat_sigset_t __user *, oset, compat_size_t, sigsetsize)  { -	long error = -EINVAL; -	sigset_t pending; +#ifdef __BIG_ENDIAN +	sigset_t old_set = current->blocked; + +	/* XXX: Don't preclude handling different sized sigset_t's.  */ +	if (sigsetsize != sizeof(sigset_t)) +		return -EINVAL; + +	if (nset) { +		compat_sigset_t new32; +		sigset_t new_set; +		int error; +		if (copy_from_user(&new32, nset, sizeof(compat_sigset_t))) +			return -EFAULT; + +		sigset_from_compat(&new_set, &new32); +		sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); + +		error = sigprocmask(how, &new_set, NULL); +		if (error) +			return error; +	} +	if (oset) { +		compat_sigset_t old32; +		sigset_to_compat(&old32, &old_set); +		if (copy_to_user(oset, &old_set, sizeof(sigset_t))) +			return -EFAULT; +	} +	return 0; +#else +	return sys_rt_sigprocmask(how, (sigset_t __user *)nset, +				  (sigset_t __user *)oset, sigsetsize); +#endif +} +#endif +static int do_sigpending(void *set, unsigned long sigsetsize) +{  	if (sigsetsize > sizeof(sigset_t)) -		goto out; +		return -EINVAL;  	spin_lock_irq(¤t->sighand->siglock); -	sigorsets(&pending, ¤t->pending.signal, +	sigorsets(set, ¤t->pending.signal,  		  ¤t->signal->shared_pending.signal);  	spin_unlock_irq(¤t->sighand->siglock);  	/* Outside the lock because only this thread touches it.  */ -	sigandsets(&pending, ¤t->blocked, &pending); - -	error = -EFAULT; -	if (!copy_to_user(set, &pending, sigsetsize)) -		error = 0; - -out: -	return error; +	sigandsets(set, ¤t->blocked, set); +	return 0;  }  /** @@ -2643,11 +2685,36 @@ out:   *  @set: stores pending signals   *  @sigsetsize: size of sigset_t type or larger   */ -SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) +SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)  { -	return do_sigpending(set, sigsetsize); +	sigset_t set; +	int err = do_sigpending(&set, sigsetsize); +	if (!err && copy_to_user(uset, &set, sigsetsize)) +		err = -EFAULT; +	return err;  } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, +		compat_size_t, sigsetsize) +{ +#ifdef __BIG_ENDIAN +	sigset_t set; +	int err = do_sigpending(&set, sigsetsize); +	if (!err) { +		compat_sigset_t set32; +		sigset_to_compat(&set32, &set); +		/* we can get here only if sigsetsize <= sizeof(set) */ +		if (copy_to_user(uset, &set32, sigsetsize)) +			err = -EFAULT; +	} +	return err; +#else +	return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize); +#endif +} +#endif +  #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER  int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) @@ -2924,6 +2991,22 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)  	return do_tkill(0, pid, sig);  } +static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) +{ +	/* Not even root can pretend to send signals from the kernel. +	 * Nor can they impersonate a kill()/tgkill(), which adds source info. +	 */ +	if (info->si_code >= 0 || info->si_code == SI_TKILL) { +		/* We used to allow any < 0 si_code */ +		WARN_ON_ONCE(info->si_code < 0); +		return -EPERM; +	} +	info->si_signo = sig; + +	/* POSIX.1b doesn't mention process groups.  
*/ +	return kill_proc_info(sig, info, pid); +} +  /**   *  sys_rt_sigqueueinfo - send signal information to a signal   *  @pid: the PID of the thread @@ -2934,25 +3017,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,  		siginfo_t __user *, uinfo)  {  	siginfo_t info; -  	if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))  		return -EFAULT; +	return do_rt_sigqueueinfo(pid, sig, &info); +} -	/* Not even root can pretend to send signals from the kernel. -	 * Nor can they impersonate a kill()/tgkill(), which adds source info. -	 */ -	if (info.si_code >= 0 || info.si_code == SI_TKILL) { -		/* We used to allow any < 0 si_code */ -		WARN_ON_ONCE(info.si_code < 0); -		return -EPERM; -	} -	info.si_signo = sig; - -	/* POSIX.1b doesn't mention process groups.  */ -	return kill_proc_info(sig, &info, pid); +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, +			compat_pid_t, pid, +			int, sig, +			struct compat_siginfo __user *, uinfo) +{ +	siginfo_t info; +	int ret = copy_siginfo_from_user32(&info, uinfo); +	if (unlikely(ret)) +		return ret; +	return do_rt_sigqueueinfo(pid, sig, &info);  } +#endif -long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)  {  	/* This is only valid for single tasks */  	if (pid <= 0 || tgid <= 0) @@ -2982,6 +3066,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,  	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);  } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, +			compat_pid_t, tgid, +			compat_pid_t, pid, +			int, sig, +			struct compat_siginfo __user *, uinfo) +{ +	siginfo_t info; + +	if (copy_siginfo_from_user32(&info, uinfo)) +		return -EFAULT; +	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} +#endif +  int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)  {  	struct task_struct *t = current; @@ -3027,7 +3126,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)  	return 0;  } -int  +static int   do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)  {  	stack_t oss; @@ -3092,12 +3191,10 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s  out:  	return error;  } -#ifdef CONFIG_GENERIC_SIGALTSTACK  SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)  {  	return do_sigaltstack(uss, uoss, current_user_stack_pointer());  } -#endif  int restore_altstack(const stack_t __user *uss)  { @@ -3115,7 +3212,6 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)  }  #ifdef CONFIG_COMPAT -#ifdef CONFIG_GENERIC_SIGALTSTACK  COMPAT_SYSCALL_DEFINE2(sigaltstack,  			const compat_stack_t __user *, uss_ptr,  			compat_stack_t __user *, uoss_ptr) @@ -3165,7 +3261,6 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)  		__put_user(t->sas_ss_size, &uss->ss_size);  }  #endif -#endif  #ifdef __ARCH_WANT_SYS_SIGPENDING @@ -3175,7 +3270,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)   */  SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)  { -	return do_sigpending(set, sizeof(*set)); +	return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));   }  #endif @@ -3231,7 +3326,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,  }  #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ -#ifdef __ARCH_WANT_SYS_RT_SIGACTION +#ifndef CONFIG_ODD_RT_SIGACTION  /**   *  sys_rt_sigaction - alter an 
action taken by a process   *  @sig: signal to be sent @@ -3265,7 +3360,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,  out:  	return ret;  } -#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, +		const struct compat_sigaction __user *, act, +		struct compat_sigaction __user *, oact, +		compat_size_t, sigsetsize) +{ +	struct k_sigaction new_ka, old_ka; +	compat_sigset_t mask; +#ifdef __ARCH_HAS_SA_RESTORER +	compat_uptr_t restorer; +#endif +	int ret; + +	/* XXX: Don't preclude handling different sized sigset_t's.  */ +	if (sigsetsize != sizeof(compat_sigset_t)) +		return -EINVAL; + +	if (act) { +		compat_uptr_t handler; +		ret = get_user(handler, &act->sa_handler); +		new_ka.sa.sa_handler = compat_ptr(handler); +#ifdef __ARCH_HAS_SA_RESTORER +		ret |= get_user(restorer, &act->sa_restorer); +		new_ka.sa.sa_restorer = compat_ptr(restorer); +#endif +		ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); +		ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); +		if (ret) +			return -EFAULT; +		sigset_from_compat(&new_ka.sa.sa_mask, &mask); +	} + +	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); +	if (!ret && oact) { +		sigset_to_compat(&mask, &old_ka.sa.sa_mask); +		ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),  +			       &oact->sa_handler); +		ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); +		ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); +#ifdef __ARCH_HAS_SA_RESTORER +		ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), +				&oact->sa_restorer); +#endif +	} +	return ret; +} +#endif +#endif /* !CONFIG_ODD_RT_SIGACTION */ + +#ifdef CONFIG_OLD_SIGACTION +SYSCALL_DEFINE3(sigaction, int, sig, +		const struct old_sigaction __user *, act, +	        struct old_sigaction __user *, oact) +{ +	struct k_sigaction new_ka, old_ka; +	int ret; + +	if (act) { +		old_sigset_t mask; +		if (!access_ok(VERIFY_READ, act, sizeof(*act)) || +		    __get_user(new_ka.sa.sa_handler, &act->sa_handler) || +		    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || +		    __get_user(new_ka.sa.sa_flags, &act->sa_flags) || +		    __get_user(mask, &act->sa_mask)) +			return -EFAULT; +#ifdef __ARCH_HAS_KA_RESTORER +		new_ka.ka_restorer = NULL; +#endif +		siginitset(&new_ka.sa.sa_mask, mask); +	} + +	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? 
&old_ka : NULL); + +	if (!ret && oact) { +		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || +		    __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || +		    __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || +		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || +		    __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) +			return -EFAULT; +	} + +	return ret; +} +#endif +#ifdef CONFIG_COMPAT_OLD_SIGACTION +COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, +		const struct compat_old_sigaction __user *, act, +	        struct compat_old_sigaction __user *, oact) +{ +	struct k_sigaction new_ka, old_ka; +	int ret; +	compat_old_sigset_t mask; +	compat_uptr_t handler, restorer; + +	if (act) { +		if (!access_ok(VERIFY_READ, act, sizeof(*act)) || +		    __get_user(handler, &act->sa_handler) || +		    __get_user(restorer, &act->sa_restorer) || +		    __get_user(new_ka.sa.sa_flags, &act->sa_flags) || +		    __get_user(mask, &act->sa_mask)) +			return -EFAULT; + +#ifdef __ARCH_HAS_KA_RESTORER +		new_ka.ka_restorer = NULL; +#endif +		new_ka.sa.sa_handler = compat_ptr(handler); +		new_ka.sa.sa_restorer = compat_ptr(restorer); +		siginitset(&new_ka.sa.sa_mask, mask); +	} + +	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + +	if (!ret && oact) { +		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || +		    __put_user(ptr_to_compat(old_ka.sa.sa_handler), +			       &oact->sa_handler) || +		    __put_user(ptr_to_compat(old_ka.sa.sa_restorer), +			       &oact->sa_restorer) || +		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || +		    __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) +			return -EFAULT; +	} +	return ret; +} +#endif  #ifdef __ARCH_WANT_SYS_SGETMASK @@ -3333,7 +3553,6 @@ int sigsuspend(sigset_t *set)  	return -ERESTARTNOHAND;  } -#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND  /**   *  sys_rt_sigsuspend - replace the signal mask for a value with the   *	@unewset value until a signal is received @@ -3352,7 +3571,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)  		return -EFAULT;  	return sigsuspend(&newset);  } -#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ +  +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) +{ +#ifdef __BIG_ENDIAN +	sigset_t newset; +	compat_sigset_t newset32; + +	/* XXX: Don't preclude handling different sized sigset_t's.  
*/ +	if (sigsetsize != sizeof(sigset_t)) +		return -EINVAL; + +	if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) +		return -EFAULT; +	sigset_from_compat(&newset, &newset32); +	return sigsuspend(&newset); +#else +	/* on little-endian bitmaps don't care about granularity */ +	return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize); +#endif +} +#endif + +#ifdef CONFIG_OLD_SIGSUSPEND +SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) +{ +	sigset_t blocked; +	siginitset(&blocked, mask); +	return sigsuspend(&blocked); +} +#endif +#ifdef CONFIG_OLD_SIGSUSPEND3 +SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) +{ +	sigset_t blocked; +	siginitset(&blocked, mask); +	return sigsuspend(&blocked); +} +#endif  __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)  { diff --git a/kernel/smp.c b/kernel/smp.c index 29dd40a9f2f..8e451f3ff51 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -16,23 +16,14 @@  #include "smpboot.h"  #ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static struct { -	struct list_head	queue; -	raw_spinlock_t		lock; -} call_function __cacheline_aligned_in_smp = -	{ -		.queue		= LIST_HEAD_INIT(call_function.queue), -		.lock		= __RAW_SPIN_LOCK_UNLOCKED(call_function.lock), -	}; -  enum {  	CSD_FLAG_LOCK		= 0x01,  };  struct call_function_data { -	struct call_single_data	csd; -	atomic_t		refs; +	struct call_single_data	__percpu *csd;  	cpumask_var_t		cpumask; +	cpumask_var_t		cpumask_ipi;  };  static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); @@ -56,6 +47,14 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)  		if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,  				cpu_to_node(cpu)))  			return notifier_from_errno(-ENOMEM); +		if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, +				cpu_to_node(cpu))) +			return notifier_from_errno(-ENOMEM); +		cfd->csd = alloc_percpu(struct call_single_data); +		if (!cfd->csd) { +			free_cpumask_var(cfd->cpumask); +			return notifier_from_errno(-ENOMEM); +		}  		break;  #ifdef CONFIG_HOTPLUG_CPU @@ -65,6 +64,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)  	case CPU_DEAD:  	case CPU_DEAD_FROZEN:  		free_cpumask_var(cfd->cpumask); +		free_cpumask_var(cfd->cpumask_ipi); +		free_percpu(cfd->csd);  		break;  #endif  	}; @@ -166,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)  }  /* - * Invoked by arch to handle an IPI for call function. Must be called with - * interrupts disabled. - */ -void generic_smp_call_function_interrupt(void) -{ -	struct call_function_data *data; -	int cpu = smp_processor_id(); - -	/* -	 * Shouldn't receive this interrupt on a cpu that is not yet online. -	 */ -	WARN_ON_ONCE(!cpu_online(cpu)); - -	/* -	 * Ensure entry is visible on call_function_queue after we have -	 * entered the IPI. See comment in smp_call_function_many. -	 * If we don't have this, then we may miss an entry on the list -	 * and never get another IPI to process it. -	 */ -	smp_mb(); - -	/* -	 * It's ok to use list_for_each_rcu() here even though we may -	 * delete 'pos', since list_del_rcu() doesn't clear ->next -	 */ -	list_for_each_entry_rcu(data, &call_function.queue, csd.list) { -		int refs; -		smp_call_func_t func; - -		/* -		 * Since we walk the list without any locks, we might -		 * see an entry that was completed, removed from the -		 * list and is in the process of being reused. 
-		 * -		 * We must check that the cpu is in the cpumask before -		 * checking the refs, and both must be set before -		 * executing the callback on this cpu. -		 */ - -		if (!cpumask_test_cpu(cpu, data->cpumask)) -			continue; - -		smp_rmb(); - -		if (atomic_read(&data->refs) == 0) -			continue; - -		func = data->csd.func;		/* save for later warn */ -		func(data->csd.info); - -		/* -		 * If the cpu mask is not still set then func enabled -		 * interrupts (BUG), and this cpu took another smp call -		 * function interrupt and executed func(info) twice -		 * on this cpu.  That nested execution decremented refs. -		 */ -		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { -			WARN(1, "%pf enabled interrupts and double executed\n", func); -			continue; -		} - -		refs = atomic_dec_return(&data->refs); -		WARN_ON(refs < 0); - -		if (refs) -			continue; - -		WARN_ON(!cpumask_empty(data->cpumask)); - -		raw_spin_lock(&call_function.lock); -		list_del_rcu(&data->csd.list); -		raw_spin_unlock(&call_function.lock); - -		csd_unlock(&data->csd); -	} - -} - -/*   * Invoked by arch to handle an IPI for call function single. Must be   * called from the arch with interrupts disabled.   */ @@ -448,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,  			    smp_call_func_t func, void *info, bool wait)  {  	struct call_function_data *data; -	unsigned long flags; -	int refs, cpu, next_cpu, this_cpu = smp_processor_id(); +	int cpu, next_cpu, this_cpu = smp_processor_id();  	/*  	 * Can deadlock when called with interrupts disabled. @@ -481,79 +402,46 @@ void smp_call_function_many(const struct cpumask *mask,  	}  	data = &__get_cpu_var(cfd_data); -	csd_lock(&data->csd); - -	/* This BUG_ON verifies our reuse assertions and can be removed */ -	BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); -	/* -	 * The global call function queue list add and delete are protected -	 * by a lock, but the list is traversed without any lock, relying -	 * on the rcu list add and delete to allow safe concurrent traversal. -	 * We reuse the call function data without waiting for any grace -	 * period after some other cpu removes it from the global queue. -	 * This means a cpu might find our data block as it is being -	 * filled out. -	 * -	 * We hold off the interrupt handler on the other cpu by -	 * ordering our writes to the cpu mask vs our setting of the -	 * refs counter.  We assert only the cpu owning the data block -	 * will set a bit in cpumask, and each bit will only be cleared -	 * by the subject cpu.  Each cpu must first find its bit is -	 * set and then check that refs is set indicating the element is -	 * ready to be processed, otherwise it must skip the entry. -	 * -	 * On the previous iteration refs was set to 0 by another cpu. -	 * To avoid the use of transitivity, set the counter to 0 here -	 * so the wmb will pair with the rmb in the interrupt handler. -	 */ -	atomic_set(&data->refs, 0);	/* convert 3rd to 1st party write */ - -	data->csd.func = func; -	data->csd.info = info; - -	/* Ensure 0 refs is visible before mask.  
Also orders func and info */ -	smp_wmb(); - -	/* We rely on the "and" being processed before the store */  	cpumask_and(data->cpumask, mask, cpu_online_mask);  	cpumask_clear_cpu(this_cpu, data->cpumask); -	refs = cpumask_weight(data->cpumask);  	/* Some callers race with other cpus changing the passed mask */ -	if (unlikely(!refs)) { -		csd_unlock(&data->csd); +	if (unlikely(!cpumask_weight(data->cpumask)))  		return; -	} -	raw_spin_lock_irqsave(&call_function.lock, flags);  	/* -	 * Place entry at the _HEAD_ of the list, so that any cpu still -	 * observing the entry in generic_smp_call_function_interrupt() -	 * will not miss any other list entries: +	 * After we put an entry into the list, data->cpumask +	 * may be cleared again when another CPU sends another IPI for +	 * a SMP function call, so data->cpumask will be zero.  	 */ -	list_add_rcu(&data->csd.list, &call_function.queue); -	/* -	 * We rely on the wmb() in list_add_rcu to complete our writes -	 * to the cpumask before this write to refs, which indicates -	 * data is on the list and is ready to be processed. -	 */ -	atomic_set(&data->refs, refs); -	raw_spin_unlock_irqrestore(&call_function.lock, flags); +	cpumask_copy(data->cpumask_ipi, data->cpumask); -	/* -	 * Make the list addition visible before sending the ipi. -	 * (IPIs must obey or appear to obey normal Linux cache -	 * coherency rules -- see comment in generic_exec_single). -	 */ -	smp_mb(); +	for_each_cpu(cpu, data->cpumask) { +		struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); +		struct call_single_queue *dst = +					&per_cpu(call_single_queue, cpu); +		unsigned long flags; + +		csd_lock(csd); +		csd->func = func; +		csd->info = info; + +		raw_spin_lock_irqsave(&dst->lock, flags); +		list_add_tail(&csd->list, &dst->list); +		raw_spin_unlock_irqrestore(&dst->lock, flags); +	}  	/* Send a message to all CPUs in the map */ -	arch_send_call_function_ipi_mask(data->cpumask); +	arch_send_call_function_ipi_mask(data->cpumask_ipi); -	/* Optionally wait for the CPUs to complete */ -	if (wait) -		csd_lock_wait(&data->csd); +	if (wait) { +		for_each_cpu(cpu, data->cpumask) { +			struct call_single_data *csd = +					per_cpu_ptr(data->csd, cpu); +			csd_lock_wait(csd); +		} +	}  }  EXPORT_SYMBOL(smp_call_function_many); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d6c5fc05424..d4abac26177 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -183,9 +183,10 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)  		kfree(td);  		return PTR_ERR(tsk);  	} -  	get_task_struct(tsk);  	*per_cpu_ptr(ht->store, cpu) = tsk; +	if (ht->create) +		ht->create(cpu);  	return 0;  } @@ -225,7 +226,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)  {  	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); -	if (tsk) +	if (tsk && !ht->selfparking)  		kthread_park(tsk);  } diff --git a/kernel/softirq.c b/kernel/softirq.c index ed567babe78..b4d252fd195 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip)  EXPORT_SYMBOL(local_bh_enable_ip);  /* - * We restart softirq processing MAX_SOFTIRQ_RESTART times, - * and we fall back to softirqd after that. + * We restart softirq processing for at most 2 ms, + * and if need_resched() is not set.   * - * This number has been established via experimentation. + * These limits have been established via experimentation.   
* The two things to balance is latency against fairness -   * we want to handle softirqs as soon as possible, but they   * should not be able to lock up the box.   */ -#define MAX_SOFTIRQ_RESTART 10 +#define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)  asmlinkage void __do_softirq(void)  {  	struct softirq_action *h;  	__u32 pending; -	int max_restart = MAX_SOFTIRQ_RESTART; +	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;  	int cpu;  	unsigned long old_flags = current->flags; @@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)  	current->flags &= ~PF_MEMALLOC;  	pending = local_softirq_pending(); -	vtime_account_irq_enter(current); +	account_irq_enter_time(current);  	__local_bh_disable((unsigned long)__builtin_return_address(0),  				SOFTIRQ_OFFSET); @@ -264,15 +264,16 @@ restart:  	local_irq_disable();  	pending = local_softirq_pending(); -	if (pending && --max_restart) -		goto restart; +	if (pending) { +		if (time_before(jiffies, end) && !need_resched()) +			goto restart; -	if (pending)  		wakeup_softirqd(); +	}  	lockdep_softirq_exit(); -	vtime_account_irq_exit(current); +	account_irq_exit_time(current);  	__local_bh_enable(SOFTIRQ_OFFSET);  	tsk_restore_flags(current, old_flags, PF_MEMALLOC);  } @@ -341,7 +342,7 @@ static inline void invoke_softirq(void)   */  void irq_exit(void)  { -	vtime_account_irq_exit(current); +	account_irq_exit_time(current);  	trace_hardirq_exit();  	sub_preempt_count(IRQ_EXIT_OFFSET);  	if (!in_interrupt() && local_softirq_pending()) diff --git a/kernel/srcu.c b/kernel/srcu.c index 2b859828cdc..01d5ccb8bfe 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -282,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp)   */  void cleanup_srcu_struct(struct srcu_struct *sp)  { -	int sum; - -	sum = srcu_readers_active(sp); -	WARN_ON(sum);  /* Leakage unless caller handles error. */ -	if (sum != 0) -		return; +	if (WARN_ON(srcu_readers_active(sp))) +		return; /* Leakage unless caller handles error. */  	free_percpu(sp->per_cpu_ref);  	sp->per_cpu_ref = NULL;  } @@ -302,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp)  {  	int idx; +	idx = ACCESS_ONCE(sp->completed) & 0x1;  	preempt_disable(); -	idx = rcu_dereference_index_check(sp->completed, -					  rcu_read_lock_sched_held()) & 0x1;  	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;  	smp_mb(); /* B */  /* Avoid leaking the critical section. */  	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; @@ -321,10 +316,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);   */  void __srcu_read_unlock(struct srcu_struct *sp, int idx)  { -	preempt_disable();  	smp_mb(); /* C */  /* Avoid leaking the critical section. */ -	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; -	preempt_enable(); +	this_cpu_dec(sp->per_cpu_ref->c[idx]);  }  EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -423,6 +416,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)  			   !lock_is_held(&rcu_sched_lock_map),  			   "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); +	might_sleep();  	init_completion(&rcu.completion);  	head->next = NULL; @@ -455,10 +449,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)   * synchronize_srcu - wait for prior SRCU read-side critical-section completion   * @sp: srcu_struct with which to synchronize.   * - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates.  
Can block; must be called from - * process context. + * Wait for the count to drain to zero of both indexes. To avoid the + * possible starvation of synchronize_srcu(), it waits for the count of + * the index=((->completed & 1) ^ 1) to drain to zero at first, + * and then flip the completed and wait for the count of the other index. + * + * Can block; must be called from process context.   *   * Note that it is illegal to call synchronize_srcu() from the corresponding   * SRCU read-side critical section; doing so will result in deadlock. @@ -480,12 +476,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);   * Wait for an SRCU grace period to elapse, but be more aggressive about   * spinning rather than blocking when waiting.   * - * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier.  It is also illegal to call - * synchronize_srcu_expedited() from the corresponding SRCU read-side - * critical section; doing so will result in deadlock.  However, it is - * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct - * from some other srcu_struct's read-side critical section, as long as + * Note that it is also illegal to call synchronize_srcu_expedited() + * from the corresponding SRCU read-side critical section; + * doing so will result in deadlock.  However, it is perfectly legal + * to call synchronize_srcu_expedited() on one srcu_struct from some + * other srcu_struct's read-side critical section, as long as   * the resulting graph of srcu_structs is acyclic.   */  void synchronize_srcu_expedited(struct srcu_struct *sp) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2f194e96571..95d178c62d5 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -18,7 +18,7 @@  #include <linux/stop_machine.h>  #include <linux/interrupt.h>  #include <linux/kallsyms.h> - +#include <linux/smpboot.h>  #include <linux/atomic.h>  /* @@ -37,10 +37,10 @@ struct cpu_stopper {  	spinlock_t		lock;  	bool			enabled;	/* is this stopper enabled? */  	struct list_head	works;		/* list of pending works */ -	struct task_struct	*thread;	/* stopper thread */  };  static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); +static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);  static bool stop_machine_initialized = false;  static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) @@ -62,16 +62,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)  }  /* queue @work to @stopper.  if offline, @work is completed immediately */ -static void cpu_stop_queue_work(struct cpu_stopper *stopper, -				struct cpu_stop_work *work) +static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)  { +	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); +	struct task_struct *p = per_cpu(cpu_stopper_task, cpu); +  	unsigned long flags;  	spin_lock_irqsave(&stopper->lock, flags);  	if (stopper->enabled) {  		list_add_tail(&work->list, &stopper->works); -		wake_up_process(stopper->thread); +		wake_up_process(p);  	} else  		cpu_stop_signal_done(work->done, false); @@ -108,7 +110,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)  	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };  	cpu_stop_init_done(&done, 1); -	cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); +	cpu_stop_queue_work(cpu, &work);  	wait_for_completion(&done.completion);  	return done.executed ? 
done.ret : -ENOENT;  } @@ -130,7 +132,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,  			struct cpu_stop_work *work_buf)  {  	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; -	cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); +	cpu_stop_queue_work(cpu, work_buf);  }  /* static data for stop_cpus */ @@ -159,8 +161,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,  	 */  	preempt_disable();  	for_each_cpu(cpu, cpumask) -		cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), -				    &per_cpu(stop_cpus_work, cpu)); +		cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));  	preempt_enable();  } @@ -244,20 +245,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)  	return ret;  } -static int cpu_stopper_thread(void *data) +static int cpu_stop_should_run(unsigned int cpu) +{ +	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); +	unsigned long flags; +	int run; + +	spin_lock_irqsave(&stopper->lock, flags); +	run = !list_empty(&stopper->works); +	spin_unlock_irqrestore(&stopper->lock, flags); +	return run; +} + +static void cpu_stopper_thread(unsigned int cpu)  { -	struct cpu_stopper *stopper = data; +	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);  	struct cpu_stop_work *work;  	int ret;  repeat: -	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */ - -	if (kthread_should_stop()) { -		__set_current_state(TASK_RUNNING); -		return 0; -	} -  	work = NULL;  	spin_lock_irq(&stopper->lock);  	if (!list_empty(&stopper->works)) { @@ -273,8 +279,6 @@ repeat:  		struct cpu_stop_done *done = work->done;  		char ksym_buf[KSYM_NAME_LEN] __maybe_unused; -		__set_current_state(TASK_RUNNING); -  		/* cpu stop callbacks are not allowed to sleep */  		preempt_disable(); @@ -290,88 +294,55 @@ repeat:  					  ksym_buf), arg);  		cpu_stop_signal_done(done, true); -	} else -		schedule(); - -	goto repeat; +		goto repeat; +	}  }  extern void sched_set_stop_task(int cpu, struct task_struct *stop); -/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ -static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, -					   unsigned long action, void *hcpu) +static void cpu_stop_create(unsigned int cpu)  { -	unsigned int cpu = (unsigned long)hcpu; -	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); -	struct task_struct *p; - -	switch (action & ~CPU_TASKS_FROZEN) { -	case CPU_UP_PREPARE: -		BUG_ON(stopper->thread || stopper->enabled || -		       !list_empty(&stopper->works)); -		p = kthread_create_on_node(cpu_stopper_thread, -					   stopper, -					   cpu_to_node(cpu), -					   "migration/%d", cpu); -		if (IS_ERR(p)) -			return notifier_from_errno(PTR_ERR(p)); -		get_task_struct(p); -		kthread_bind(p, cpu); -		sched_set_stop_task(cpu, p); -		stopper->thread = p; -		break; +	sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu)); +} -	case CPU_ONLINE: -		/* strictly unnecessary, as first user will wake it */ -		wake_up_process(stopper->thread); -		/* mark enabled */ -		spin_lock_irq(&stopper->lock); -		stopper->enabled = true; -		spin_unlock_irq(&stopper->lock); -		break; +static void cpu_stop_park(unsigned int cpu) +{ +	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); +	struct cpu_stop_work *work; +	unsigned long flags; -#ifdef CONFIG_HOTPLUG_CPU -	case CPU_UP_CANCELED: -	case CPU_POST_DEAD: -	{ -		struct cpu_stop_work *work; +	/* drain remaining works */ +	spin_lock_irqsave(&stopper->lock, flags); +	list_for_each_entry(work, 
&stopper->works, list) +		cpu_stop_signal_done(work->done, false); +	stopper->enabled = false; +	spin_unlock_irqrestore(&stopper->lock, flags); +} -		sched_set_stop_task(cpu, NULL); -		/* kill the stopper */ -		kthread_stop(stopper->thread); -		/* drain remaining works */ -		spin_lock_irq(&stopper->lock); -		list_for_each_entry(work, &stopper->works, list) -			cpu_stop_signal_done(work->done, false); -		stopper->enabled = false; -		spin_unlock_irq(&stopper->lock); -		/* release the stopper */ -		put_task_struct(stopper->thread); -		stopper->thread = NULL; -		break; -	} -#endif -	} +static void cpu_stop_unpark(unsigned int cpu) +{ +	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); -	return NOTIFY_OK; +	spin_lock_irq(&stopper->lock); +	stopper->enabled = true; +	spin_unlock_irq(&stopper->lock);  } -/* - * Give it a higher priority so that cpu stopper is available to other - * cpu notifiers.  It currently shares the same priority as sched - * migration_notifier. - */ -static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { -	.notifier_call	= cpu_stop_cpu_callback, -	.priority	= 10, +static struct smp_hotplug_thread cpu_stop_threads = { +	.store			= &cpu_stopper_task, +	.thread_should_run	= cpu_stop_should_run, +	.thread_fn		= cpu_stopper_thread, +	.thread_comm		= "migration/%u", +	.create			= cpu_stop_create, +	.setup			= cpu_stop_unpark, +	.park			= cpu_stop_park, +	.unpark			= cpu_stop_unpark, +	.selfparking		= true,  };  static int __init cpu_stop_init(void)  { -	void *bcpu = (void *)(long)smp_processor_id();  	unsigned int cpu; -	int err;  	for_each_possible_cpu(cpu) {  		struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); @@ -380,15 +351,8 @@ static int __init cpu_stop_init(void)  		INIT_LIST_HEAD(&stopper->works);  	} -	/* start one for the boot cpu */ -	err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, -				    bcpu); -	BUG_ON(err != NOTIFY_OK); -	cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); -	register_cpu_notifier(&cpu_stop_cpu_notifier); - +	BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));  	stop_machine_initialized = true; -  	return 0;  }  early_initcall(cpu_stop_init); diff --git a/kernel/sys.c b/kernel/sys.c index e3932ea50ec..e10566bee39 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -47,6 +47,7 @@  #include <linux/syscalls.h>  #include <linux/kprobes.h>  #include <linux/user_namespace.h> +#include <linux/binfmts.h>  #include <linux/kmsg_dump.h>  /* Move somewhere else to avoid recompiling? */ @@ -433,11 +434,12 @@ static DEFINE_MUTEX(reboot_mutex);  SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,  		void __user *, arg)  { +	struct pid_namespace *pid_ns = task_active_pid_ns(current);  	char buffer[256];  	int ret = 0;  	/* We only trust the superuser with rebooting the system. */ -	if (!capable(CAP_SYS_BOOT)) +	if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))  		return -EPERM;  	/* For safety, we require "magic" arguments. */ @@ -453,7 +455,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,  	 * pid_namespace, the command is handled by reboot_pid_ns() which will  	 * call do_exit().  	 
*/ -	ret = reboot_pid_ns(task_active_pid_ns(current), cmd); +	ret = reboot_pid_ns(pid_ns, cmd);  	if (ret)  		return ret; @@ -2012,160 +2014,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  	error = 0;  	switch (option) { -		case PR_SET_PDEATHSIG: -			if (!valid_signal(arg2)) { -				error = -EINVAL; -				break; -			} -			me->pdeath_signal = arg2; -			break; -		case PR_GET_PDEATHSIG: -			error = put_user(me->pdeath_signal, (int __user *)arg2); -			break; -		case PR_GET_DUMPABLE: -			error = get_dumpable(me->mm); +	case PR_SET_PDEATHSIG: +		if (!valid_signal(arg2)) { +			error = -EINVAL;  			break; -		case PR_SET_DUMPABLE: -			if (arg2 < 0 || arg2 > 1) { -				error = -EINVAL; -				break; -			} -			set_dumpable(me->mm, arg2); +		} +		me->pdeath_signal = arg2; +		break; +	case PR_GET_PDEATHSIG: +		error = put_user(me->pdeath_signal, (int __user *)arg2); +		break; +	case PR_GET_DUMPABLE: +		error = get_dumpable(me->mm); +		break; +	case PR_SET_DUMPABLE: +		if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { +			error = -EINVAL;  			break; +		} +		set_dumpable(me->mm, arg2); +		break; -		case PR_SET_UNALIGN: -			error = SET_UNALIGN_CTL(me, arg2); -			break; -		case PR_GET_UNALIGN: -			error = GET_UNALIGN_CTL(me, arg2); -			break; -		case PR_SET_FPEMU: -			error = SET_FPEMU_CTL(me, arg2); -			break; -		case PR_GET_FPEMU: -			error = GET_FPEMU_CTL(me, arg2); -			break; -		case PR_SET_FPEXC: -			error = SET_FPEXC_CTL(me, arg2); -			break; -		case PR_GET_FPEXC: -			error = GET_FPEXC_CTL(me, arg2); -			break; -		case PR_GET_TIMING: -			error = PR_TIMING_STATISTICAL; -			break; -		case PR_SET_TIMING: -			if (arg2 != PR_TIMING_STATISTICAL) -				error = -EINVAL; -			break; -		case PR_SET_NAME: -			comm[sizeof(me->comm)-1] = 0; -			if (strncpy_from_user(comm, (char __user *)arg2, -					      sizeof(me->comm) - 1) < 0) -				return -EFAULT; -			set_task_comm(me, comm); -			proc_comm_connector(me); -			break; -		case PR_GET_NAME: -			get_task_comm(comm, me); -			if (copy_to_user((char __user *)arg2, comm, -					 sizeof(comm))) -				return -EFAULT; -			break; -		case PR_GET_ENDIAN: -			error = GET_ENDIAN(me, arg2); -			break; -		case PR_SET_ENDIAN: -			error = SET_ENDIAN(me, arg2); -			break; -		case PR_GET_SECCOMP: -			error = prctl_get_seccomp(); -			break; -		case PR_SET_SECCOMP: -			error = prctl_set_seccomp(arg2, (char __user *)arg3); -			break; -		case PR_GET_TSC: -			error = GET_TSC_CTL(arg2); -			break; -		case PR_SET_TSC: -			error = SET_TSC_CTL(arg2); -			break; -		case PR_TASK_PERF_EVENTS_DISABLE: -			error = perf_event_task_disable(); -			break; -		case PR_TASK_PERF_EVENTS_ENABLE: -			error = perf_event_task_enable(); -			break; -		case PR_GET_TIMERSLACK: -			error = current->timer_slack_ns; -			break; -		case PR_SET_TIMERSLACK: -			if (arg2 <= 0) -				current->timer_slack_ns = +	case PR_SET_UNALIGN: +		error = SET_UNALIGN_CTL(me, arg2); +		break; +	case PR_GET_UNALIGN: +		error = GET_UNALIGN_CTL(me, arg2); +		break; +	case PR_SET_FPEMU: +		error = SET_FPEMU_CTL(me, arg2); +		break; +	case PR_GET_FPEMU: +		error = GET_FPEMU_CTL(me, arg2); +		break; +	case PR_SET_FPEXC: +		error = SET_FPEXC_CTL(me, arg2); +		break; +	case PR_GET_FPEXC: +		error = GET_FPEXC_CTL(me, arg2); +		break; +	case PR_GET_TIMING: +		error = PR_TIMING_STATISTICAL; +		break; +	case PR_SET_TIMING: +		if (arg2 != PR_TIMING_STATISTICAL) +			error = -EINVAL; +		break; +	case PR_SET_NAME: +		comm[sizeof(me->comm) - 1] = 0; +		if (strncpy_from_user(comm, (char __user *)arg2, +				      
sizeof(me->comm) - 1) < 0) +			return -EFAULT; +		set_task_comm(me, comm); +		proc_comm_connector(me); +		break; +	case PR_GET_NAME: +		get_task_comm(comm, me); +		if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) +			return -EFAULT; +		break; +	case PR_GET_ENDIAN: +		error = GET_ENDIAN(me, arg2); +		break; +	case PR_SET_ENDIAN: +		error = SET_ENDIAN(me, arg2); +		break; +	case PR_GET_SECCOMP: +		error = prctl_get_seccomp(); +		break; +	case PR_SET_SECCOMP: +		error = prctl_set_seccomp(arg2, (char __user *)arg3); +		break; +	case PR_GET_TSC: +		error = GET_TSC_CTL(arg2); +		break; +	case PR_SET_TSC: +		error = SET_TSC_CTL(arg2); +		break; +	case PR_TASK_PERF_EVENTS_DISABLE: +		error = perf_event_task_disable(); +		break; +	case PR_TASK_PERF_EVENTS_ENABLE: +		error = perf_event_task_enable(); +		break; +	case PR_GET_TIMERSLACK: +		error = current->timer_slack_ns; +		break; +	case PR_SET_TIMERSLACK: +		if (arg2 <= 0) +			current->timer_slack_ns =  					current->default_timer_slack_ns; -			else -				current->timer_slack_ns = arg2; -			break; -		case PR_MCE_KILL: -			if (arg4 | arg5) -				return -EINVAL; -			switch (arg2) { -			case PR_MCE_KILL_CLEAR: -				if (arg3 != 0) -					return -EINVAL; -				current->flags &= ~PF_MCE_PROCESS; -				break; -			case PR_MCE_KILL_SET: -				current->flags |= PF_MCE_PROCESS; -				if (arg3 == PR_MCE_KILL_EARLY) -					current->flags |= PF_MCE_EARLY; -				else if (arg3 == PR_MCE_KILL_LATE) -					current->flags &= ~PF_MCE_EARLY; -				else if (arg3 == PR_MCE_KILL_DEFAULT) -					current->flags &= -						~(PF_MCE_EARLY|PF_MCE_PROCESS); -				else -					return -EINVAL; -				break; -			default: +		else +			current->timer_slack_ns = arg2; +		break; +	case PR_MCE_KILL: +		if (arg4 | arg5) +			return -EINVAL; +		switch (arg2) { +		case PR_MCE_KILL_CLEAR: +			if (arg3 != 0)  				return -EINVAL; -			} +			current->flags &= ~PF_MCE_PROCESS;  			break; -		case PR_MCE_KILL_GET: -			if (arg2 | arg3 | arg4 | arg5) -				return -EINVAL; -			if (current->flags & PF_MCE_PROCESS) -				error = (current->flags & PF_MCE_EARLY) ? -					PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; +		case PR_MCE_KILL_SET: +			current->flags |= PF_MCE_PROCESS; +			if (arg3 == PR_MCE_KILL_EARLY) +				current->flags |= PF_MCE_EARLY; +			else if (arg3 == PR_MCE_KILL_LATE) +				current->flags &= ~PF_MCE_EARLY; +			else if (arg3 == PR_MCE_KILL_DEFAULT) +				current->flags &= +						~(PF_MCE_EARLY|PF_MCE_PROCESS);  			else -				error = PR_MCE_KILL_DEFAULT; -			break; -		case PR_SET_MM: -			error = prctl_set_mm(arg2, arg3, arg4, arg5); -			break; -		case PR_GET_TID_ADDRESS: -			error = prctl_get_tid_address(me, (int __user **)arg2); -			break; -		case PR_SET_CHILD_SUBREAPER: -			me->signal->is_child_subreaper = !!arg2; -			break; -		case PR_GET_CHILD_SUBREAPER: -			error = put_user(me->signal->is_child_subreaper, -					 (int __user *) arg2); -			break; -		case PR_SET_NO_NEW_PRIVS: -			if (arg2 != 1 || arg3 || arg4 || arg5)  				return -EINVAL; - -			current->no_new_privs = 1;  			break; -		case PR_GET_NO_NEW_PRIVS: -			if (arg2 || arg3 || arg4 || arg5) -				return -EINVAL; -			return current->no_new_privs ? 1 : 0;  		default: -			error = -EINVAL; -			break; +			return -EINVAL; +		} +		break; +	case PR_MCE_KILL_GET: +		if (arg2 | arg3 | arg4 | arg5) +			return -EINVAL; +		if (current->flags & PF_MCE_PROCESS) +			error = (current->flags & PF_MCE_EARLY) ? 
+				PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; +		else +			error = PR_MCE_KILL_DEFAULT; +		break; +	case PR_SET_MM: +		error = prctl_set_mm(arg2, arg3, arg4, arg5); +		break; +	case PR_GET_TID_ADDRESS: +		error = prctl_get_tid_address(me, (int __user **)arg2); +		break; +	case PR_SET_CHILD_SUBREAPER: +		me->signal->is_child_subreaper = !!arg2; +		break; +	case PR_GET_CHILD_SUBREAPER: +		error = put_user(me->signal->is_child_subreaper, +				 (int __user *)arg2); +		break; +	case PR_SET_NO_NEW_PRIVS: +		if (arg2 != 1 || arg3 || arg4 || arg5) +			return -EINVAL; + +		current->no_new_privs = 1; +		break; +	case PR_GET_NO_NEW_PRIVS: +		if (arg2 || arg3 || arg4 || arg5) +			return -EINVAL; +		return current->no_new_privs ? 1 : 0; +	default: +		error = -EINVAL; +		break;  	}  	return error;  } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c88878db491..d8df00e69c1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,7 @@  #include <linux/kmod.h>  #include <linux/capability.h>  #include <linux/binfmts.h> +#include <linux/sched/sysctl.h>  #include <asm/uaccess.h>  #include <asm/processor.h> @@ -104,7 +105,6 @@ extern char core_pattern[];  extern unsigned int core_pipe_limit;  #endif  extern int pid_max; -extern int min_free_kbytes;  extern int pid_max_min, pid_max_max;  extern int sysctl_drop_caches;  extern int percpu_pagelist_fraction; @@ -161,10 +161,13 @@ extern int unaligned_enabled;  #endif  #ifdef CONFIG_IA64 -extern int no_unaligned_warning;  extern int unaligned_dump_stack;  #endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN +extern int no_unaligned_warning; +#endif +  #ifdef CONFIG_PROC_SYSCTL  static int proc_do_cad_pid(struct ctl_table *table, int write,  		  void __user *buffer, size_t *lenp, loff_t *ppos); @@ -403,6 +406,13 @@ static struct ctl_table kern_table[] = {  		.mode		= 0644,  		.proc_handler	= sched_rt_handler,  	}, +	{ +		.procname	= "sched_rr_timeslice_ms", +		.data		= &sched_rr_timeslice, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= sched_rr_handler, +	},  #ifdef CONFIG_SCHED_AUTOGROUP  	{  		.procname	= "sched_autogroup_enabled", @@ -911,7 +921,7 @@ static struct ctl_table kern_table[] = {  		.proc_handler	= proc_doulongvec_minmax,  	},  #endif -#ifdef CONFIG_IA64 +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN  	{  		.procname	= "ignore-unaligned-usertrap",  		.data		= &no_unaligned_warning, @@ -919,6 +929,8 @@ static struct ctl_table kern_table[] = {  	 	.mode		= 0644,  		.proc_handler	= proc_dointvec,  	}, +#endif +#ifdef CONFIG_IA64  	{  		.procname	= "unaligned-dump-stack",  		.data		= &unaligned_dump_stack, @@ -2006,7 +2018,7 @@ static int proc_taint(struct ctl_table *table, int write,  		int i;  		for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {  			if ((tmptaint >> i) & 1) -				add_taint(i); +				add_taint(i, LOCKDEP_STILL_OK);  		}  	} diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 37f240fec37..b25115e8c7f 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = {  	{ CTL_INT,	NET_TCP_MODERATE_RCVBUF,		"tcp_moderate_rcvbuf" },  	{ CTL_INT,	NET_TCP_TSO_WIN_DIVISOR,		"tcp_tso_win_divisor" },  	{ CTL_STR,	NET_TCP_CONG_CONTROL,			"tcp_congestion_control" }, -	{ CTL_INT,	NET_TCP_ABC,				"tcp_abc" },  	{ CTL_INT,	NET_TCP_MTU_PROBING,			"tcp_mtu_probing" },  	{ CTL_INT,	NET_TCP_BASE_MSS,			"tcp_base_mss" },  	{ CTL_INT,	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS,	"tcp_workaround_signed_windows" }, diff --git a/kernel/time.c b/kernel/time.c 
index d226c6a3fd2..f8342a41efa 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -115,6 +115,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,  }  /* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + +/*   * Adjust the time obtained from the CMOS to be UTC time instead of   * local time.   * @@ -135,6 +141,8 @@ static inline void warp_clock(void)  	struct timespec adjust;  	adjust = current_kernel_time(); +	if (sys_tz.tz_minuteswest != 0) +		persistent_clock_is_local = 1;  	adjust.tv_sec += sys_tz.tz_minuteswest * 60;  	do_settimeofday(&adjust);  } @@ -232,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time);   * Avoid unnecessary multiplications/divisions in the   * two most common HZ cases:   */ -inline unsigned int jiffies_to_msecs(const unsigned long j) +unsigned int jiffies_to_msecs(const unsigned long j)  {  #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)  	return (MSEC_PER_SEC / HZ) * j; @@ -248,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j)  }  EXPORT_SYMBOL(jiffies_to_msecs); -inline unsigned int jiffies_to_usecs(const unsigned long j) +unsigned int jiffies_to_usecs(const unsigned long j)  {  #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)  	return (USEC_PER_SEC / HZ) * j; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8601f0db126..24510d84efd 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG  config ARCH_CLOCKSOURCE_DATA  	bool +# Platforms has a persistent clock +config ALWAYS_USE_PERSISTENT_CLOCK +	bool +	default n +  # Timekeeping vsyscall support  config GENERIC_TIME_VSYSCALL  	bool @@ -38,6 +43,10 @@ config GENERIC_CLOCKEVENTS_BUILD  	default y  	depends on GENERIC_CLOCKEVENTS +# Architecture can handle broadcast in a driver-agnostic way +config ARCH_HAS_TICK_BROADCAST +	bool +  # Clockevents broadcasting infrastructure  config GENERIC_CLOCKEVENTS_BROADCAST  	bool diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 30b6de0d977..c6d6400ee13 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev,  	clockevents_config(dev, freq);  	clockevents_register_device(dev);  } +EXPORT_SYMBOL_GPL(clockevents_config_and_register);  /**   * clockevents_update_freq - Update frequency and reprogram a clock event device. 
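Illustration (not part of the patch): the jiffies_to_msecs() hunk above only drops the `inline` keyword; the conversion it keeps special-cases configurations where HZ divides MSEC_PER_SEC evenly, so common HZ values need a single multiply. Below is a minimal userspace sketch of that fast path, with HZ fixed to an assumed value of 250 and a simplified fallback branch (the kernel's fallback uses a precomputed multiplier, not a plain divide).

#include <stdio.h>

#define MSEC_PER_SEC 1000
#define HZ           250        /* assumed CONFIG_HZ for this example only */

static unsigned int jiffies_to_msecs_sketch(unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
	/* fast path: 1000/250 = 4 ms per tick, one multiply */
	return (MSEC_PER_SEC / HZ) * j;
#else
	/* simplified fallback; the kernel uses a precomputed multiplier */
	return (j * MSEC_PER_SEC) / HZ;
#endif
}

int main(void)
{
	printf("500 jiffies at HZ=%d -> %u ms\n", HZ, jiffies_to_msecs_sketch(500));
	return 0;
}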
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 24174b4d669..072bb066bb7 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -15,6 +15,7 @@  #include <linux/time.h>  #include <linux/mm.h>  #include <linux/module.h> +#include <linux/rtc.h>  #include "tick-internal.h" @@ -22,7 +23,7 @@   * NTP timekeeping variables:   */ -DEFINE_SPINLOCK(ntp_lock); +DEFINE_RAW_SPINLOCK(ntp_lock);  /* USER_HZ period (usecs): */ @@ -347,7 +348,7 @@ void ntp_clear(void)  {  	unsigned long flags; -	spin_lock_irqsave(&ntp_lock, flags); +	raw_spin_lock_irqsave(&ntp_lock, flags);  	time_adjust	= 0;		/* stop active adjtime() */  	time_status	|= STA_UNSYNC; @@ -361,7 +362,7 @@ void ntp_clear(void)  	/* Clear PPS state variables */  	pps_clear(); -	spin_unlock_irqrestore(&ntp_lock, flags); +	raw_spin_unlock_irqrestore(&ntp_lock, flags);  } @@ -371,9 +372,9 @@ u64 ntp_tick_length(void)  	unsigned long flags;  	s64 ret; -	spin_lock_irqsave(&ntp_lock, flags); +	raw_spin_lock_irqsave(&ntp_lock, flags);  	ret = tick_length; -	spin_unlock_irqrestore(&ntp_lock, flags); +	raw_spin_unlock_irqrestore(&ntp_lock, flags);  	return ret;  } @@ -394,7 +395,7 @@ int second_overflow(unsigned long secs)  	int leap = 0;  	unsigned long flags; -	spin_lock_irqsave(&ntp_lock, flags); +	raw_spin_lock_irqsave(&ntp_lock, flags);  	/*  	 * Leap second processing. If in leap-insert state at the end of the @@ -478,13 +479,12 @@ int second_overflow(unsigned long secs)  	time_adjust = 0;  out: -	spin_unlock_irqrestore(&ntp_lock, flags); +	raw_spin_unlock_irqrestore(&ntp_lock, flags);  	return leap;  } -#ifdef CONFIG_GENERIC_CMOS_UPDATE - +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)  static void sync_cmos_clock(struct work_struct *work);  static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); @@ -510,14 +510,26 @@ static void sync_cmos_clock(struct work_struct *work)  	}  	getnstimeofday(&now); -	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) -		fail = update_persistent_clock(now); +	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { +		struct timespec adjust = now; + +		fail = -ENODEV; +		if (persistent_clock_is_local) +			adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); +#ifdef CONFIG_GENERIC_CMOS_UPDATE +		fail = update_persistent_clock(adjust); +#endif +#ifdef CONFIG_RTC_SYSTOHC +		if (fail == -ENODEV) +			fail = rtc_set_ntp_time(adjust); +#endif +	}  	next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);  	if (next.tv_nsec <= 0)  		next.tv_nsec += NSEC_PER_SEC; -	if (!fail) +	if (!fail || fail == -ENODEV)  		next.tv_sec = 659;  	else  		next.tv_sec = 0; @@ -660,7 +672,7 @@ int do_adjtimex(struct timex *txc)  	getnstimeofday(&ts); -	spin_lock_irq(&ntp_lock); +	raw_spin_lock_irq(&ntp_lock);  	if (txc->modes & ADJ_ADJTIME) {  		long save_adjust = time_adjust; @@ -702,7 +714,7 @@ int do_adjtimex(struct timex *txc)  	/* fill PPS status fields */  	pps_fill_timex(txc); -	spin_unlock_irq(&ntp_lock); +	raw_spin_unlock_irq(&ntp_lock);  	txc->time.tv_sec = ts.tv_sec;  	txc->time.tv_usec = ts.tv_nsec; @@ -900,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  	pts_norm = pps_normalize_ts(*phase_ts); -	spin_lock_irqsave(&ntp_lock, flags); +	raw_spin_lock_irqsave(&ntp_lock, flags);  	/* clear the error bits, they will be set again if needed */  	time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -913,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  	 * just start the frequency 
interval */  	if (unlikely(pps_fbase.tv_sec == 0)) {  		pps_fbase = *raw_ts; -		spin_unlock_irqrestore(&ntp_lock, flags); +		raw_spin_unlock_irqrestore(&ntp_lock, flags);  		return;  	} @@ -928,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  		time_status |= STA_PPSJITTER;  		/* restart the frequency calibration interval */  		pps_fbase = *raw_ts; -		spin_unlock_irqrestore(&ntp_lock, flags); +		raw_spin_unlock_irqrestore(&ntp_lock, flags);  		pr_err("hardpps: PPSJITTER: bad pulse\n");  		return;  	} @@ -945,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  	hardpps_update_phase(pts_norm.nsec); -	spin_unlock_irqrestore(&ntp_lock, flags); +	raw_spin_unlock_irqrestore(&ntp_lock, flags);  }  EXPORT_SYMBOL(hardpps); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f113755695e..2fb8cb88df8 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -18,6 +18,7 @@  #include <linux/percpu.h>  #include <linux/profile.h>  #include <linux/sched.h> +#include <linux/smp.h>  #include "tick-internal.h" @@ -86,6 +87,22 @@ int tick_is_broadcast_device(struct clock_event_device *dev)  	return (dev && tick_broadcast_device.evtdev == dev);  } +static void err_broadcast(const struct cpumask *mask) +{ +	pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); +} + +static void tick_device_setup_broadcast_func(struct clock_event_device *dev) +{ +	if (!dev->broadcast) +		dev->broadcast = tick_broadcast; +	if (!dev->broadcast) { +		pr_warn_once("%s depends on broadcast, but no broadcast function available\n", +			     dev->name); +		dev->broadcast = err_broadcast; +	} +} +  /*   * Check, if the device is disfunctional and a place holder, which   * needs to be handled by the broadcast device. @@ -105,6 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)  	 */  	if (!tick_device_is_functional(dev)) {  		dev->event_handler = tick_handle_periodic; +		tick_device_setup_broadcast_func(dev);  		cpumask_set_cpu(cpu, tick_get_broadcast_mask());  		tick_broadcast_start_periodic(tick_broadcast_device.evtdev);  		ret = 1; @@ -116,15 +134,33 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)  		 */  		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {  			int cpu = smp_processor_id(); -  			cpumask_clear_cpu(cpu, tick_get_broadcast_mask());  			tick_broadcast_clear_oneshot(cpu); +		} else { +			tick_device_setup_broadcast_func(dev);  		}  	}  	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);  	return ret;  } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +int tick_receive_broadcast(void) +{ +	struct tick_device *td = this_cpu_ptr(&tick_cpu_device); +	struct clock_event_device *evt = td->evtdev; + +	if (!evt) +		return -ENODEV; + +	if (!evt->event_handler) +		return -EINVAL; + +	evt->event_handler(evt); +	return 0; +} +#endif +  /*   * Broadcast the event to the cpus, which are set in the mask (mangled).   
*/ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d58e552d9fd..314b9ee07ed 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@  #include <linux/profile.h>  #include <linux/sched.h>  #include <linux/module.h> +#include <linux/irq_work.h>  #include <asm/irq_regs.h> @@ -28,7 +29,7 @@  /*   * Per cpu nohz control structure   */ -static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); +DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);  /*   * The time, when the last jiffy update happened. Protected by jiffies_lock. @@ -331,8 +332,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,  		time_delta = timekeeping_max_deferment();  	} while (read_seqretry(&jiffies_lock, seq)); -	if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || -	    arch_needs_cpu(cpu)) { +	if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || +	    arch_needs_cpu(cpu) || irq_work_needs_cpu()) {  		next_jiffies = last_jiffies + 1;  		delta_jiffies = 1;  	} else { @@ -631,8 +632,11 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)  static void tick_nohz_account_idle_ticks(struct tick_sched *ts)  { -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE  	unsigned long ticks; + +	if (vtime_accounting_enabled()) +		return;  	/*  	 * We stopped the tick in idle. Update process times would miss the  	 * time we slept as update_process_times does only a 1 tick diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cbc6acb0db3..9a0bc98fbe1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -29,6 +29,9 @@ static struct timekeeper timekeeper;  /* flag for if timekeeping is suspended */  int __read_mostly timekeeping_suspended; +/* Flag for if there is a persistent clock on this platform */ +bool __read_mostly persistent_clock_exist = false; +  static inline void tk_normalize_xtime(struct timekeeper *tk)  {  	while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { @@ -135,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)  }  /* Timekeeper helper functions. */ + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +u32 (*arch_gettimeoffset)(void); + +u32 get_arch_timeoffset(void) +{ +	if (likely(arch_gettimeoffset)) +		return arch_gettimeoffset(); +	return 0; +} +#else +static inline u32 get_arch_timeoffset(void) { return 0; } +#endif +  static inline s64 timekeeping_get_ns(struct timekeeper *tk)  {  	cycle_t cycle_now, cycle_delta; @@ -151,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk)  	nsec = cycle_delta * tk->mult + tk->xtime_nsec;  	nsec >>= tk->shift; -	/* If arch requires, add in gettimeoffset() */ -	return nsec + arch_gettimeoffset(); +	/* If arch requires, add in get_arch_timeoffset() */ +	return nsec + get_arch_timeoffset();  }  static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) @@ -171,8 +188,8 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)  	/* convert delta to nanoseconds. 
*/  	nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); -	/* If arch requires, add in gettimeoffset() */ -	return nsec + arch_gettimeoffset(); +	/* If arch requires, add in get_arch_timeoffset() */ +	return nsec + get_arch_timeoffset();  }  static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); @@ -254,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)  	tk->xtime_nsec += cycle_delta * tk->mult; -	/* If arch requires, add in gettimeoffset() */ -	tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; +	/* If arch requires, add in get_arch_timeoffset() */ +	tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift;  	tk_normalize_xtime(tk); @@ -264,19 +281,18 @@ static void timekeeping_forward_now(struct timekeeper *tk)  }  /** - * getnstimeofday - Returns the time of day in a timespec + * __getnstimeofday - Returns the time of day in a timespec.   * @ts:		pointer to the timespec to be set   * - * Returns the time of day in a timespec. + * Updates the time of day in the timespec. + * Returns 0 on success, or -ve when suspended (timespec will be undefined).   */ -void getnstimeofday(struct timespec *ts) +int __getnstimeofday(struct timespec *ts)  {  	struct timekeeper *tk = &timekeeper;  	unsigned long seq;  	s64 nsecs = 0; -	WARN_ON(timekeeping_suspended); -  	do {  		seq = read_seqbegin(&tk->lock); @@ -287,6 +303,26 @@ void getnstimeofday(struct timespec *ts)  	ts->tv_nsec = 0;  	timespec_add_ns(ts, nsecs); + +	/* +	 * Do not bail out early, in case there were callers still using +	 * the value, even in the face of the WARN_ON. +	 */ +	if (unlikely(timekeeping_suspended)) +		return -EAGAIN; +	return 0; +} +EXPORT_SYMBOL(__getnstimeofday); + +/** + * getnstimeofday - Returns the time of day in a timespec. + * @ts:		pointer to the timespec to be set + * + * Returns the time of day in a timespec (WARN if suspended). + */ +void getnstimeofday(struct timespec *ts) +{ +	WARN_ON(__getnstimeofday(ts));  }  EXPORT_SYMBOL(getnstimeofday); @@ -640,12 +676,14 @@ void __init timekeeping_init(void)  	struct timespec now, boot, tmp;  	read_persistent_clock(&now); +  	if (!timespec_valid_strict(&now)) {  		pr_warn("WARNING: Persistent clock returned invalid value!\n"  			"         Check your CMOS/BIOS settings.\n");  		now.tv_sec = 0;  		now.tv_nsec = 0; -	} +	} else if (now.tv_sec || now.tv_nsec) +		persistent_clock_exist = true;  	read_boot_clock(&boot);  	if (!timespec_valid_strict(&boot)) { @@ -718,11 +756,12 @@ void timekeeping_inject_sleeptime(struct timespec *delta)  {  	struct timekeeper *tk = &timekeeper;  	unsigned long flags; -	struct timespec ts; -	/* Make sure we don't set the clock twice */ -	read_persistent_clock(&ts); -	if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) +	/* +	 * Make sure we don't set the clock twice, as timekeeping_resume() +	 * already did it +	 */ +	if (has_persistent_clock())  		return;  	write_seqlock_irqsave(&tk->lock, flags); diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl index eb51d76e058..3f42652a6a3 100644 --- a/kernel/timeconst.pl +++ b/kernel/timeconst.pl @@ -369,10 +369,8 @@ if ($hz eq '--can') {  		die "Usage: $0 HZ\n";  	} -	@val = @{$canned_values{$hz}}; -	if (!defined(@val)) { -		@val = compute_values($hz); -	} +	$cv = $canned_values{$hz}; +	@val = defined($cv) ? 
@$cv : compute_values($hz);  	output($hz, @val);  }  exit 0; diff --git a/kernel/timer.c b/kernel/timer.c index 367d0085848..dbf7a78a1ef 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -39,6 +39,7 @@  #include <linux/kallsyms.h>  #include <linux/irq_work.h>  #include <linux/sched.h> +#include <linux/sched/sysctl.h>  #include <linux/slab.h>  #include <asm/uaccess.h> @@ -1351,7 +1352,6 @@ void update_process_times(int user_tick)  	account_process_tick(p, user_tick);  	run_local_timers();  	rcu_check_callbacks(cpu, user_tick); -	printk_tick();  #ifdef CONFIG_IRQ_WORK  	if (in_irq())  		irq_work_run(); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d89335a485..192473b2279 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE  	help  	  See Documentation/trace/ftrace-design.txt +config HAVE_DYNAMIC_FTRACE_WITH_REGS +	bool +  config HAVE_FTRACE_MCOUNT_RECORD  	bool  	help @@ -78,21 +81,6 @@ config EVENT_TRACING  	select CONTEXT_SWITCH_TRACER  	bool -config EVENT_POWER_TRACING_DEPRECATED -	depends on EVENT_TRACING -	bool "Deprecated power event trace API, to be removed" -	default y -	help -	  Provides old power event types: -	  C-state/idle accounting events: -	  power:power_start -	  power:power_end -	  and old cpufreq accounting event: -	  power:power_frequency -	  This is for userspace compatibility -	  and will vanish after 5 kernel iterations, -	  namely 3.1. -  config CONTEXT_SWITCH_TRACER  	bool @@ -250,6 +238,16 @@ config FTRACE_SYSCALLS  	help  	  Basic tracer to catch the syscall entry and exit events. +config TRACER_SNAPSHOT +	bool "Create a snapshot trace buffer" +	select TRACER_MAX_TRACE +	help +	  Allow tracing users to take snapshot of the current buffer using the +	  ftrace interface, e.g.: + +	      echo 1 > /sys/kernel/debug/tracing/snapshot +	      cat snapshot +  config TRACE_BRANCH_PROFILING  	bool  	select GENERIC_TRACER @@ -434,6 +432,11 @@ config DYNAMIC_FTRACE  	  were made. If so, it runs stop_machine (stops all CPUS)  	  and modifies the code to jump over the call to ftrace. +config DYNAMIC_FTRACE_WITH_REGS +	def_bool y +	depends on DYNAMIC_FTRACE +	depends on HAVE_DYNAMIC_FTRACE_WITH_REGS +  config FUNCTION_PROFILER  	bool "Kernel function profiler"  	depends on FUNCTION_TRACER diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c0bd0308741..71259e2b6b6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)  		return;  	local_irq_save(flags); -	buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); +	buf = this_cpu_ptr(bt->msg_data);  	va_start(args, fmt);  	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);  	va_end(args); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3ffe4c5ad3f..98ca94a4181 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);  #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)  #endif +/* + * Traverse the ftrace_global_list, invoking all entries.  The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism.  The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! 
+ */ +#define do_for_each_ftrace_op(op, list)			\ +	op = rcu_dereference_raw(list);			\ +	do + +/* + * Optimized for just a single item in the list (as that is the normal case). + */ +#define while_for_each_ftrace_op(op)				\ +	while (likely(op = rcu_dereference_raw((op)->next)) &&	\ +	       unlikely((op) != &ftrace_list_end)) +  /**   * ftrace_nr_registered_ops - return number of ops registered   * @@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void)  	return cnt;  } -/* - * Traverse the ftrace_global_list, invoking all entries.  The reason that we - * can use rcu_dereference_raw() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism.  The rcu_dereference_raw() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! - */  static void  ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,  			struct ftrace_ops *op, struct pt_regs *regs)  { -	if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) +	int bit; + +	bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); +	if (bit < 0)  		return; -	trace_recursion_set(TRACE_GLOBAL_BIT); -	op = rcu_dereference_raw(ftrace_global_list); /*see above*/ -	while (op != &ftrace_list_end) { +	do_for_each_ftrace_op(op, ftrace_global_list) {  		op->func(ip, parent_ip, op, regs); -		op = rcu_dereference_raw(op->next); /*see above*/ -	}; -	trace_recursion_clear(TRACE_GLOBAL_BIT); +	} while_for_each_ftrace_op(op); + +	trace_clear_recursion(bit);  }  static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, @@ -221,10 +233,24 @@ static void update_global_ops(void)  	 * registered callers.  	 */  	if (ftrace_global_list == &ftrace_list_end || -	    ftrace_global_list->next == &ftrace_list_end) +	    ftrace_global_list->next == &ftrace_list_end) {  		func = ftrace_global_list->func; -	else +		/* +		 * As we are calling the function directly. +		 * If it does not have recursion protection, +		 * the function_trace_op needs to be updated +		 * accordingly. +		 */ +		if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) +			global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; +		else +			global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; +	} else {  		func = ftrace_global_list_func; +		/* The list has its own recursion protection. */ +		global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; +	} +  	/* If we filter on pids, update to use the pid function */  	if (!list_empty(&ftrace_pids)) { @@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)  	if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)  		return -EINVAL; -#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS  	/*  	 * If the ftrace_ops specifies SAVE_REGS, then it only can be used  	 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. 
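Illustration (not part of the patch): the do_for_each_ftrace_op()/while_for_each_ftrace_op() pair introduced above walks a singly linked ops list that ends in a sentinel entry rather than NULL, keeping the single-callback case cheap. A standalone sketch of the same shape, with plain pointer loads standing in for rcu_dereference_raw() and a cut-down struct in place of struct ftrace_ops:

#include <stdio.h>

struct op {
	struct op	*next;
	void		(*func)(const char *name);
	const char	*name;
};

/* self-linked sentinel terminator, analogous to ftrace_list_end */
static struct op list_end = { .next = &list_end, .name = "end" };

#define do_for_each_op(op, list)	\
	op = (list);			\
	do

#define while_for_each_op(op)		\
	while ((op = (op)->next) != &list_end)

static void say(const char *name)
{
	printf("callback %s invoked\n", name);
}

int main(void)
{
	struct op b = { .next = &list_end, .func = say, .name = "b" };
	struct op a = { .next = &b,        .func = say, .name = "a" };
	struct op *op;

	/* body runs for "a", then for "b", then the sentinel stops the loop */
	do_for_each_op(op, &a) {
		op->func(op->name);
	} while_for_each_op(op);

	return 0;
}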
@@ -3970,35 +3996,49 @@ static void ftrace_init_module(struct module *mod,  	ftrace_process_locs(mod, start, end);  } -static int ftrace_module_notify(struct notifier_block *self, -				unsigned long val, void *data) +static int ftrace_module_notify_enter(struct notifier_block *self, +				      unsigned long val, void *data)  {  	struct module *mod = data; -	switch (val) { -	case MODULE_STATE_COMING: +	if (val == MODULE_STATE_COMING)  		ftrace_init_module(mod, mod->ftrace_callsites,  				   mod->ftrace_callsites +  				   mod->num_ftrace_callsites); -		break; -	case MODULE_STATE_GOING: +	return 0; +} + +static int ftrace_module_notify_exit(struct notifier_block *self, +				     unsigned long val, void *data) +{ +	struct module *mod = data; + +	if (val == MODULE_STATE_GOING)  		ftrace_release_mod(mod); -		break; -	}  	return 0;  }  #else -static int ftrace_module_notify(struct notifier_block *self, -				unsigned long val, void *data) +static int ftrace_module_notify_enter(struct notifier_block *self, +				      unsigned long val, void *data) +{ +	return 0; +} +static int ftrace_module_notify_exit(struct notifier_block *self, +				     unsigned long val, void *data)  {  	return 0;  }  #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { -	.notifier_call = ftrace_module_notify, -	.priority = 0, +struct notifier_block ftrace_module_enter_nb = { +	.notifier_call = ftrace_module_notify_enter, +	.priority = INT_MAX,	/* Run before anything that can use kprobes */ +}; + +struct notifier_block ftrace_module_exit_nb = { +	.notifier_call = ftrace_module_notify_exit, +	.priority = INT_MIN,	/* Run after anything that can remove kprobes */  };  extern unsigned long __start_mcount_loc[]; @@ -4032,9 +4072,13 @@ void __init ftrace_init(void)  				  __start_mcount_loc,  				  __stop_mcount_loc); -	ret = register_module_notifier(&ftrace_module_nb); +	ret = register_module_notifier(&ftrace_module_enter_nb);  	if (ret) -		pr_warning("Failed to register trace ftrace module notifier\n"); +		pr_warning("Failed to register trace ftrace module enter notifier\n"); + +	ret = register_module_notifier(&ftrace_module_exit_nb); +	if (ret) +		pr_warning("Failed to register trace ftrace module exit notifier\n");  	set_ftrace_early_filters(); @@ -4090,14 +4134,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,  	 */  	preempt_disable_notrace();  	trace_recursion_set(TRACE_CONTROL_BIT); -	op = rcu_dereference_raw(ftrace_control_list); -	while (op != &ftrace_list_end) { +	do_for_each_ftrace_op(op, ftrace_control_list) {  		if (!ftrace_function_local_disabled(op) &&  		    ftrace_ops_test(op, ip))  			op->func(ip, parent_ip, op, regs); - -		op = rcu_dereference_raw(op->next); -	}; +	} while_for_each_ftrace_op(op);  	trace_recursion_clear(TRACE_CONTROL_BIT);  	preempt_enable_notrace();  } @@ -4112,27 +4153,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,  		       struct ftrace_ops *ignored, struct pt_regs *regs)  {  	struct ftrace_ops *op; +	int bit;  	if (function_trace_stop)  		return; -	if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) +	bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); +	if (bit < 0)  		return; -	trace_recursion_set(TRACE_INTERNAL_BIT);  	/*  	 * Some of the ops may be dynamically allocated,  	 * they must be freed after a synchronize_sched().  	 
*/  	preempt_disable_notrace(); -	op = rcu_dereference_raw(ftrace_ops_list); -	while (op != &ftrace_list_end) { +	do_for_each_ftrace_op(op, ftrace_ops_list) {  		if (ftrace_ops_test(op, ip))  			op->func(ip, parent_ip, op, regs); -		op = rcu_dereference_raw(op->next); -	}; +	} while_for_each_ftrace_op(op);  	preempt_enable_notrace(); -	trace_recursion_clear(TRACE_INTERNAL_BIT); +	trace_clear_recursion(bit);  }  /* @@ -4143,8 +4183,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,   * Archs are to support both the regs and ftrace_ops at the same time.   * If they support ftrace_ops, it is assumed they support regs.   * If call backs want to use regs, they must either check for regs - * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. - * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. + * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. + * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.   * An architecture can pass partial regs with ftrace_ops and still   * set the ARCH_SUPPORT_FTARCE_OPS.   */ diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index f55fcf61b22..1c71382b283 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,8 +13,5 @@  #define CREATE_TRACE_POINTS  #include <trace/events/power.h> -#ifdef EVENT_POWER_TRACING_DEPRECATED -EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); -#endif  EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ce8514feedc..7244acde77b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3,8 +3,10 @@   *   * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>   */ +#include <linux/ftrace_event.h>  #include <linux/ring_buffer.h>  #include <linux/trace_clock.h> +#include <linux/trace_seq.h>  #include <linux/spinlock.h>  #include <linux/debugfs.h>  #include <linux/uaccess.h> @@ -21,7 +23,6 @@  #include <linux/fs.h>  #include <asm/local.h> -#include "trace.h"  static void update_pages_handler(struct work_struct *work); @@ -2432,41 +2433,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,  #ifdef CONFIG_TRACING -#define TRACE_RECURSIVE_DEPTH 16 +/* + * The lock and unlock are done within a preempt disable section. + * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + *  bit 0 =  NMI context + *  bit 1 =  IRQ context + *  bit 2 =  SoftIRQ context + *  bit 3 =  normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + *  101 - 1 = 100 + *  101 & 100 = 100 (clearing bit zero) + * + *  1010 - 1 = 1001 + *  1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. 
+ */ +static DEFINE_PER_CPU(unsigned int, current_context); -/* Keep this code out of the fast path cache */ -static noinline void trace_recursive_fail(void) +static __always_inline int trace_recursive_lock(void)  { -	/* Disable all tracing before we do anything else */ -	tracing_off_permanent(); - -	printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" -		    "HC[%lu]:SC[%lu]:NMI[%lu]\n", -		    trace_recursion_buffer(), -		    hardirq_count() >> HARDIRQ_SHIFT, -		    softirq_count() >> SOFTIRQ_SHIFT, -		    in_nmi()); - -	WARN_ON_ONCE(1); -} +	unsigned int val = this_cpu_read(current_context); +	int bit; -static inline int trace_recursive_lock(void) -{ -	trace_recursion_inc(); +	if (in_interrupt()) { +		if (in_nmi()) +			bit = 0; +		else if (in_irq()) +			bit = 1; +		else +			bit = 2; +	} else +		bit = 3; -	if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) -		return 0; +	if (unlikely(val & (1 << bit))) +		return 1; -	trace_recursive_fail(); +	val |= (1 << bit); +	this_cpu_write(current_context, val); -	return -1; +	return 0;  } -static inline void trace_recursive_unlock(void) +static __always_inline void trace_recursive_unlock(void)  { -	WARN_ON_ONCE(!trace_recursion_buffer()); +	unsigned int val = this_cpu_read(current_context); -	trace_recursion_dec(); +	val--; +	val &= this_cpu_read(current_context); +	this_cpu_write(current_context, val);  }  #else @@ -3067,6 +3103,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)  EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);  /** + * ring_buffer_read_events_cpu - get the number of events successfully read + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of events read + */ +unsigned long +ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; + +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return 0; + +	cpu_buffer = buffer->buffers[cpu]; +	return cpu_buffer->read; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); + +/**   * ring_buffer_entries - get the number of entries in a buffer   * @buffer: The ring buffer   * @@ -3425,7 +3479,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)  	/* check for end of page padding */  	if ((iter->head >= rb_page_size(iter->head_page)) &&  	    (iter->head_page != cpu_buffer->commit_page)) -		rb_advance_iter(iter); +		rb_inc_iter(iter);  }  static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c13e46d7d2..c2e2c231037 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -39,6 +39,7 @@  #include <linux/poll.h>  #include <linux/nmi.h>  #include <linux/fs.h> +#include <linux/sched/rt.h>  #include "trace.h"  #include "trace_output.h" @@ -249,7 +250,7 @@ static unsigned long		trace_buf_size = TRACE_BUF_SIZE_DEFAULT;  static struct tracer		*trace_types __read_mostly;  /* current_trace points to the tracer that is currently active */ -static struct tracer		*current_trace __read_mostly; +static struct tracer		*current_trace __read_mostly = &nop_trace;  /*   * trace_types_lock is used to protect the trace_types list. 
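Illustration (not part of the patch): trace_recursive_unlock() above relies on the identity that (val - 1) & val clears the lowest set bit, and the bit set by the matching trace_recursive_lock() is always the lowest one, because a context can only be preempted by contexts with lower bit numbers (NMI=0, IRQ=1, SoftIRQ=2, normal=3). A standalone check of the two binary examples worked through in the ring_buffer.c comment:

#include <assert.h>
#include <stdio.h>

static unsigned int clear_lowest_set_bit(unsigned int val)
{
	return (val - 1) & val;
}

int main(void)
{
	/* the examples from the ring_buffer.c comment */
	assert(clear_lowest_set_bit(0x5) == 0x4);   /*  101 ->  100 */
	assert(clear_lowest_set_bit(0xa) == 0x8);   /* 1010 -> 1000 */

	/* normal context holds bit 3, an IRQ "locks" bit 1, then unlocks */
	unsigned int ctx = 1u << 3;
	ctx |= 1u << 1;
	ctx = clear_lowest_set_bit(ctx);
	assert(ctx == 1u << 3);

	puts("lowest-set-bit trick checks out");
	return 0;
}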
@@ -709,10 +710,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  		return;  	WARN_ON_ONCE(!irqs_disabled()); -	if (!current_trace->use_max_tr) { -		WARN_ON_ONCE(1); + +	if (!current_trace->allocated_snapshot) { +		/* Only the nop tracer should hit this when disabling */ +		WARN_ON_ONCE(current_trace != &nop_trace);  		return;  	} +  	arch_spin_lock(&ftrace_max_lock);  	tr->buffer = max_tr.buffer; @@ -739,10 +743,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  		return;  	WARN_ON_ONCE(!irqs_disabled()); -	if (!current_trace->use_max_tr) { -		WARN_ON_ONCE(1); +	if (WARN_ON_ONCE(!current_trace->allocated_snapshot))  		return; -	}  	arch_spin_lock(&ftrace_max_lock); @@ -862,10 +864,13 @@ int register_tracer(struct tracer *type)  		current_trace = type; -		/* If we expanded the buffers, make sure the max is expanded too */ -		if (ring_buffer_expanded && type->use_max_tr) -			ring_buffer_resize(max_tr.buffer, trace_buf_size, -						RING_BUFFER_ALL_CPUS); +		if (type->use_max_tr) { +			/* If we expanded the buffers, make sure the max is expanded too */ +			if (ring_buffer_expanded) +				ring_buffer_resize(max_tr.buffer, trace_buf_size, +						   RING_BUFFER_ALL_CPUS); +			type->allocated_snapshot = true; +		}  		/* the test is responsible for initializing and enabling */  		pr_info("Testing tracer %s: ", type->name); @@ -881,10 +886,14 @@ int register_tracer(struct tracer *type)  		/* Only reset on passing, to avoid touching corrupted buffers */  		tracing_reset_online_cpus(tr); -		/* Shrink the max buffer again */ -		if (ring_buffer_expanded && type->use_max_tr) -			ring_buffer_resize(max_tr.buffer, 1, -						RING_BUFFER_ALL_CPUS); +		if (type->use_max_tr) { +			type->allocated_snapshot = false; + +			/* Shrink the max buffer again */ +			if (ring_buffer_expanded) +				ring_buffer_resize(max_tr.buffer, 1, +						   RING_BUFFER_ALL_CPUS); +		}  		printk(KERN_CONT "PASSED\n");  	} @@ -922,6 +931,9 @@ void tracing_reset(struct trace_array *tr, int cpu)  {  	struct ring_buffer *buffer = tr->buffer; +	if (!buffer) +		return; +  	ring_buffer_record_disable(buffer);  	/* Make sure all commits have finished */ @@ -936,6 +948,9 @@ void tracing_reset_online_cpus(struct trace_array *tr)  	struct ring_buffer *buffer = tr->buffer;  	int cpu; +	if (!buffer) +		return; +  	ring_buffer_record_disable(buffer);  	/* Make sure all commits have finished */ @@ -1167,7 +1182,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,  	entry->preempt_count		= pc & 0xff;  	entry->pid			= (tsk) ? tsk->pid : 0; -	entry->padding			= 0;  	entry->flags =  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT  		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1335,7 +1349,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,  	 */  	preempt_disable_notrace(); -	use_stack = ++__get_cpu_var(ftrace_stack_reserve); +	use_stack = __this_cpu_inc_return(ftrace_stack_reserve);  	/*  	 * We don't need any atomic variables, just a barrier.  	 
* If an interrupt comes in, we don't care, because it would @@ -1389,7 +1403,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,   out:  	/* Again, don't let gcc optimize things here */  	barrier(); -	__get_cpu_var(ftrace_stack_reserve)--; +	__this_cpu_dec(ftrace_stack_reserve);  	preempt_enable_notrace();  } @@ -1517,7 +1531,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer;  static char *get_trace_buf(void)  {  	struct trace_buffer_struct *percpu_buffer; -	struct trace_buffer_struct *buffer;  	/*  	 * If we have allocated per cpu buffers, then we do not @@ -1535,9 +1548,7 @@ static char *get_trace_buf(void)  	if (!percpu_buffer)  		return NULL; -	buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); - -	return buffer->buffer; +	return this_cpu_ptr(&percpu_buffer->buffer[0]);  }  static int alloc_percpu_trace_buffer(void) @@ -1942,21 +1953,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)  static void *s_start(struct seq_file *m, loff_t *pos)  {  	struct trace_iterator *iter = m->private; -	static struct tracer *old_tracer;  	int cpu_file = iter->cpu_file;  	void *p = NULL;  	loff_t l = 0;  	int cpu; -	/* copy the tracer to avoid using a global lock all around */ +	/* +	 * copy the tracer to avoid using a global lock all around. +	 * iter->trace is a copy of current_trace, the pointer to the +	 * name may be used instead of a strcmp(), as iter->trace->name +	 * will point to the same string as current_trace->name. +	 */  	mutex_lock(&trace_types_lock); -	if (unlikely(old_tracer != current_trace && current_trace)) { -		old_tracer = current_trace; +	if (unlikely(current_trace && iter->trace->name != current_trace->name))  		*iter->trace = *current_trace; -	}  	mutex_unlock(&trace_types_lock); -	atomic_inc(&trace_record_cmdline_disabled); +	if (iter->snapshot && iter->trace->use_max_tr) +		return ERR_PTR(-EBUSY); + +	if (!iter->snapshot) +		atomic_inc(&trace_record_cmdline_disabled);  	if (*pos != iter->pos) {  		iter->ent = NULL; @@ -1995,7 +2012,11 @@ static void s_stop(struct seq_file *m, void *p)  {  	struct trace_iterator *iter = m->private; -	atomic_dec(&trace_record_cmdline_disabled); +	if (iter->snapshot && iter->trace->use_max_tr) +		return; + +	if (!iter->snapshot) +		atomic_dec(&trace_record_cmdline_disabled);  	trace_access_unlock(iter->cpu_file);  	trace_event_read_unlock();  } @@ -2080,8 +2101,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)  	unsigned long total;  	const char *name = "preemption"; -	if (type) -		name = type->name; +	name = type->name;  	get_total_entries(tr, &total, &entries); @@ -2430,7 +2450,7 @@ static const struct seq_operations tracer_seq_ops = {  };  static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file) +__tracing_open(struct inode *inode, struct file *file, bool snapshot)  {  	long cpu_file = (long) inode->i_private;  	struct trace_iterator *iter; @@ -2457,16 +2477,16 @@ __tracing_open(struct inode *inode, struct file *file)  	if (!iter->trace)  		goto fail; -	if (current_trace) -		*iter->trace = *current_trace; +	*iter->trace = *current_trace;  	if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))  		goto fail; -	if (current_trace && current_trace->print_max) +	if (current_trace->print_max || snapshot)  		iter->tr = &max_tr;  	else  		iter->tr = &global_trace; +	iter->snapshot = snapshot;  	iter->pos = -1;  	mutex_init(&iter->mutex);  	iter->cpu_file = cpu_file; @@ -2483,8 +2503,9 @@ __tracing_open(struct inode *inode, struct file *file)  	if 
(trace_clocks[trace_clock_id].in_ns)  		iter->iter_flags |= TRACE_FILE_TIME_IN_NS; -	/* stop the trace while dumping */ -	tracing_stop(); +	/* stop the trace while dumping if we are not opening "snapshot" */ +	if (!iter->snapshot) +		tracing_stop();  	if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {  		for_each_tracing_cpu(cpu) { @@ -2547,8 +2568,9 @@ static int tracing_release(struct inode *inode, struct file *file)  	if (iter->trace && iter->trace->close)  		iter->trace->close(iter); -	/* reenable tracing if it was previously enabled */ -	tracing_start(); +	if (!iter->snapshot) +		/* reenable tracing if it was previously enabled */ +		tracing_start();  	mutex_unlock(&trace_types_lock);  	mutex_destroy(&iter->mutex); @@ -2576,7 +2598,7 @@ static int tracing_open(struct inode *inode, struct file *file)  	}  	if (file->f_mode & FMODE_READ) { -		iter = __tracing_open(inode, file); +		iter = __tracing_open(inode, file, false);  		if (IS_ERR(iter))  			ret = PTR_ERR(iter);  		else if (trace_flags & TRACE_ITER_LATENCY_FMT) @@ -3014,10 +3036,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,  	int r;  	mutex_lock(&trace_types_lock); -	if (current_trace) -		r = sprintf(buf, "%s\n", current_trace->name); -	else -		r = sprintf(buf, "\n"); +	r = sprintf(buf, "%s\n", current_trace->name);  	mutex_unlock(&trace_types_lock);  	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3183,6 +3202,7 @@ static int tracing_set_tracer(const char *buf)  	static struct trace_option_dentry *topts;  	struct trace_array *tr = &global_trace;  	struct tracer *t; +	bool had_max_tr;  	int ret = 0;  	mutex_lock(&trace_types_lock); @@ -3207,9 +3227,21 @@ static int tracing_set_tracer(const char *buf)  		goto out;  	trace_branch_disable(); -	if (current_trace && current_trace->reset) +	if (current_trace->reset)  		current_trace->reset(tr); -	if (current_trace && current_trace->use_max_tr) { + +	had_max_tr = current_trace->allocated_snapshot; +	current_trace = &nop_trace; + +	if (had_max_tr && !t->use_max_tr) { +		/* +		 * We need to make sure that the update_max_tr sees that +		 * current_trace changed to nop_trace to keep it from +		 * swapping the buffers after we resize it. +		 * The update_max_tr is called from interrupts disabled +		 * so a synchronized_sched() is sufficient. +		 */ +		synchronize_sched();  		/*  		 * We don't free the ring buffer. instead, resize it because  		 * The max_tr ring buffer has some state (e.g. 
ring->clock) and @@ -3217,18 +3249,19 @@ static int tracing_set_tracer(const char *buf)  		 */  		ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);  		set_buffer_entries(&max_tr, 1); +		tracing_reset_online_cpus(&max_tr); +		current_trace->allocated_snapshot = false;  	}  	destroy_trace_option_files(topts); -	current_trace = &nop_trace; -  	topts = create_trace_option_files(t); -	if (t->use_max_tr) { +	if (t->use_max_tr && !had_max_tr) {  		/* we need to make per cpu buffer sizes equivalent */  		ret = resize_buffer_duplicate_size(&max_tr, &global_trace,  						   RING_BUFFER_ALL_CPUS);  		if (ret < 0)  			goto out; +		t->allocated_snapshot = true;  	}  	if (t->init) { @@ -3336,8 +3369,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)  		ret = -ENOMEM;  		goto fail;  	} -	if (current_trace) -		*iter->trace = *current_trace; +	*iter->trace = *current_trace;  	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {  		ret = -ENOMEM; @@ -3477,7 +3509,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  		  size_t cnt, loff_t *ppos)  {  	struct trace_iterator *iter = filp->private_data; -	static struct tracer *old_tracer;  	ssize_t sret;  	/* return any leftover data */ @@ -3489,10 +3520,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  	/* copy the tracer to avoid using a global lock all around */  	mutex_lock(&trace_types_lock); -	if (unlikely(old_tracer != current_trace && current_trace)) { -		old_tracer = current_trace; +	if (unlikely(iter->trace->name != current_trace->name))  		*iter->trace = *current_trace; -	}  	mutex_unlock(&trace_types_lock);  	/* @@ -3648,7 +3677,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  		.ops		= &tracing_pipe_buf_ops,  		.spd_release	= tracing_spd_release_pipe,  	}; -	static struct tracer *old_tracer;  	ssize_t ret;  	size_t rem;  	unsigned int i; @@ -3658,10 +3686,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  	/* copy the tracer to avoid using a global lock all around */  	mutex_lock(&trace_types_lock); -	if (unlikely(old_tracer != current_trace && current_trace)) { -		old_tracer = current_trace; +	if (unlikely(iter->trace->name != current_trace->name))  		*iter->trace = *current_trace; -	}  	mutex_unlock(&trace_types_lock);  	mutex_lock(&iter->mutex); @@ -4037,8 +4063,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,  	 * Reset the buffer so that it doesn't have incomparable timestamps.  	 
*/  	tracing_reset_online_cpus(&global_trace); -	if (max_tr.buffer) -		tracing_reset_online_cpus(&max_tr); +	tracing_reset_online_cpus(&max_tr);  	mutex_unlock(&trace_types_lock); @@ -4054,6 +4079,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file)  	return single_open(file, tracing_clock_show, NULL);  } +#ifdef CONFIG_TRACER_SNAPSHOT +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ +	struct trace_iterator *iter; +	int ret = 0; + +	if (file->f_mode & FMODE_READ) { +		iter = __tracing_open(inode, file, true); +		if (IS_ERR(iter)) +			ret = PTR_ERR(iter); +	} +	return ret; +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, +		       loff_t *ppos) +{ +	unsigned long val; +	int ret; + +	ret = tracing_update_buffers(); +	if (ret < 0) +		return ret; + +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret) +		return ret; + +	mutex_lock(&trace_types_lock); + +	if (current_trace->use_max_tr) { +		ret = -EBUSY; +		goto out; +	} + +	switch (val) { +	case 0: +		if (current_trace->allocated_snapshot) { +			/* free spare buffer */ +			ring_buffer_resize(max_tr.buffer, 1, +					   RING_BUFFER_ALL_CPUS); +			set_buffer_entries(&max_tr, 1); +			tracing_reset_online_cpus(&max_tr); +			current_trace->allocated_snapshot = false; +		} +		break; +	case 1: +		if (!current_trace->allocated_snapshot) { +			/* allocate spare buffer */ +			ret = resize_buffer_duplicate_size(&max_tr, +					&global_trace, RING_BUFFER_ALL_CPUS); +			if (ret < 0) +				break; +			current_trace->allocated_snapshot = true; +		} + +		local_irq_disable(); +		/* Now, we're going to swap */ +		update_max_tr(&global_trace, current, smp_processor_id()); +		local_irq_enable(); +		break; +	default: +		if (current_trace->allocated_snapshot) +			tracing_reset_online_cpus(&max_tr); +		else +			ret = -EINVAL; +		break; +	} + +	if (ret >= 0) { +		*ppos += cnt; +		ret = cnt; +	} +out: +	mutex_unlock(&trace_types_lock); +	return ret; +} +#endif /* CONFIG_TRACER_SNAPSHOT */ + +  static const struct file_operations tracing_max_lat_fops = {  	.open		= tracing_open_generic,  	.read		= tracing_max_lat_read, @@ -4110,6 +4216,16 @@ static const struct file_operations trace_clock_fops = {  	.write		= tracing_clock_write,  }; +#ifdef CONFIG_TRACER_SNAPSHOT +static const struct file_operations snapshot_fops = { +	.open		= tracing_snapshot_open, +	.read		= seq_read, +	.write		= tracing_snapshot_write, +	.llseek		= tracing_seek, +	.release	= tracing_release, +}; +#endif /* CONFIG_TRACER_SNAPSHOT */ +  struct ftrace_buffer_info {  	struct trace_array	*tr;  	void			*spare; @@ -4414,6 +4530,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf,  	cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);  	trace_seq_printf(s, "dropped events: %ld\n", cnt); +	cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); +	trace_seq_printf(s, "read events: %ld\n", cnt); +  	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);  	kfree(s); @@ -4490,7 +4609,7 @@ struct dentry *tracing_init_dentry(void)  static struct dentry *d_percpu; -struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(void)  {  	static int once;  	struct dentry *d_tracer; @@ -4906,6 +5025,11 @@ static __init int tracer_init_debugfs(void)  			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);  #endif +#ifdef CONFIG_TRACER_SNAPSHOT +	trace_create_file("snapshot", 0644, d_tracer, +			  (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); +#endif +  	
create_trace_options_dir();  	for_each_tracing_cpu(cpu) @@ -5014,6 +5138,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  	if (disable_tracing)  		ftrace_kill(); +	/* Simulate the iterator */  	trace_init_global_iter(&iter);  	for_each_tracing_cpu(cpu) { @@ -5025,10 +5150,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  	/* don't look at user memory in panic mode */  	trace_flags &= ~TRACE_ITER_SYM_USEROBJ; -	/* Simulate the iterator */ -	iter.tr = &global_trace; -	iter.trace = current_trace; -  	switch (oops_dump_mode) {  	case DUMP_ALL:  		iter.cpu_file = TRACE_PIPE_ALL_CPU; @@ -5173,7 +5294,7 @@ __init static int tracer_alloc_buffers(void)  	init_irq_work(&trace_work_wakeup, trace_wake_up);  	register_tracer(&nop_trace); -	current_trace = &nop_trace; +  	/* All seems OK, enable tracing */  	tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c75d7988902..57d7e5397d5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -287,20 +287,62 @@ struct tracer {  	struct tracer_flags	*flags;  	bool			print_max;  	bool			use_max_tr; +	bool			allocated_snapshot;  };  /* Only current can touch trace_recursion */ -#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) -#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) -/* Ring buffer has the 10 LSB bits to count */ -#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) +/* + * For function tracing recursion: + *  The order of these bits are important. + * + *  When function tracing occurs, the following steps are made: + *   If arch does not support a ftrace feature: + *    call internal function (uses INTERNAL bits) which calls... + *   If callback is registered to the "global" list, the list + *    function is called and recursion checks the GLOBAL bits. + *    then this function calls... + *   The function callback, which can use the FTRACE bits to + *    check for recursion. + * + * Now if the arch does not suppport a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. + */ +enum { +	TRACE_BUFFER_BIT, +	TRACE_BUFFER_NMI_BIT, +	TRACE_BUFFER_IRQ_BIT, +	TRACE_BUFFER_SIRQ_BIT, + +	/* Start of function recursion bits */ +	TRACE_FTRACE_BIT, +	TRACE_FTRACE_NMI_BIT, +	TRACE_FTRACE_IRQ_BIT, +	TRACE_FTRACE_SIRQ_BIT, + +	/* GLOBAL_BITs must be greater than FTRACE_BITs */ +	TRACE_GLOBAL_BIT, +	TRACE_GLOBAL_NMI_BIT, +	TRACE_GLOBAL_IRQ_BIT, +	TRACE_GLOBAL_SIRQ_BIT, + +	/* INTERNAL_BITs must be greater than GLOBAL_BITs */ +	TRACE_INTERNAL_BIT, +	TRACE_INTERNAL_NMI_BIT, +	TRACE_INTERNAL_IRQ_BIT, +	TRACE_INTERNAL_SIRQ_BIT, -/* for function tracing recursion */ -#define TRACE_INTERNAL_BIT		(1<<11) -#define TRACE_GLOBAL_BIT		(1<<12) -#define TRACE_CONTROL_BIT		(1<<13) +	TRACE_CONTROL_BIT,  /*   * Abuse of the trace_recursion. @@ -309,11 +351,77 @@ struct tracer {   * was called in irq context but we have irq tracing off. Since this   * can only be modified by current, we can reuse trace_recursion.   
*/ -#define TRACE_IRQ_BIT			(1<<13) +	TRACE_IRQ_BIT, +}; + +#define trace_recursion_set(bit)	do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit)	do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit)	((current)->trace_recursion & (1<<(bit))) + +#define TRACE_CONTEXT_BITS	4 + +#define TRACE_FTRACE_START	TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX	((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_GLOBAL_START	TRACE_GLOBAL_BIT +#define TRACE_GLOBAL_MAX	((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START	TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX		((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK	TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ +	int bit; + +	if (in_interrupt()) { +		if (in_nmi()) +			bit = 0; + +		else if (in_irq()) +			bit = 1; +		else +			bit = 2; +	} else +		bit = 3; + +	return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ +	unsigned int val = current->trace_recursion; +	int bit; + +	/* A previous recursion check was made */ +	if ((val & TRACE_CONTEXT_MASK) > max) +		return 0; + +	bit = trace_get_context_bit() + start; +	if (unlikely(val & (1 << bit))) +		return -1; -#define trace_recursion_set(bit)	do { (current)->trace_recursion |= (bit); } while (0) -#define trace_recursion_clear(bit)	do { (current)->trace_recursion &= ~(bit); } while (0) -#define trace_recursion_test(bit)	((current)->trace_recursion & (bit)) +	val |= 1 << bit; +	current->trace_recursion = val; +	barrier(); + +	return bit; +} + +static __always_inline void trace_clear_recursion(int bit) +{ +	unsigned int val = current->trace_recursion; + +	if (!bit) +		return; + +	bit = 1 << bit; +	val &= ~bit; + +	barrier(); +	current->trace_recursion = val; +}  #define TRACE_PIPE_ALL_CPU	-1 diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 394783531cb..aa8f5f48dae 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -21,8 +21,6 @@  #include <linux/ktime.h>  #include <linux/trace_clock.h> -#include "trace.h" -  /*   * trace_clock_local(): the simplest and least coherent tracing clock.   * @@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void)  	return clock;  } +EXPORT_SYMBOL_GPL(trace_clock_local);  /*   * trace_clock(): 'between' trace clock. 
Not completely serialized, @@ -86,7 +85,7 @@ u64 notrace trace_clock_global(void)  	local_irq_save(flags);  	this_cpu = raw_smp_processor_id(); -	now = cpu_clock(this_cpu); +	now = sched_clock_cpu(this_cpu);  	/*  	 * If in an NMI context then dont risk lockups and return the  	 * cpu_clock() time: diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 880073d0b94..57e9b284250 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void)  	__common_field(unsigned char, flags);  	__common_field(unsigned char, preempt_count);  	__common_field(int, pid); -	__common_field(int, padding);  	return ret;  } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8e3ad8082ab..60115252332 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr)  	tracing_reset_online_cpus(tr);  } -static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, -				 struct ftrace_ops *op, struct pt_regs *pt_regs) -{ -	struct trace_array *tr = func_trace; -	struct trace_array_cpu *data; -	unsigned long flags; -	long disabled; -	int cpu; -	int pc; - -	if (unlikely(!ftrace_function_enabled)) -		return; - -	pc = preempt_count(); -	preempt_disable_notrace(); -	local_save_flags(flags); -	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; -	disabled = atomic_inc_return(&data->disabled); - -	if (likely(disabled == 1)) -		trace_function(tr, ip, parent_ip, flags, pc); - -	atomic_dec(&data->disabled); -	preempt_enable_notrace(); -} -  /* Our option */  enum {  	TRACE_FUNC_OPT_STACK	= 0x1, @@ -85,34 +57,34 @@ static struct tracer_flags func_flags;  static void  function_trace_call(unsigned long ip, unsigned long parent_ip,  		    struct ftrace_ops *op, struct pt_regs *pt_regs) -  {  	struct trace_array *tr = func_trace;  	struct trace_array_cpu *data;  	unsigned long flags; -	long disabled; +	int bit;  	int cpu;  	int pc;  	if (unlikely(!ftrace_function_enabled))  		return; -	/* -	 * Need to use raw, since this must be called before the -	 * recursive protection is performed. 
-	 */ -	local_irq_save(flags); -	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; -	disabled = atomic_inc_return(&data->disabled); +	pc = preempt_count(); +	preempt_disable_notrace(); -	if (likely(disabled == 1)) { -		pc = preempt_count(); +	bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); +	if (bit < 0) +		goto out; + +	cpu = smp_processor_id(); +	data = tr->data[cpu]; +	if (!atomic_read(&data->disabled)) { +		local_save_flags(flags);  		trace_function(tr, ip, parent_ip, flags, pc);  	} +	trace_clear_recursion(bit); -	atomic_dec(&data->disabled); -	local_irq_restore(flags); + out: +	preempt_enable_notrace();  }  static void @@ -185,11 +157,6 @@ static void tracing_start_function_trace(void)  {  	ftrace_function_enabled = 0; -	if (trace_flags & TRACE_ITER_PREEMPTONLY) -		trace_ops.func = function_trace_call_preempt_only; -	else -		trace_ops.func = function_trace_call; -  	if (func_flags.val & TRACE_FUNC_OPT_STACK)  		register_ftrace_function(&trace_stack_ops);  	else diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4edb4b74eb7..39ada66389c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -47,6 +47,8 @@ struct fgraph_data {  #define TRACE_GRAPH_PRINT_ABS_TIME	0x20  #define TRACE_GRAPH_PRINT_IRQS		0x40 +static unsigned int max_depth; +  static struct tracer_opt trace_opts[] = {  	/* Display overruns? (for self-debug purpose) */  	{ TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, @@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)  	ftrace_pop_return_trace(&trace, &ret, frame_pointer);  	trace.rettime = trace_clock_local(); -	ftrace_graph_return(&trace);  	barrier();  	current->curr_ret_stack--; +	/* +	 * The trace should run after decrementing the ret counter +	 * in case an interrupt were to come in. We don't want to +	 * lose the interrupt if max_depth is set. +	 */ +	ftrace_graph_return(&trace); +  	if (unlikely(!ret)) {  		ftrace_graph_stop();  		WARN_ON(1); @@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)  		return 0;  	/* trace it when it is-nested-in or is a function enabled. 
*/ -	if (!(trace->depth || ftrace_graph_addr(trace->func)) || -	      ftrace_graph_ignore_irqs()) +	if ((!(trace->depth || ftrace_graph_addr(trace->func)) || +	     ftrace_graph_ignore_irqs()) || +	    (max_depth && trace->depth >= max_depth))  		return 0;  	local_irq_save(flags); @@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = {  #endif  }; + +static ssize_t +graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, +		  loff_t *ppos) +{ +	unsigned long val; +	int ret; + +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret) +		return ret; + +	max_depth = val; + +	*ppos += cnt; + +	return cnt; +} + +static ssize_t +graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, +		 loff_t *ppos) +{ +	char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ +	int n; + +	n = sprintf(buf, "%d\n", max_depth); + +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static const struct file_operations graph_depth_fops = { +	.open		= tracing_open_generic, +	.write		= graph_depth_write, +	.read		= graph_depth_read, +	.llseek		= generic_file_llseek, +}; + +static __init int init_graph_debugfs(void) +{ +	struct dentry *d_tracer; + +	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0; + +	trace_create_file("max_graph_depth", 0644, d_tracer, +			  NULL, &graph_depth_fops); + +	return 0; +} +fs_initcall(init_graph_debugfs); +  static __init int init_graph_trace(void)  {  	max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 93370867781..5c7e09d10d7 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -66,7 +66,6 @@  #define TP_FLAG_TRACE		1  #define TP_FLAG_PROFILE		2  #define TP_FLAG_REGISTERED	4 -#define TP_FLAG_UPROBE		8  /* data_rloc: data relative location, compatible with u32 */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9fe45fcefca..75aa97fbe1a 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,8 +15,8 @@  #include <linux/kallsyms.h>  #include <linux/uaccess.h>  #include <linux/ftrace.h> +#include <linux/sched/rt.h>  #include <trace/events/sched.h> -  #include "trace.h"  static struct trace_array	*wakeup_trace; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 47623169a81..51c819c12c2 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip,  	 * The ftrace infrastructure should provide the recursion  	 * protection. If not, this will crash the kernel!  	 */ -	trace_selftest_recursion_cnt++; +	if (trace_selftest_recursion_cnt++ > 10) +		return;  	DYN_FTRACE_TEST_NAME();  } @@ -452,7 +453,6 @@ trace_selftest_function_recursion(void)  	char *func_name;  	int len;  	int ret; -	int cnt;  	/* The previous test PASSED */  	pr_cont("PASSED\n"); @@ -510,19 +510,10 @@ trace_selftest_function_recursion(void)  	unregister_ftrace_function(&test_recsafe_probe); -	/* -	 * If arch supports all ftrace features, and no other task -	 * was on the list, we should be fine. 
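Stepping back to the max_graph_depth control file created by init_graph_debugfs() in the trace_functions_graph.c hunk above, a user-space sketch of how it might be used could look like the following; the debugfs path is an assumption, and a depth of 1 restricts the graph tracer to the outermost functions only (trace->depth starts at 0, and entries with depth >= max_depth are dropped by trace_graph_entry()).

/* Illustrative only: limit function_graph output to top-level calls. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/tracing/max_graph_depth", O_WRONLY);

	if (fd < 0) {
		perror("max_graph_depth");
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);
	return 0;	/* write "0" later to remove the limit again */
}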
-	 */ -	if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) -		cnt = 2; /* Should have recursed */ -	else -		cnt = 1; -  	ret = -1; -	if (trace_selftest_recursion_cnt != cnt) { -		pr_cont("*callback not called expected %d times (%d)* ", -			cnt, trace_selftest_recursion_cnt); +	if (trace_selftest_recursion_cnt != 2) { +		pr_cont("*callback not called expected 2 times (%d)* ", +			trace_selftest_recursion_cnt);  		goto out;  	} @@ -568,7 +559,7 @@ trace_selftest_function_regs(void)  	int ret;  	int supported = 0; -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS  	supported = 1;  #endif diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7609dd6714c..7a809e32105 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,6 @@  #include <trace/syscall.h>  #include <trace/events/syscalls.h> +#include <linux/syscalls.h>  #include <linux/slab.h>  #include <linux/kernel.h>  #include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ @@ -47,6 +48,38 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name  }  #endif +#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +/* + * Some architectures that allow for 32bit applications + * to run on a 64bit kernel, do not map the syscalls for + * the 32bit tasks the same as they do for 64bit tasks. + * + *     *cough*x86*cough* + * + * In such a case, instead of reporting the wrong syscalls, + * simply ignore them. + * + * For an arch to ignore the compat syscalls it needs to + * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as + * define the function arch_trace_is_compat_syscall() to let + * the tracing system know that it should ignore it. + */ +static int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ +	if (unlikely(arch_trace_is_compat_syscall(regs))) +		return -1; + +	return syscall_get_nr(task, regs); +} +#else +static inline int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ +	return syscall_get_nr(task, regs); +} +#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */ +  static __init struct syscall_metadata *  find_syscall_meta(unsigned long syscall)  { @@ -77,7 +110,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)  	return syscalls_metadata[nr];  } -enum print_line_t +static enum print_line_t  print_syscall_enter(struct trace_iterator *iter, int flags,  		    struct trace_event *event)  { @@ -130,7 +163,7 @@ end:  	return TRACE_TYPE_HANDLED;  } -enum print_line_t +static enum print_line_t  print_syscall_exit(struct trace_iterator *iter, int flags,  		   struct trace_event *event)  { @@ -270,16 +303,16 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)  	return ret;  } -void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  {  	struct syscall_trace_enter *entry;  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer; -	int size;  	int syscall_nr; +	int size; -	syscall_nr = syscall_get_nr(current, regs); +	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return;  	if (!test_bit(syscall_nr, enabled_enter_syscalls)) @@ -305,7 +338,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  		trace_current_buffer_unlock_commit(buffer, event, 0, 0);  } -void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void 
*ignore, struct pt_regs *regs, long ret)  {  	struct syscall_trace_exit *entry;  	struct syscall_metadata *sys_data; @@ -313,7 +346,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	struct ring_buffer *buffer;  	int syscall_nr; -	syscall_nr = syscall_get_nr(current, regs); +	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return;  	if (!test_bit(syscall_nr, enabled_exit_syscalls)) @@ -337,7 +370,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  		trace_current_buffer_unlock_commit(buffer, event, 0, 0);  } -int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_call *call)  {  	int ret = 0;  	int num; @@ -356,7 +389,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)  	return ret;  } -void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_call *call)  {  	int num; @@ -371,7 +404,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)  	mutex_unlock(&syscall_trace_lock);  } -int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_call *call)  {  	int ret = 0;  	int num; @@ -390,7 +423,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)  	return ret;  } -void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_call *call)  {  	int num; @@ -459,7 +492,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)  	return (unsigned long)sys_call_table[nr];  } -int __init init_ftrace_syscalls(void) +static int __init init_ftrace_syscalls(void)  {  	struct syscall_metadata *meta;  	unsigned long addr; @@ -502,7 +535,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	int rctx;  	int size; -	syscall_nr = syscall_get_nr(current, regs); +	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return;  	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) @@ -578,7 +611,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	int rctx;  	int size; -	syscall_nr = syscall_get_nr(current, regs); +	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return;  	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c86e6d4f67f..8dad2a92dee 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -28,20 +28,21 @@  #define UPROBE_EVENT_SYSTEM	"uprobes" +struct trace_uprobe_filter { +	rwlock_t		rwlock; +	int			nr_systemwide; +	struct list_head	perf_events; +}; +  /*   * uprobe event core functions   */ -struct trace_uprobe; -struct uprobe_trace_consumer { -	struct uprobe_consumer		cons; -	struct trace_uprobe		*tu; -}; -  struct trace_uprobe {  	struct list_head		list;  	struct ftrace_event_class	class;  	struct ftrace_event_call	call; -	struct uprobe_trace_consumer	*consumer; +	struct trace_uprobe_filter	filter; +	struct uprobe_consumer		consumer;  	struct inode			*inode;  	char				*filename;  	unsigned long			offset; @@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list);  static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) +{ +	rwlock_init(&filter->rwlock); +	filter->nr_systemwide = 0; +	INIT_LIST_HEAD(&filter->perf_events); +} + +static inline bool 
uprobe_filter_is_empty(struct trace_uprobe_filter *filter) +{ +	return !filter->nr_systemwide && list_empty(&filter->perf_events); +} +  /*   * Allocate new trace_uprobe and initialize it (including uprobes).   */ @@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)  		goto error;  	INIT_LIST_HEAD(&tu->list); +	tu->consumer.handler = uprobe_dispatcher; +	init_trace_uprobe_filter(&tu->filter);  	return tu;  error: @@ -253,12 +268,18 @@ static int create_trace_uprobe(int argc, char **argv)  	if (ret)  		goto fail_address_parse; +	inode = igrab(path.dentry->d_inode); +	path_put(&path); + +	if (!inode || !S_ISREG(inode->i_mode)) { +		ret = -EINVAL; +		goto fail_address_parse; +	} +  	ret = kstrtoul(arg, 0, &offset);  	if (ret)  		goto fail_address_parse; -	inode = igrab(path.dentry->d_inode); -  	argc -= 2;  	argv += 2; @@ -356,7 +377,7 @@ fail_address_parse:  	if (inode)  		iput(inode); -	pr_info("Failed to parse address.\n"); +	pr_info("Failed to parse address or file.\n");  	return ret;  } @@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {  };  /* uprobe handler */ -static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)  {  	struct uprobe_trace_entry_head *entry;  	struct ring_buffer_event *event; @@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)  	unsigned long irq_flags;  	struct ftrace_event_call *call = &tu->call; -	tu->nhit++; -  	local_save_flags(irq_flags);  	pc = preempt_count(); @@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)  	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,  						  size, irq_flags, pc);  	if (!event) -		return; +		return 0;  	entry = ring_buffer_event_data(event); -	entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); +	entry->ip = instruction_pointer(task_pt_regs(current));  	data = (u8 *)&entry[1];  	for (i = 0; i < tu->nr_args; i++)  		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);  	if (!filter_current_check_discard(buffer, call, entry, event))  		trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + +	return 0;  }  /* Event entry printers */ @@ -533,42 +554,43 @@ partial:  	return TRACE_TYPE_PARTIAL_LINE;  } -static int probe_event_enable(struct trace_uprobe *tu, int flag) +static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)  { -	struct uprobe_trace_consumer *utc; -	int ret = 0; +	return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); +} -	if (!tu->inode || tu->consumer) -		return -EINTR; +typedef bool (*filter_func_t)(struct uprobe_consumer *self, +				enum uprobe_filter_ctx ctx, +				struct mm_struct *mm); -	utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); -	if (!utc) +static int +probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) +{ +	int ret = 0; + +	if (is_trace_uprobe_enabled(tu))  		return -EINTR; -	utc->cons.handler = uprobe_dispatcher; -	utc->cons.filter = NULL; -	ret = uprobe_register(tu->inode, tu->offset, &utc->cons); -	if (ret) { -		kfree(utc); -		return ret; -	} +	WARN_ON(!uprobe_filter_is_empty(&tu->filter));  	tu->flags |= flag; -	utc->tu = tu; -	tu->consumer = utc; +	tu->consumer.filter = filter; +	ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); +	if (ret) +		tu->flags &= ~flag; -	return 0; +	return ret;  }  static void probe_event_disable(struct trace_uprobe *tu, int 
flag)  { -	if (!tu->inode || !tu->consumer) +	if (!is_trace_uprobe_enabled(tu))  		return; -	uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); +	WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + +	uprobe_unregister(tu->inode, tu->offset, &tu->consumer);  	tu->flags &= ~flag; -	kfree(tu->consumer); -	tu->consumer = NULL;  }  static int uprobe_event_define_fields(struct ftrace_event_call *event_call) @@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)  }  #ifdef CONFIG_PERF_EVENTS +static bool +__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) +{ +	struct perf_event *event; + +	if (filter->nr_systemwide) +		return true; + +	list_for_each_entry(event, &filter->perf_events, hw.tp_list) { +		if (event->hw.tp_target->mm == mm) +			return true; +	} + +	return false; +} + +static inline bool +uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +{ +	return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); +} + +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +{ +	bool done; + +	write_lock(&tu->filter.rwlock); +	if (event->hw.tp_target) { +		/* +		 * event->parent != NULL means copy_process(), we can avoid +		 * uprobe_apply(). current->mm must be probed and we can rely +		 * on dup_mmap() which preserves the already installed bp's. +		 * +		 * attr.enable_on_exec means that exec/mmap will install the +		 * breakpoints we need. +		 */ +		done = tu->filter.nr_systemwide || +			event->parent || event->attr.enable_on_exec || +			uprobe_filter_event(tu, event); +		list_add(&event->hw.tp_list, &tu->filter.perf_events); +	} else { +		done = tu->filter.nr_systemwide; +		tu->filter.nr_systemwide++; +	} +	write_unlock(&tu->filter.rwlock); + +	if (!done) +		uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + +	return 0; +} + +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +{ +	bool done; + +	write_lock(&tu->filter.rwlock); +	if (event->hw.tp_target) { +		list_del(&event->hw.tp_list); +		done = tu->filter.nr_systemwide || +			(event->hw.tp_target->flags & PF_EXITING) || +			uprobe_filter_event(tu, event); +	} else { +		tu->filter.nr_systemwide--; +		done = tu->filter.nr_systemwide; +	} +	write_unlock(&tu->filter.rwlock); + +	if (!done) +		uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + +	return 0; +} + +static bool uprobe_perf_filter(struct uprobe_consumer *uc, +				enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ +	struct trace_uprobe *tu; +	int ret; + +	tu = container_of(uc, struct trace_uprobe, consumer); +	read_lock(&tu->filter.rwlock); +	ret = __uprobe_perf_filter(&tu->filter, mm); +	read_unlock(&tu->filter.rwlock); + +	return ret; +} +  /* uprobe profile handler */ -static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)  {  	struct ftrace_event_call *call = &tu->call;  	struct uprobe_trace_entry_head *entry; @@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)  	int size, __size, i;  	int rctx; +	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) +		return UPROBE_HANDLER_REMOVE; +  	__size = sizeof(*entry) + tu->size;  	size = ALIGN(__size + sizeof(u32), sizeof(u64));  	size -= sizeof(u32);  	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) -		return; +		return 0;  	preempt_disable(); @@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, 
struct pt_regs *regs)  	if (!entry)  		goto out; -	entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); +	entry->ip = instruction_pointer(task_pt_regs(current));  	data = (u8 *)&entry[1];  	for (i = 0; i < tu->nr_args; i++)  		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); @@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)   out:  	preempt_enable(); +	return 0;  }  #endif	/* CONFIG_PERF_EVENTS */ @@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,  	switch (type) {  	case TRACE_REG_REGISTER: -		return probe_event_enable(tu, TP_FLAG_TRACE); +		return probe_event_enable(tu, TP_FLAG_TRACE, NULL);  	case TRACE_REG_UNREGISTER:  		probe_event_disable(tu, TP_FLAG_TRACE); @@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,  #ifdef CONFIG_PERF_EVENTS  	case TRACE_REG_PERF_REGISTER: -		return probe_event_enable(tu, TP_FLAG_PROFILE); +		return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);  	case TRACE_REG_PERF_UNREGISTER:  		probe_event_disable(tu, TP_FLAG_PROFILE);  		return 0; + +	case TRACE_REG_PERF_OPEN: +		return uprobe_perf_open(tu, data); + +	case TRACE_REG_PERF_CLOSE: +		return uprobe_perf_close(tu, data); +  #endif  	default:  		return 0; @@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,  static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)  { -	struct uprobe_trace_consumer *utc;  	struct trace_uprobe *tu; +	int ret = 0; -	utc = container_of(con, struct uprobe_trace_consumer, cons); -	tu = utc->tu; -	if (!tu || tu->consumer != utc) -		return 0; +	tu = container_of(con, struct trace_uprobe, consumer); +	tu->nhit++;  	if (tu->flags & TP_FLAG_TRACE) -		uprobe_trace_func(tu, regs); +		ret |= uprobe_trace_func(tu, regs);  #ifdef CONFIG_PERF_EVENTS  	if (tu->flags & TP_FLAG_PROFILE) -		uprobe_perf_func(tu, regs); +		ret |= uprobe_perf_func(tu, regs);  #endif -	return 0; +	return ret;  }  static struct trace_event_functions uprobe_funcs = { diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 625df0b4469..a1dd9a1b132 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,  {  	const struct cred *tcred;  	struct timespec uptime, ts; +	cputime_t utime, stime, utimescaled, stimescaled;  	u64 ac_etime;  	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); @@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns,  	stats->ac_ppid	 = pid_alive(tsk) ?  		
task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;  	rcu_read_unlock(); -	stats->ac_utime = cputime_to_usecs(tsk->utime); -	stats->ac_stime = cputime_to_usecs(tsk->stime); -	stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); -	stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); + +	task_cputime(tsk, &utime, &stime); +	stats->ac_utime = cputime_to_usecs(utime); +	stats->ac_stime = cputime_to_usecs(stime); + +	task_cputime_scaled(tsk, &utimescaled, &stimescaled); +	stats->ac_utimescaled = cputime_to_usecs(utimescaled); +	stats->ac_stimescaled = cputime_to_usecs(stimescaled); +  	stats->ac_minflt = tsk->min_flt;  	stats->ac_majflt = tsk->maj_flt; @@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)  #undef KB  #undef MB -/** - * acct_update_integrals - update mm integral fields in task_struct - * @tsk: task_struct for accounting - */ -void acct_update_integrals(struct task_struct *tsk) +static void __acct_update_integrals(struct task_struct *tsk, +				    cputime_t utime, cputime_t stime)  {  	if (likely(tsk->mm)) {  		cputime_t time, dtime; @@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk)  		u64 delta;  		local_irq_save(flags); -		time = tsk->stime + tsk->utime; +		time = stime + utime;  		dtime = time - tsk->acct_timexpd;  		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);  		delta = value.tv_sec; @@ -145,6 +148,27 @@ void acct_update_integrals(struct task_struct *tsk)  }  /** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting + */ +void acct_update_integrals(struct task_struct *tsk) +{ +	cputime_t utime, stime; + +	task_cputime(tsk, &utime, &stime); +	__acct_update_integrals(tsk, utime, stime); +} + +/** + * acct_account_cputime - update mm integral after cputime update + * @tsk: task_struct for accounting + */ +void acct_account_cputime(struct task_struct *tsk) +{ +	__acct_update_integrals(tsk, tsk->utime, tsk->stime); +} + +/**   * acct_clear_integrals - clear the mm integral fields in task_struct   * @tsk: task_struct whose accounting fields are cleared   */ diff --git a/kernel/user.c b/kernel/user.c index 33acb5e53a5..57ebfd42023 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -47,9 +47,7 @@ struct user_namespace init_user_ns = {  			.count = 4294967295U,  		},  	}, -	.kref = { -		.refcount	= ATOMIC_INIT(3), -	}, +	.count = ATOMIC_INIT(3),  	.owner = GLOBAL_ROOT_UID,  	.group = GLOBAL_ROOT_GID,  	.proc_inum = PROC_USER_INIT_INO, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 2b042c42fbc..8b650837083 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -78,7 +78,7 @@ int create_user_ns(struct cred *new)  		return ret;  	} -	kref_init(&ns->kref); +	atomic_set(&ns->count, 1);  	/* Leave the new->user_ns reference with the new user namespace. 
*/  	ns->parent = parent_ns;  	ns->owner = owner; @@ -104,15 +104,16 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)  	return create_user_ns(cred);  } -void free_user_ns(struct kref *kref) +void free_user_ns(struct user_namespace *ns)  { -	struct user_namespace *parent, *ns = -		container_of(kref, struct user_namespace, kref); +	struct user_namespace *parent; -	parent = ns->parent; -	proc_free_inum(ns->proc_inum); -	kmem_cache_free(user_ns_cachep, ns); -	put_user_ns(parent); +	do { +		parent = ns->parent; +		proc_free_inum(ns->proc_inum); +		kmem_cache_free(user_ns_cachep, ns); +		ns = parent; +	} while (atomic_dec_and_test(&parent->count));  }  EXPORT_SYMBOL(free_user_ns); @@ -519,6 +520,42 @@ struct seq_operations proc_projid_seq_operations = {  	.show = projid_m_show,  }; +static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) +{ +	u32 upper_first, lower_first, upper_last, lower_last; +	unsigned idx; + +	upper_first = extent->first; +	lower_first = extent->lower_first; +	upper_last = upper_first + extent->count - 1; +	lower_last = lower_first + extent->count - 1; + +	for (idx = 0; idx < new_map->nr_extents; idx++) { +		u32 prev_upper_first, prev_lower_first; +		u32 prev_upper_last, prev_lower_last; +		struct uid_gid_extent *prev; + +		prev = &new_map->extent[idx]; + +		prev_upper_first = prev->first; +		prev_lower_first = prev->lower_first; +		prev_upper_last = prev_upper_first + prev->count - 1; +		prev_lower_last = prev_lower_first + prev->count - 1; + +		/* Does the upper range intersect a previous extent? */ +		if ((prev_upper_first <= upper_last) && +		    (prev_upper_last >= upper_first)) +			return true; + +		/* Does the lower range intersect a previous extent? */ +		if ((prev_lower_first <= lower_last) && +		    (prev_lower_last >= lower_first)) +			return true; +	} +	return false; +} + +  static DEFINE_MUTEX(id_map_mutex);  static ssize_t map_write(struct file *file, const char __user *buf, @@ -531,7 +568,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,  	struct user_namespace *ns = seq->private;  	struct uid_gid_map new_map;  	unsigned idx; -	struct uid_gid_extent *extent, *last = NULL; +	struct uid_gid_extent *extent = NULL;  	unsigned long page = 0;  	char *kbuf, *pos, *next_line;  	ssize_t ret = -EINVAL; @@ -634,14 +671,11 @@ static ssize_t map_write(struct file *file, const char __user *buf,  		if ((extent->lower_first + extent->count) <= extent->lower_first)  			goto out; -		/* For now only accept extents that are strictly in order */ -		if (last && -		    (((last->first + last->count) > extent->first) || -		     ((last->lower_first + last->count) > extent->lower_first))) +		/* Do the ranges in extent overlap any previous extents? */ +		if (mappings_overlap(&new_map, extent))  			goto out;  		new_map.nr_extents++; -		last = extent;  		/* Fail if the file contains too many extents */  		if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 75a2ab3d0b0..4a944676358 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -23,6 +23,7 @@  #include <linux/module.h>  #include <linux/sysctl.h>  #include <linux/smpboot.h> +#include <linux/sched/rt.h>  #include <asm/irq_regs.h>  #include <linux/kvm_para.h> @@ -112,9 +113,9 @@ static int get_softlockup_thresh(void)   * resolution, and we don't need to waste time with a big divide when   * 2^30ns == 1.074s.   
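Returning to mappings_overlap() in the user_namespace.c hunk above: the test it applies against each previously accepted extent is the standard closed-interval intersection check, that is, two ranges overlap exactly when each one starts no later than the other one ends. A small stand-alone illustration, with invented extents, is shown below; only the comparison mirrors the kernel code.

/* Stand-alone illustration of the interval test used by mappings_overlap(). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool ranges_overlap(uint32_t first_a, uint32_t count_a,
			   uint32_t first_b, uint32_t count_b)
{
	uint32_t last_a = first_a + count_a - 1;
	uint32_t last_b = first_b + count_b - 1;

	return first_a <= last_b && last_a >= first_b;
}

int main(void)
{
	/* 0-999 next to 1000-1999: adjacent but disjoint, accepted */
	printf("%d\n", ranges_overlap(0, 1000, 1000, 1000));	/* prints 0 */
	/* 0-999 against 999-1998: ID 999 is claimed twice, rejected */
	printf("%d\n", ranges_overlap(0, 1000, 999, 1000));	/* prints 1 */
	return 0;
}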
*/ -static unsigned long get_timestamp(int this_cpu) +static unsigned long get_timestamp(void)  { -	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */ +	return local_clock() >> 30LL;  /* 2^30 ~= 10^9 */  }  static void set_sample_period(void) @@ -132,9 +133,7 @@ static void set_sample_period(void)  /* Commands for resetting the watchdog */  static void __touch_watchdog(void)  { -	int this_cpu = smp_processor_id(); - -	__this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); +	__this_cpu_write(watchdog_touch_ts, get_timestamp());  }  void touch_softlockup_watchdog(void) @@ -195,7 +194,7 @@ static int is_hardlockup(void)  static int is_softlockup(unsigned long touch_ts)  { -	unsigned long now = get_timestamp(smp_processor_id()); +	unsigned long now = get_timestamp();  	/* Warn about unreasonable delays: */  	if (time_after(now, touch_ts + get_softlockup_thresh())) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fbc6576a83c..f4feacad381 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,32 +41,31 @@  #include <linux/debug_locks.h>  #include <linux/lockdep.h>  #include <linux/idr.h> +#include <linux/hashtable.h> -#include "workqueue_sched.h" +#include "workqueue_internal.h"  enum {  	/* -	 * global_cwq flags +	 * worker_pool flags  	 * -	 * A bound gcwq is either associated or disassociated with its CPU. +	 * A bound pool is either associated or disassociated with its CPU.  	 * While associated (!DISASSOCIATED), all workers are bound to the  	 * CPU and none has %WORKER_UNBOUND set and concurrency management  	 * is in effect.  	 *  	 * While DISASSOCIATED, the cpu may be offline and all workers have  	 * %WORKER_UNBOUND set and concurrency management disabled, and may -	 * be executing on any CPU.  The gcwq behaves as an unbound one. +	 * be executing on any CPU.  The pool behaves as an unbound one.  	 *  	 * Note that DISASSOCIATED can be flipped only while holding -	 * assoc_mutex of all pools on the gcwq to avoid changing binding -	 * state while create_worker() is in progress. +	 * assoc_mutex to avoid changing binding state while +	 * create_worker() is in progress.  	 */ -	GCWQ_DISASSOCIATED	= 1 << 0,	/* cpu can't serve workers */ -	GCWQ_FREEZING		= 1 << 1,	/* freeze in progress */ - -	/* pool flags */  	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */  	POOL_MANAGING_WORKERS   = 1 << 1,       /* managing workers */ +	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */ +	POOL_FREEZING		= 1 << 3,	/* freeze in progress */  	/* worker flags */  	WORKER_STARTED		= 1 << 0,	/* started */ @@ -79,11 +78,9 @@ enum {  	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_UNBOUND |  				  WORKER_CPU_INTENSIVE, -	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */ +	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */  	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */ -	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER, -	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,  	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */  	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */ @@ -111,48 +108,24 @@ enum {   * P: Preemption protected.  Disabling preemption is enough and should   *    only be modified and accessed from the local cpu.   * - * L: gcwq->lock protected.  Access with gcwq->lock held. + * L: pool->lock protected.  Access with pool->lock held.   * - * X: During normal operation, modification requires gcwq->lock and - *    should be done only from local cpu.  
Either disabling preemption - *    on local cpu or grabbing gcwq->lock is enough for read access. - *    If GCWQ_DISASSOCIATED is set, it's identical to L. + * X: During normal operation, modification requires pool->lock and should + *    be done only from local cpu.  Either disabling preemption on local + *    cpu or grabbing pool->lock is enough for read access.  If + *    POOL_DISASSOCIATED is set, it's identical to L.   *   * F: wq->flush_mutex protected.   *   * W: workqueue_lock protected.   */ -struct global_cwq; -struct worker_pool; - -/* - * The poor guys doing the actual heavy lifting.  All on-duty workers - * are either serving the manager role, on idle list or on busy hash. - */ -struct worker { -	/* on idle list while idle, on busy hash table while busy */ -	union { -		struct list_head	entry;	/* L: while idle */ -		struct hlist_node	hentry;	/* L: while busy */ -	}; - -	struct work_struct	*current_work;	/* L: work being processed */ -	struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ -	struct list_head	scheduled;	/* L: scheduled works */ -	struct task_struct	*task;		/* I: worker task */ -	struct worker_pool	*pool;		/* I: the associated pool */ -	/* 64 bytes boundary on 64bit, 32 on 32bit */ -	unsigned long		last_active;	/* L: last active timestamp */ -	unsigned int		flags;		/* X: flags */ -	int			id;		/* I: worker id */ - -	/* for rebinding worker to CPU */ -	struct work_struct	rebind_work;	/* L: for busy worker */ -}; +/* struct worker is defined in workqueue_internal.h */  struct worker_pool { -	struct global_cwq	*gcwq;		/* I: the owning gcwq */ +	spinlock_t		lock;		/* the pool lock */ +	unsigned int		cpu;		/* I: the associated cpu */ +	int			id;		/* I: pool ID */  	unsigned int		flags;		/* X: flags */  	struct list_head	worklist;	/* L: list of pending works */ @@ -165,34 +138,28 @@ struct worker_pool {  	struct timer_list	idle_timer;	/* L: worker idle timeout */  	struct timer_list	mayday_timer;	/* L: SOS timer for workers */ -	struct mutex		assoc_mutex;	/* protect GCWQ_DISASSOCIATED */ -	struct ida		worker_ida;	/* L: for worker IDs */ -}; - -/* - * Global per-cpu workqueue.  There's one and only one for each cpu - * and all works are queued and processed here regardless of their - * target workqueues. - */ -struct global_cwq { -	spinlock_t		lock;		/* the gcwq lock */ -	unsigned int		cpu;		/* I: the associated cpu */ -	unsigned int		flags;		/* L: GCWQ_* flags */ - -	/* workers are chained either in busy_hash or pool idle_list */ -	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE]; +	/* workers are chained either in busy_hash or idle_list */ +	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);  						/* L: hash of busy workers */ -	struct worker_pool	pools[NR_WORKER_POOLS]; -						/* normal and highpri pools */ +	struct mutex		assoc_mutex;	/* protect POOL_DISASSOCIATED */ +	struct ida		worker_ida;	/* L: for worker IDs */ + +	/* +	 * The current concurrency level.  As it's likely to be accessed +	 * from other CPUs during try_to_wake_up(), put it in a separate +	 * cacheline. +	 */ +	atomic_t		nr_running ____cacheline_aligned_in_smp;  } ____cacheline_aligned_in_smp;  /* - * The per-CPU workqueue.  The lower WORK_STRUCT_FLAG_BITS of - * work_struct->data are used for flags and thus cwqs need to be - * aligned at two's power of the number of flag bits. + * The per-pool workqueue.  
While queued, the lower WORK_STRUCT_FLAG_BITS + * of work_struct->data are used for flags and the remaining high bits + * point to the pwq; thus, pwqs need to be aligned at two's power of the + * number of flag bits.   */ -struct cpu_workqueue_struct { +struct pool_workqueue {  	struct worker_pool	*pool;		/* I: the associated pool */  	struct workqueue_struct *wq;		/* I: the owning workqueue */  	int			work_color;	/* L: current color */ @@ -241,16 +208,16 @@ typedef unsigned long mayday_mask_t;  struct workqueue_struct {  	unsigned int		flags;		/* W: WQ_* flags */  	union { -		struct cpu_workqueue_struct __percpu	*pcpu; -		struct cpu_workqueue_struct		*single; +		struct pool_workqueue __percpu		*pcpu; +		struct pool_workqueue			*single;  		unsigned long				v; -	} cpu_wq;				/* I: cwq's */ +	} pool_wq;				/* I: pwq's */  	struct list_head	list;		/* W: list of all workqueues */  	struct mutex		flush_mutex;	/* protects wq flushing */  	int			work_color;	/* F: current work color */  	int			flush_color;	/* F: current flush color */ -	atomic_t		nr_cwqs_to_flush; /* flush in progress */ +	atomic_t		nr_pwqs_to_flush; /* flush in progress */  	struct wq_flusher	*first_flusher;	/* F: first flusher */  	struct list_head	flusher_queue;	/* F: flush waiters */  	struct list_head	flusher_overflow; /* F: flush overflow list */ @@ -259,7 +226,7 @@ struct workqueue_struct {  	struct worker		*rescuer;	/* I: rescue worker */  	int			nr_drainers;	/* W: drain in progress */ -	int			saved_max_active; /* W: saved cwq max_active */ +	int			saved_max_active; /* W: saved pwq max_active */  #ifdef CONFIG_LOCKDEP  	struct lockdep_map	lockdep_map;  #endif @@ -280,16 +247,15 @@ EXPORT_SYMBOL_GPL(system_freezable_wq);  #define CREATE_TRACE_POINTS  #include <trace/events/workqueue.h> -#define for_each_worker_pool(pool, gcwq)				\ -	for ((pool) = &(gcwq)->pools[0];				\ -	     (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) +#define for_each_std_worker_pool(pool, cpu)				\ +	for ((pool) = &std_worker_pools(cpu)[0];			\ +	     (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) -#define for_each_busy_worker(worker, i, pos, gcwq)			\ -	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\ -		hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) +#define for_each_busy_worker(worker, i, pos, pool)			\ +	hash_for_each(pool->busy_hash, i, pos, worker, hentry) -static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, -				  unsigned int sw) +static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, +				unsigned int sw)  {  	if (cpu < nr_cpu_ids) {  		if (sw & 1) { @@ -300,42 +266,42 @@ static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,  		if (sw & 2)  			return WORK_CPU_UNBOUND;  	} -	return WORK_CPU_NONE; +	return WORK_CPU_END;  } -static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, -				struct workqueue_struct *wq) +static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, +				 struct workqueue_struct *wq)  { -	return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); +	return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);  }  /*   * CPU iterators   * - * An extra gcwq is defined for an invalid cpu number + * An extra cpu number is defined using an invalid cpu number   * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any - * specific CPU.  The following iterators are similar to - * for_each_*_cpu() iterators but also considers the unbound gcwq. + * specific CPU.  
The following iterators are similar to for_each_*_cpu() + * iterators but also considers the unbound CPU.   * - * for_each_gcwq_cpu()		: possible CPUs + WORK_CPU_UNBOUND - * for_each_online_gcwq_cpu()	: online CPUs + WORK_CPU_UNBOUND - * for_each_cwq_cpu()		: possible CPUs for bound workqueues, + * for_each_wq_cpu()		: possible CPUs + WORK_CPU_UNBOUND + * for_each_online_wq_cpu()	: online CPUs + WORK_CPU_UNBOUND + * for_each_pwq_cpu()		: possible CPUs for bound workqueues,   *				  WORK_CPU_UNBOUND for unbound workqueues   */ -#define for_each_gcwq_cpu(cpu)						\ -	for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3);		\ -	     (cpu) < WORK_CPU_NONE;					\ -	     (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) +#define for_each_wq_cpu(cpu)						\ +	for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3);		\ +	     (cpu) < WORK_CPU_END;					\ +	     (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) -#define for_each_online_gcwq_cpu(cpu)					\ -	for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3);		\ -	     (cpu) < WORK_CPU_NONE;					\ -	     (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) +#define for_each_online_wq_cpu(cpu)					\ +	for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3);		\ +	     (cpu) < WORK_CPU_END;					\ +	     (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) -#define for_each_cwq_cpu(cpu, wq)					\ -	for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq));	\ -	     (cpu) < WORK_CPU_NONE;					\ -	     (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) +#define for_each_pwq_cpu(cpu, wq)					\ +	for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq));	\ +	     (cpu) < WORK_CPU_END;					\ +	     (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq)))  #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -459,57 +425,69 @@ static LIST_HEAD(workqueues);  static bool workqueue_freezing;		/* W: have wqs started freezing? */  /* - * The almighty global cpu workqueues.  nr_running is the only field - * which is expected to be used frequently by other cpus via - * try_to_wake_up().  Put it in a separate cacheline. + * The CPU and unbound standard worker pools.  The unbound ones have + * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.   */ -static DEFINE_PER_CPU(struct global_cwq, global_cwq); -static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], +				     cpu_std_worker_pools); +static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; -/* - * Global cpu workqueue and nr_running counter for unbound gcwq.  The - * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its - * workers have WORKER_UNBOUND set. - */ -static struct global_cwq unbound_global_cwq; -static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { -	[0 ... 
NR_WORKER_POOLS - 1]	= ATOMIC_INIT(0),	/* always 0 */ -}; +/* idr of all pools */ +static DEFINE_MUTEX(worker_pool_idr_mutex); +static DEFINE_IDR(worker_pool_idr);  static int worker_thread(void *__worker); -static int worker_pool_pri(struct worker_pool *pool) +static struct worker_pool *std_worker_pools(int cpu)  { -	return pool - pool->gcwq->pools; +	if (cpu != WORK_CPU_UNBOUND) +		return per_cpu(cpu_std_worker_pools, cpu); +	else +		return unbound_std_worker_pools;  } -static struct global_cwq *get_gcwq(unsigned int cpu) +static int std_worker_pool_pri(struct worker_pool *pool)  { -	if (cpu != WORK_CPU_UNBOUND) -		return &per_cpu(global_cwq, cpu); -	else -		return &unbound_global_cwq; +	return pool - std_worker_pools(pool->cpu);  } -static atomic_t *get_pool_nr_running(struct worker_pool *pool) +/* allocate ID and assign it to @pool */ +static int worker_pool_assign_id(struct worker_pool *pool)  { -	int cpu = pool->gcwq->cpu; -	int idx = worker_pool_pri(pool); +	int ret; -	if (cpu != WORK_CPU_UNBOUND) -		return &per_cpu(pool_nr_running, cpu)[idx]; -	else -		return &unbound_pool_nr_running[idx]; +	mutex_lock(&worker_pool_idr_mutex); +	idr_pre_get(&worker_pool_idr, GFP_KERNEL); +	ret = idr_get_new(&worker_pool_idr, pool, &pool->id); +	mutex_unlock(&worker_pool_idr_mutex); + +	return ret;  } -static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, -					    struct workqueue_struct *wq) +/* + * Lookup worker_pool by id.  The idr currently is built during boot and + * never modified.  Don't worry about locking for now. + */ +static struct worker_pool *worker_pool_by_id(int pool_id) +{ +	return idr_find(&worker_pool_idr, pool_id); +} + +static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) +{ +	struct worker_pool *pools = std_worker_pools(cpu); + +	return &pools[highpri]; +} + +static struct pool_workqueue *get_pwq(unsigned int cpu, +				      struct workqueue_struct *wq)  {  	if (!(wq->flags & WQ_UNBOUND)) {  		if (likely(cpu < nr_cpu_ids)) -			return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); +			return per_cpu_ptr(wq->pool_wq.pcpu, cpu);  	} else if (likely(cpu == WORK_CPU_UNBOUND)) -		return wq->cpu_wq.single; +		return wq->pool_wq.single;  	return NULL;  } @@ -530,19 +508,19 @@ static int work_next_color(int color)  }  /* - * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data - * contain the pointer to the queued cwq.  Once execution starts, the flag - * is cleared and the high bits contain OFFQ flags and CPU number. + * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data + * contain the pointer to the queued pwq.  Once execution starts, the flag + * is cleared and the high bits contain OFFQ flags and pool ID.   * - * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() - * and clear_work_data() can be used to set the cwq, cpu or clear + * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling() + * and clear_work_data() can be used to set the pwq, pool or clear   * work->data.  These functions should only be called while the work is   * owned - ie. while the PENDING bit is set.   * - * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to - * a work.  gcwq is available once the work has been queued anywhere after - * initialization until it is sync canceled.  cwq is available only while - * the work item is queued. + * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq + * corresponding to a work.  
Pool is available once the work has been + * queued anywhere after initialization until it is sync canceled.  pwq is + * available only while the work item is queued.   *   * %WORK_OFFQ_CANCELING is used to mark a work item which is being   * canceled.  While being canceled, a work item may have its PENDING set @@ -556,16 +534,22 @@ static inline void set_work_data(struct work_struct *work, unsigned long data,  	atomic_long_set(&work->data, data | flags | work_static(work));  } -static void set_work_cwq(struct work_struct *work, -			 struct cpu_workqueue_struct *cwq, +static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,  			 unsigned long extra_flags)  { -	set_work_data(work, (unsigned long)cwq, -		      WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); +	set_work_data(work, (unsigned long)pwq, +		      WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);  } -static void set_work_cpu_and_clear_pending(struct work_struct *work, -					   unsigned int cpu) +static void set_work_pool_and_keep_pending(struct work_struct *work, +					   int pool_id) +{ +	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, +		      WORK_STRUCT_PENDING); +} + +static void set_work_pool_and_clear_pending(struct work_struct *work, +					    int pool_id)  {  	/*  	 * The following wmb is paired with the implied mb in @@ -574,67 +558,92 @@ static void set_work_cpu_and_clear_pending(struct work_struct *work,  	 * owner.  	 */  	smp_wmb(); -	set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); +	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);  }  static void clear_work_data(struct work_struct *work)  { -	smp_wmb();	/* see set_work_cpu_and_clear_pending() */ -	set_work_data(work, WORK_STRUCT_NO_CPU, 0); +	smp_wmb();	/* see set_work_pool_and_clear_pending() */ +	set_work_data(work, WORK_STRUCT_NO_POOL, 0);  } -static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) +static struct pool_workqueue *get_work_pwq(struct work_struct *work)  {  	unsigned long data = atomic_long_read(&work->data); -	if (data & WORK_STRUCT_CWQ) +	if (data & WORK_STRUCT_PWQ)  		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);  	else  		return NULL;  } -static struct global_cwq *get_work_gcwq(struct work_struct *work) +/** + * get_work_pool - return the worker_pool a given work was associated with + * @work: the work item of interest + * + * Return the worker_pool @work was last associated with.  %NULL if none. + */ +static struct worker_pool *get_work_pool(struct work_struct *work)  {  	unsigned long data = atomic_long_read(&work->data); -	unsigned int cpu; +	struct worker_pool *pool; +	int pool_id; -	if (data & WORK_STRUCT_CWQ) -		return ((struct cpu_workqueue_struct *) -			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; +	if (data & WORK_STRUCT_PWQ) +		return ((struct pool_workqueue *) +			(data & WORK_STRUCT_WQ_DATA_MASK))->pool; -	cpu = data >> WORK_OFFQ_CPU_SHIFT; -	if (cpu == WORK_CPU_NONE) +	pool_id = data >> WORK_OFFQ_POOL_SHIFT; +	if (pool_id == WORK_OFFQ_POOL_NONE)  		return NULL; -	BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); -	return get_gcwq(cpu); +	pool = worker_pool_by_id(pool_id); +	WARN_ON_ONCE(!pool); +	return pool; +} + +/** + * get_work_pool_id - return the worker pool ID a given work is associated with + * @work: the work item of interest + * + * Return the worker_pool ID @work was last associated with. + * %WORK_OFFQ_POOL_NONE if none. 
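To summarize the work->data encoding described above in one place, here is an illustrative decode; it is a sketch only and restates what get_work_pwq()/get_work_pool() above and get_work_pool_id(), whose body follows, already implement.

/*
 * work->data layout after this patch, roughly:
 *
 *   while queued:    [ pool_workqueue pointer          | WORK_STRUCT_PWQ, PENDING ]
 *   once executing:  [ pool ID << WORK_OFFQ_POOL_SHIFT | OFFQ flags               ]
 */
static void describe_work_data(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_PWQ) {
		/* still queued: the high bits point at the pool_workqueue */
		struct pool_workqueue *pwq =
			(void *)(data & WORK_STRUCT_WQ_DATA_MASK);

		pr_info("queued on pool %d\n", pwq->pool->id);
	} else {
		/* off queue: the high bits carry the last pool ID, if any */
		pr_info("last pool ID: %lu\n", data >> WORK_OFFQ_POOL_SHIFT);
	}
}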
+ */ +static int get_work_pool_id(struct work_struct *work) +{ +	unsigned long data = atomic_long_read(&work->data); + +	if (data & WORK_STRUCT_PWQ) +		return ((struct pool_workqueue *) +			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->id; + +	return data >> WORK_OFFQ_POOL_SHIFT;  }  static void mark_work_canceling(struct work_struct *work)  { -	struct global_cwq *gcwq = get_work_gcwq(work); -	unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE; +	unsigned long pool_id = get_work_pool_id(work); -	set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, -		      WORK_STRUCT_PENDING); +	pool_id <<= WORK_OFFQ_POOL_SHIFT; +	set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);  }  static bool work_is_canceling(struct work_struct *work)  {  	unsigned long data = atomic_long_read(&work->data); -	return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); +	return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);  }  /*   * Policy functions.  These define the policies on how the global worker   * pools are managed.  Unless noted otherwise, these functions assume that - * they're being called with gcwq->lock held. + * they're being called with pool->lock held.   */  static bool __need_more_worker(struct worker_pool *pool)  { -	return !atomic_read(get_pool_nr_running(pool)); +	return !atomic_read(&pool->nr_running);  }  /* @@ -642,7 +651,7 @@ static bool __need_more_worker(struct worker_pool *pool)   * running workers.   *   * Note that, because unbound workers never contribute to nr_running, this - * function will always return %true for unbound gcwq as long as the + * function will always return %true for unbound pools as long as the   * worklist isn't empty.   */  static bool need_more_worker(struct worker_pool *pool) @@ -659,9 +668,8 @@ static bool may_start_working(struct worker_pool *pool)  /* Do I need to keep working?  Called from currently running workers. */  static bool keep_working(struct worker_pool *pool)  { -	atomic_t *nr_running = get_pool_nr_running(pool); - -	return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; +	return !list_empty(&pool->worklist) && +		atomic_read(&pool->nr_running) <= 1;  }  /* Do we need a new worker?  Called from manager. */ @@ -714,7 +722,7 @@ static struct worker *first_worker(struct worker_pool *pool)   * Wake up the first idle worker of @pool.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock).   */  static void wake_up_worker(struct worker_pool *pool)  { @@ -740,8 +748,8 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)  	struct worker *worker = kthread_data(task);  	if (!(worker->flags & WORKER_NOT_RUNNING)) { -		WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); -		atomic_inc(get_pool_nr_running(worker->pool)); +		WARN_ON_ONCE(worker->pool->cpu != cpu); +		atomic_inc(&worker->pool->nr_running);  	}  } @@ -764,12 +772,18 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,  				       unsigned int cpu)  {  	struct worker *worker = kthread_data(task), *to_wakeup = NULL; -	struct worker_pool *pool = worker->pool; -	atomic_t *nr_running = get_pool_nr_running(pool); +	struct worker_pool *pool; +	/* +	 * Rescuers, which may not have all the fields set up like normal +	 * workers, also reach here, let's not access anything before +	 * checking NOT_RUNNING. 
+	 */  	if (worker->flags & WORKER_NOT_RUNNING)  		return NULL; +	pool = worker->pool; +  	/* this can only happen on the local cpu */  	BUG_ON(cpu != raw_smp_processor_id()); @@ -781,10 +795,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,  	 * NOT_RUNNING is clear.  This means that we're bound to and  	 * running on the local cpu w/ rq lock held and preemption  	 * disabled, which in turn means that none else could be -	 * manipulating idle_list, so dereferencing idle_list without gcwq +	 * manipulating idle_list, so dereferencing idle_list without pool  	 * lock is safe.  	 */ -	if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) +	if (atomic_dec_and_test(&pool->nr_running) && +	    !list_empty(&pool->worklist))  		to_wakeup = first_worker(pool);  	return to_wakeup ? to_wakeup->task : NULL;  } @@ -800,7 +815,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,   * woken up.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock) + * spin_lock_irq(pool->lock)   */  static inline void worker_set_flags(struct worker *worker, unsigned int flags,  				    bool wakeup) @@ -816,14 +831,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,  	 */  	if ((flags & WORKER_NOT_RUNNING) &&  	    !(worker->flags & WORKER_NOT_RUNNING)) { -		atomic_t *nr_running = get_pool_nr_running(pool); -  		if (wakeup) { -			if (atomic_dec_and_test(nr_running) && +			if (atomic_dec_and_test(&pool->nr_running) &&  			    !list_empty(&pool->worklist))  				wake_up_worker(pool);  		} else -			atomic_dec(nr_running); +			atomic_dec(&pool->nr_running);  	}  	worker->flags |= flags; @@ -837,7 +850,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,   * Clear @flags in @worker->flags and adjust nr_running accordingly.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock) + * spin_lock_irq(pool->lock)   */  static inline void worker_clr_flags(struct worker *worker, unsigned int flags)  { @@ -855,87 +868,56 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)  	 */  	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))  		if (!(worker->flags & WORKER_NOT_RUNNING)) -			atomic_inc(get_pool_nr_running(pool)); +			atomic_inc(&pool->nr_running);  }  /** - * busy_worker_head - return the busy hash head for a work - * @gcwq: gcwq of interest - * @work: work to be hashed + * find_worker_executing_work - find worker which is executing a work + * @pool: pool of interest + * @work: work to find worker for   * - * Return hash head of @gcwq for @work. + * Find a worker which is executing @work on @pool by searching + * @pool->busy_hash which is keyed by the address of @work.  For a worker + * to match, its current execution should match the address of @work and + * its work function.  This is to avoid unwanted dependency between + * unrelated work executions through a work item being recycled while still + * being executed.   * - * CONTEXT: - * spin_lock_irq(gcwq->lock). + * This is a bit tricky.  A work item may be freed once its execution + * starts and nothing prevents the freed area from being recycled for + * another work item.  If the same work item address ends up being reused + * before the original execution finishes, workqueue will identify the + * recycled work item as currently executing and make it wait until the + * current execution finishes, introducing an unwanted dependency.   * - * RETURNS: - * Pointer to the hash head. 
- */ -static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, -					   struct work_struct *work) -{ -	const int base_shift = ilog2(sizeof(struct work_struct)); -	unsigned long v = (unsigned long)work; - -	/* simple shift and fold hash, do we need something better? */ -	v >>= base_shift; -	v += v >> BUSY_WORKER_HASH_ORDER; -	v &= BUSY_WORKER_HASH_MASK; - -	return &gcwq->busy_hash[v]; -} - -/** - * __find_worker_executing_work - find worker which is executing a work - * @gcwq: gcwq of interest - * @bwh: hash head as returned by busy_worker_head() - * @work: work to find worker for - * - * Find a worker which is executing @work on @gcwq.  @bwh should be - * the hash head obtained by calling busy_worker_head() with the same - * work. + * This function checks the work item address, work function and workqueue + * to avoid false positives.  Note that this isn't complete as one may + * construct a work function which can introduce dependency onto itself + * through a recycled work item.  Well, if somebody wants to shoot oneself + * in the foot that badly, there's only so much we can do, and if such + * deadlock actually occurs, it should be easy to locate the culprit work + * function.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock).   *   * RETURNS:   * Pointer to worker which is executing @work if found, NULL   * otherwise.   */ -static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, -						   struct hlist_head *bwh, -						   struct work_struct *work) +static struct worker *find_worker_executing_work(struct worker_pool *pool, +						 struct work_struct *work)  {  	struct worker *worker;  	struct hlist_node *tmp; -	hlist_for_each_entry(worker, tmp, bwh, hentry) -		if (worker->current_work == work) +	hash_for_each_possible(pool->busy_hash, worker, tmp, hentry, +			       (unsigned long)work) +		if (worker->current_work == work && +		    worker->current_func == work->func)  			return worker; -	return NULL; -} -/** - * find_worker_executing_work - find worker which is executing a work - * @gcwq: gcwq of interest - * @work: work to find worker for - * - * Find a worker which is executing @work on @gcwq.  This function is - * identical to __find_worker_executing_work() except that this - * function calculates @bwh itself. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to worker which is executing @work if found, NULL - * otherwise. - */ -static struct worker *find_worker_executing_work(struct global_cwq *gcwq, -						 struct work_struct *work) -{ -	return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), -					    work); +	return NULL;  }  /** @@ -953,7 +935,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,   * nested inside outer list_for_each_entry_safe().   *   * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock).   
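find_worker_executing_work() above now keys pool->busy_hash by the work item's address and additionally compares the worker's current_func, so a freed-and-recycled work item with a different handler is not mistaken for the one still running. A simplified, lock-free model of that matching rule (a linear scan stands in for the hash lookup; all names are invented for the sketch):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef void (*work_func_t)(void *);

struct busy_entry {
	const void  *current_work;  /* address of the work item being run */
	work_func_t  current_func;  /* its handler at the time it was dequeued */
};

static const struct busy_entry *
model_find_executing(const struct busy_entry *busy, size_t n,
		     const void *work, work_func_t func)
{
	for (size_t i = 0; i < n; i++)
		if (busy[i].current_work == work && busy[i].current_func == func)
			return &busy[i];
	return NULL;
}

static void handler_a(void *arg) { (void)arg; }
static void handler_b(void *arg) { (void)arg; }

int main(void)
{
	int work;                        /* stand-in for a struct work_struct */
	struct busy_entry busy[] = { { &work, handler_a } };

	/* same address, same function: treated as already executing */
	printf("match: %d\n", model_find_executing(busy, 1, &work, handler_a) != NULL);
	/* same (recycled) address, different function: no false positive */
	printf("recycled: %d\n", model_find_executing(busy, 1, &work, handler_b) != NULL);
	return 0;
}
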
*/  static void move_linked_works(struct work_struct *work, struct list_head *head,  			      struct work_struct **nextp) @@ -979,67 +961,67 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,  		*nextp = n;  } -static void cwq_activate_delayed_work(struct work_struct *work) +static void pwq_activate_delayed_work(struct work_struct *work)  { -	struct cpu_workqueue_struct *cwq = get_work_cwq(work); +	struct pool_workqueue *pwq = get_work_pwq(work);  	trace_workqueue_activate_work(work); -	move_linked_works(work, &cwq->pool->worklist, NULL); +	move_linked_works(work, &pwq->pool->worklist, NULL);  	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); -	cwq->nr_active++; +	pwq->nr_active++;  } -static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +static void pwq_activate_first_delayed(struct pool_workqueue *pwq)  { -	struct work_struct *work = list_first_entry(&cwq->delayed_works, +	struct work_struct *work = list_first_entry(&pwq->delayed_works,  						    struct work_struct, entry); -	cwq_activate_delayed_work(work); +	pwq_activate_delayed_work(work);  }  /** - * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight - * @cwq: cwq of interest + * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight + * @pwq: pwq of interest   * @color: color of work which left the queue   *   * A work either has completed or is removed from pending queue, - * decrement nr_in_flight of its cwq and handle workqueue flushing. + * decrement nr_in_flight of its pwq and handle workqueue flushing.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock).   */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)  {  	/* ignore uncolored works */  	if (color == WORK_NO_COLOR)  		return; -	cwq->nr_in_flight[color]--; +	pwq->nr_in_flight[color]--; -	cwq->nr_active--; -	if (!list_empty(&cwq->delayed_works)) { +	pwq->nr_active--; +	if (!list_empty(&pwq->delayed_works)) {  		/* one down, submit a delayed one */ -		if (cwq->nr_active < cwq->max_active) -			cwq_activate_first_delayed(cwq); +		if (pwq->nr_active < pwq->max_active) +			pwq_activate_first_delayed(pwq);  	}  	/* is flush in progress and are we at the flushing tip? */ -	if (likely(cwq->flush_color != color)) +	if (likely(pwq->flush_color != color))  		return;  	/* are there still in-flight works? */ -	if (cwq->nr_in_flight[color]) +	if (pwq->nr_in_flight[color])  		return; -	/* this cwq is done, clear flush_color */ -	cwq->flush_color = -1; +	/* this pwq is done, clear flush_color */ +	pwq->flush_color = -1;  	/* -	 * If this was the last cwq, wake up the first flusher.  It +	 * If this was the last pwq, wake up the first flusher.  It  	 * will handle the rest.  	 */ -	if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) -		complete(&cwq->wq->first_flusher->done); +	if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) +		complete(&pwq->wq->first_flusher->done);  }  /** @@ -1070,7 +1052,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)  static int try_to_grab_pending(struct work_struct *work, bool is_dwork,  			       unsigned long *flags)  { -	struct global_cwq *gcwq; +	struct worker_pool *pool; +	struct pool_workqueue *pwq;  	local_irq_save(*flags); @@ -1095,41 +1078,43 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,  	 * The queueing is in progress, or it is already queued. 
Try to  	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.  	 */ -	gcwq = get_work_gcwq(work); -	if (!gcwq) +	pool = get_work_pool(work); +	if (!pool)  		goto fail; -	spin_lock(&gcwq->lock); -	if (!list_empty(&work->entry)) { +	spin_lock(&pool->lock); +	/* +	 * work->data is guaranteed to point to pwq only while the work +	 * item is queued on pwq->wq, and both updating work->data to point +	 * to pwq on queueing and to pool on dequeueing are done under +	 * pwq->pool->lock.  This in turn guarantees that, if work->data +	 * points to pwq which is associated with a locked pool, the work +	 * item is currently queued on that pool. +	 */ +	pwq = get_work_pwq(work); +	if (pwq && pwq->pool == pool) { +		debug_work_deactivate(work); +  		/* -		 * This work is queued, but perhaps we locked the wrong gcwq. -		 * In that case we must see the new value after rmb(), see -		 * insert_work()->wmb(). +		 * A delayed work item cannot be grabbed directly because +		 * it might have linked NO_COLOR work items which, if left +		 * on the delayed_list, will confuse pwq->nr_active +		 * management later on and cause stall.  Make sure the work +		 * item is activated before grabbing.  		 */ -		smp_rmb(); -		if (gcwq == get_work_gcwq(work)) { -			debug_work_deactivate(work); +		if (*work_data_bits(work) & WORK_STRUCT_DELAYED) +			pwq_activate_delayed_work(work); -			/* -			 * A delayed work item cannot be grabbed directly -			 * because it might have linked NO_COLOR work items -			 * which, if left on the delayed_list, will confuse -			 * cwq->nr_active management later on and cause -			 * stall.  Make sure the work item is activated -			 * before grabbing. -			 */ -			if (*work_data_bits(work) & WORK_STRUCT_DELAYED) -				cwq_activate_delayed_work(work); +		list_del_init(&work->entry); +		pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); -			list_del_init(&work->entry); -			cwq_dec_nr_in_flight(get_work_cwq(work), -				get_work_color(work)); +		/* work->data points to pwq iff queued, point to pool */ +		set_work_pool_and_keep_pending(work, pool->id); -			spin_unlock(&gcwq->lock); -			return 1; -		} +		spin_unlock(&pool->lock); +		return 1;  	} -	spin_unlock(&gcwq->lock); +	spin_unlock(&pool->lock);  fail:  	local_irq_restore(*flags);  	if (work_is_canceling(work)) @@ -1139,33 +1124,25 @@ fail:  }  /** - * insert_work - insert a work into gcwq - * @cwq: cwq @work belongs to + * insert_work - insert a work into a pool + * @pwq: pwq @work belongs to   * @work: work to insert   * @head: insertion point   * @extra_flags: extra WORK_STRUCT_* flags to set   * - * Insert @work which belongs to @cwq into @gcwq after @head. - * @extra_flags is or'd to work_struct flags. + * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to + * work_struct flags.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock).   */ -static void insert_work(struct cpu_workqueue_struct *cwq, -			struct work_struct *work, struct list_head *head, -			unsigned int extra_flags) +static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, +			struct list_head *head, unsigned int extra_flags)  { -	struct worker_pool *pool = cwq->pool; +	struct worker_pool *pool = pwq->pool;  	/* we own @work, set data and link */ -	set_work_cwq(work, cwq, extra_flags); - -	/* -	 * Ensure that we get the right work->data if we see the -	 * result of list_add() below, see try_to_grab_pending(). 
-	 */ -	smp_wmb(); - +	set_work_pwq(work, pwq, extra_flags);  	list_add_tail(&work->entry, head);  	/* @@ -1181,41 +1158,24 @@ static void insert_work(struct cpu_workqueue_struct *cwq,  /*   * Test whether @work is being queued from another work executing on the - * same workqueue.  This is rather expensive and should only be used from - * cold paths. + * same workqueue.   */  static bool is_chained_work(struct workqueue_struct *wq)  { -	unsigned long flags; -	unsigned int cpu; - -	for_each_gcwq_cpu(cpu) { -		struct global_cwq *gcwq = get_gcwq(cpu); -		struct worker *worker; -		struct hlist_node *pos; -		int i; +	struct worker *worker; -		spin_lock_irqsave(&gcwq->lock, flags); -		for_each_busy_worker(worker, i, pos, gcwq) { -			if (worker->task != current) -				continue; -			spin_unlock_irqrestore(&gcwq->lock, flags); -			/* -			 * I'm @worker, no locking necessary.  See if @work -			 * is headed to the same workqueue. -			 */ -			return worker->current_cwq->wq == wq; -		} -		spin_unlock_irqrestore(&gcwq->lock, flags); -	} -	return false; +	worker = current_wq_worker(); +	/* +	 * Return %true iff I'm a worker execuing a work item on @wq.  If +	 * I'm @worker, it's safe to dereference it without locking. +	 */ +	return worker && worker->current_pwq->wq == wq;  }  static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,  			 struct work_struct *work)  { -	struct global_cwq *gcwq; -	struct cpu_workqueue_struct *cwq; +	struct pool_workqueue *pwq;  	struct list_head *worklist;  	unsigned int work_flags;  	unsigned int req_cpu = cpu; @@ -1235,9 +1195,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,  	    WARN_ON_ONCE(!is_chained_work(wq)))  		return; -	/* determine gcwq to use */ +	/* determine the pwq to use */  	if (!(wq->flags & WQ_UNBOUND)) { -		struct global_cwq *last_gcwq; +		struct worker_pool *last_pool;  		if (cpu == WORK_CPU_UNBOUND)  			cpu = raw_smp_processor_id(); @@ -1248,55 +1208,54 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,  		 * work needs to be queued on that cpu to guarantee  		 * non-reentrancy.  		 */ -		gcwq = get_gcwq(cpu); -		last_gcwq = get_work_gcwq(work); +		pwq = get_pwq(cpu, wq); +		last_pool = get_work_pool(work); -		if (last_gcwq && last_gcwq != gcwq) { +		if (last_pool && last_pool != pwq->pool) {  			struct worker *worker; -			spin_lock(&last_gcwq->lock); +			spin_lock(&last_pool->lock); -			worker = find_worker_executing_work(last_gcwq, work); +			worker = find_worker_executing_work(last_pool, work); -			if (worker && worker->current_cwq->wq == wq) -				gcwq = last_gcwq; -			else { +			if (worker && worker->current_pwq->wq == wq) { +				pwq = get_pwq(last_pool->cpu, wq); +			} else {  				/* meh... 
not running there, queue here */ -				spin_unlock(&last_gcwq->lock); -				spin_lock(&gcwq->lock); +				spin_unlock(&last_pool->lock); +				spin_lock(&pwq->pool->lock);  			}  		} else { -			spin_lock(&gcwq->lock); +			spin_lock(&pwq->pool->lock);  		}  	} else { -		gcwq = get_gcwq(WORK_CPU_UNBOUND); -		spin_lock(&gcwq->lock); +		pwq = get_pwq(WORK_CPU_UNBOUND, wq); +		spin_lock(&pwq->pool->lock);  	} -	/* gcwq determined, get cwq and queue */ -	cwq = get_cwq(gcwq->cpu, wq); -	trace_workqueue_queue_work(req_cpu, cwq, work); +	/* pwq determined, queue */ +	trace_workqueue_queue_work(req_cpu, pwq, work);  	if (WARN_ON(!list_empty(&work->entry))) { -		spin_unlock(&gcwq->lock); +		spin_unlock(&pwq->pool->lock);  		return;  	} -	cwq->nr_in_flight[cwq->work_color]++; -	work_flags = work_color_to_flags(cwq->work_color); +	pwq->nr_in_flight[pwq->work_color]++; +	work_flags = work_color_to_flags(pwq->work_color); -	if (likely(cwq->nr_active < cwq->max_active)) { +	if (likely(pwq->nr_active < pwq->max_active)) {  		trace_workqueue_activate_work(work); -		cwq->nr_active++; -		worklist = &cwq->pool->worklist; +		pwq->nr_active++; +		worklist = &pwq->pool->worklist;  	} else {  		work_flags |= WORK_STRUCT_DELAYED; -		worklist = &cwq->delayed_works; +		worklist = &pwq->delayed_works;  	} -	insert_work(cwq, work, worklist, work_flags); +	insert_work(pwq, work, worklist, work_flags); -	spin_unlock(&gcwq->lock); +	spin_unlock(&pwq->pool->lock);  }  /** @@ -1347,19 +1306,17 @@ EXPORT_SYMBOL_GPL(queue_work);  void delayed_work_timer_fn(unsigned long __data)  {  	struct delayed_work *dwork = (struct delayed_work *)__data; -	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);  	/* should have been called from irqsafe timer with irq already off */ -	__queue_work(dwork->cpu, cwq->wq, &dwork->work); +	__queue_work(dwork->cpu, dwork->wq, &dwork->work);  } -EXPORT_SYMBOL_GPL(delayed_work_timer_fn); +EXPORT_SYMBOL(delayed_work_timer_fn);  static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,  				struct delayed_work *dwork, unsigned long delay)  {  	struct timer_list *timer = &dwork->timer;  	struct work_struct *work = &dwork->work; -	unsigned int lcpu;  	WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||  		     timer->data != (unsigned long)dwork); @@ -1379,30 +1336,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,  	timer_stats_timer_set_start_info(&dwork->timer); -	/* -	 * This stores cwq for the moment, for the timer_fn.  Note that the -	 * work's gcwq is preserved to allow reentrance detection for -	 * delayed works. -	 */ -	if (!(wq->flags & WQ_UNBOUND)) { -		struct global_cwq *gcwq = get_work_gcwq(work); - -		/* -		 * If we cannot get the last gcwq from @work directly, -		 * select the last CPU such that it avoids unnecessarily -		 * triggering non-reentrancy check in __queue_work(). -		 */ -		lcpu = cpu; -		if (gcwq) -			lcpu = gcwq->cpu; -		if (lcpu == WORK_CPU_UNBOUND) -			lcpu = raw_smp_processor_id(); -	} else { -		lcpu = WORK_CPU_UNBOUND; -	} - -	set_work_cwq(work, get_cwq(lcpu, wq), 0); - +	dwork->wq = wq;  	dwork->cpu = cpu;  	timer->expires = jiffies + delay; @@ -1519,12 +1453,11 @@ EXPORT_SYMBOL_GPL(mod_delayed_work);   * necessary.   *   * LOCKING: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock).   
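With the hunks above, a delayed_work now records its target workqueue in dwork->wq and the CPU in dwork->cpu, and delayed_work_timer_fn() requeues through those directly instead of deriving them from a stored cwq. A hedged usage sketch of the unchanged caller-facing API (a throwaway module, function and symbol names invented; not part of this patch):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void example_fn(struct work_struct *work)
{
	pr_info("delayed work ran\n");
}

static DECLARE_DELAYED_WORK(example_dwork, example_fn);

static int __init example_init(void)
{
	/* roughly 100ms from now, on the default workqueue */
	queue_delayed_work(system_wq, &example_dwork, msecs_to_jiffies(100));
	return 0;
}

static void __exit example_exit(void)
{
	cancel_delayed_work_sync(&example_dwork);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
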
*/  static void worker_enter_idle(struct worker *worker)  {  	struct worker_pool *pool = worker->pool; -	struct global_cwq *gcwq = pool->gcwq;  	BUG_ON(worker->flags & WORKER_IDLE);  	BUG_ON(!list_empty(&worker->entry) && @@ -1542,14 +1475,14 @@ static void worker_enter_idle(struct worker *worker)  		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);  	/* -	 * Sanity check nr_running.  Because gcwq_unbind_fn() releases -	 * gcwq->lock between setting %WORKER_UNBOUND and zapping +	 * Sanity check nr_running.  Because wq_unbind_fn() releases +	 * pool->lock between setting %WORKER_UNBOUND and zapping  	 * nr_running, the warning may trigger spuriously.  Check iff  	 * unbind is not in progress.  	 */ -	WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && +	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&  		     pool->nr_workers == pool->nr_idle && -		     atomic_read(get_pool_nr_running(pool))); +		     atomic_read(&pool->nr_running));  }  /** @@ -1559,7 +1492,7 @@ static void worker_enter_idle(struct worker *worker)   * @worker is leaving idle state.  Update stats.   *   * LOCKING: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock).   */  static void worker_leave_idle(struct worker *worker)  { @@ -1572,7 +1505,7 @@ static void worker_leave_idle(struct worker *worker)  }  /** - * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq + * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool   * @worker: self   *   * Works which are scheduled while the cpu is online must at least be @@ -1584,27 +1517,27 @@ static void worker_leave_idle(struct worker *worker)   * themselves to the target cpu and may race with cpu going down or   * coming online.  kthread_bind() can't be used because it may put the   * worker to already dead cpu and set_cpus_allowed_ptr() can't be used - * verbatim as it's best effort and blocking and gcwq may be + * verbatim as it's best effort and blocking and pool may be   * [dis]associated in the meantime.   * - * This function tries set_cpus_allowed() and locks gcwq and verifies the - * binding against %GCWQ_DISASSOCIATED which is set during + * This function tries set_cpus_allowed() and locks pool and verifies the + * binding against %POOL_DISASSOCIATED which is set during   * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker   * enters idle state or fetches works without dropping lock, it can   * guarantee the scheduling requirement described in the first paragraph.   *   * CONTEXT: - * Might sleep.  Called without any lock but returns with gcwq->lock + * Might sleep.  Called without any lock but returns with pool->lock   * held.   *   * RETURNS: - * %true if the associated gcwq is online (@worker is successfully + * %true if the associated pool is online (@worker is successfully   * bound), %false if offline.   */  static bool worker_maybe_bind_and_lock(struct worker *worker) -__acquires(&gcwq->lock) +__acquires(&pool->lock)  { -	struct global_cwq *gcwq = worker->pool->gcwq; +	struct worker_pool *pool = worker->pool;  	struct task_struct *task = worker->task;  	while (true) { @@ -1612,19 +1545,19 @@ __acquires(&gcwq->lock)  		 * The following call may fail, succeed or succeed  		 * without actually migrating the task to the cpu if  		 * it races with cpu hotunplug operation.  Verify -		 * against GCWQ_DISASSOCIATED. +		 * against POOL_DISASSOCIATED.  		 
*/ -		if (!(gcwq->flags & GCWQ_DISASSOCIATED)) -			set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); +		if (!(pool->flags & POOL_DISASSOCIATED)) +			set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); -		spin_lock_irq(&gcwq->lock); -		if (gcwq->flags & GCWQ_DISASSOCIATED) +		spin_lock_irq(&pool->lock); +		if (pool->flags & POOL_DISASSOCIATED)  			return false; -		if (task_cpu(task) == gcwq->cpu && +		if (task_cpu(task) == pool->cpu &&  		    cpumask_equal(¤t->cpus_allowed, -				  get_cpu_mask(gcwq->cpu))) +				  get_cpu_mask(pool->cpu)))  			return true; -		spin_unlock_irq(&gcwq->lock); +		spin_unlock_irq(&pool->lock);  		/*  		 * We've raced with CPU hot[un]plug.  Give it a breather @@ -1643,15 +1576,13 @@ __acquires(&gcwq->lock)   */  static void idle_worker_rebind(struct worker *worker)  { -	struct global_cwq *gcwq = worker->pool->gcwq; -  	/* CPU may go down again inbetween, clear UNBOUND only on success */  	if (worker_maybe_bind_and_lock(worker))  		worker_clr_flags(worker, WORKER_UNBOUND);  	/* rebind complete, become available again */  	list_add(&worker->entry, &worker->pool->idle_list); -	spin_unlock_irq(&gcwq->lock); +	spin_unlock_irq(&worker->pool->lock);  }  /* @@ -1663,19 +1594,18 @@ static void idle_worker_rebind(struct worker *worker)  static void busy_worker_rebind_fn(struct work_struct *work)  {  	struct worker *worker = container_of(work, struct worker, rebind_work); -	struct global_cwq *gcwq = worker->pool->gcwq;  	if (worker_maybe_bind_and_lock(worker))  		worker_clr_flags(worker, WORKER_UNBOUND); -	spin_unlock_irq(&gcwq->lock); +	spin_unlock_irq(&worker->pool->lock);  }  /** - * rebind_workers - rebind all workers of a gcwq to the associated CPU - * @gcwq: gcwq of interest + * rebind_workers - rebind all workers of a pool to the associated CPU + * @pool: pool of interest   * - * @gcwq->cpu is coming online.  Rebind all workers to the CPU.  Rebinding + * @pool->cpu is coming online.  Rebind all workers to the CPU.  Rebinding   * is different for idle and busy ones.   *   * Idle ones will be removed from the idle_list and woken up.  They will @@ -1693,38 +1623,32 @@ static void busy_worker_rebind_fn(struct work_struct *work)   * including the manager will not appear on @idle_list until rebind is   * complete, making local wake-ups safe.   */ -static void rebind_workers(struct global_cwq *gcwq) +static void rebind_workers(struct worker_pool *pool)  { -	struct worker_pool *pool;  	struct worker *worker, *n;  	struct hlist_node *pos;  	int i; -	lockdep_assert_held(&gcwq->lock); - -	for_each_worker_pool(pool, gcwq) -		lockdep_assert_held(&pool->assoc_mutex); +	lockdep_assert_held(&pool->assoc_mutex); +	lockdep_assert_held(&pool->lock);  	/* dequeue and kick idle ones */ -	for_each_worker_pool(pool, gcwq) { -		list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { -			/* -			 * idle workers should be off @pool->idle_list -			 * until rebind is complete to avoid receiving -			 * premature local wake-ups. -			 */ -			list_del_init(&worker->entry); +	list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { +		/* +		 * idle workers should be off @pool->idle_list until rebind +		 * is complete to avoid receiving premature local wake-ups. +		 */ +		list_del_init(&worker->entry); -			/* -			 * worker_thread() will see the above dequeuing -			 * and call idle_worker_rebind(). -			 */ -			wake_up_process(worker->task); -		} +		/* +		 * worker_thread() will see the above dequeuing and call +		 * idle_worker_rebind(). 
+		 */ +		wake_up_process(worker->task);  	}  	/* rebind busy workers */ -	for_each_busy_worker(worker, i, pos, gcwq) { +	for_each_busy_worker(worker, i, pos, pool) {  		struct work_struct *rebind_work = &worker->rebind_work;  		struct workqueue_struct *wq; @@ -1736,16 +1660,16 @@ static void rebind_workers(struct global_cwq *gcwq)  		/*  		 * wq doesn't really matter but let's keep @worker->pool -		 * and @cwq->pool consistent for sanity. +		 * and @pwq->pool consistent for sanity.  		 */ -		if (worker_pool_pri(worker->pool)) +		if (std_worker_pool_pri(worker->pool))  			wq = system_highpri_wq;  		else  			wq = system_wq; -		insert_work(get_cwq(gcwq->cpu, wq), rebind_work, -			worker->scheduled.next, -			work_color_to_flags(WORK_NO_COLOR)); +		insert_work(get_pwq(pool->cpu, wq), rebind_work, +			    worker->scheduled.next, +			    work_color_to_flags(WORK_NO_COLOR));  	}  } @@ -1780,19 +1704,18 @@ static struct worker *alloc_worker(void)   */  static struct worker *create_worker(struct worker_pool *pool)  { -	struct global_cwq *gcwq = pool->gcwq; -	const char *pri = worker_pool_pri(pool) ? "H" : ""; +	const char *pri = std_worker_pool_pri(pool) ? "H" : "";  	struct worker *worker = NULL;  	int id = -1; -	spin_lock_irq(&gcwq->lock); +	spin_lock_irq(&pool->lock);  	while (ida_get_new(&pool->worker_ida, &id)) { -		spin_unlock_irq(&gcwq->lock); +		spin_unlock_irq(&pool->lock);  		if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))  			goto fail; -		spin_lock_irq(&gcwq->lock); +		spin_lock_irq(&pool->lock);  	} -	spin_unlock_irq(&gcwq->lock); +	spin_unlock_irq(&pool->lock);  	worker = alloc_worker();  	if (!worker) @@ -1801,30 +1724,30 @@ static struct worker *create_worker(struct worker_pool *pool)  	worker->pool = pool;  	worker->id = id; -	if (gcwq->cpu != WORK_CPU_UNBOUND) +	if (pool->cpu != WORK_CPU_UNBOUND)  		worker->task = kthread_create_on_node(worker_thread, -					worker, cpu_to_node(gcwq->cpu), -					"kworker/%u:%d%s", gcwq->cpu, id, pri); +					worker, cpu_to_node(pool->cpu), +					"kworker/%u:%d%s", pool->cpu, id, pri);  	else  		worker->task = kthread_create(worker_thread, worker,  					      "kworker/u:%d%s", id, pri);  	if (IS_ERR(worker->task))  		goto fail; -	if (worker_pool_pri(pool)) +	if (std_worker_pool_pri(pool))  		set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);  	/*  	 * Determine CPU binding of the new worker depending on -	 * %GCWQ_DISASSOCIATED.  The caller is responsible for ensuring the +	 * %POOL_DISASSOCIATED.  The caller is responsible for ensuring the  	 * flag remains stable across this function.  See the comments  	 * above the flag definition for details.  	 *  	 * As an unbound worker may later become a regular one if CPU comes  	 * online, make sure every worker has %PF_THREAD_BOUND set.  	 */ -	if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { -		kthread_bind(worker->task, gcwq->cpu); +	if (!(pool->flags & POOL_DISASSOCIATED)) { +		kthread_bind(worker->task, pool->cpu);  	} else {  		worker->task->flags |= PF_THREAD_BOUND;  		worker->flags |= WORKER_UNBOUND; @@ -1833,9 +1756,9 @@ static struct worker *create_worker(struct worker_pool *pool)  	return worker;  fail:  	if (id >= 0) { -		spin_lock_irq(&gcwq->lock); +		spin_lock_irq(&pool->lock);  		ida_remove(&pool->worker_ida, id); -		spin_unlock_irq(&gcwq->lock); +		spin_unlock_irq(&pool->lock);  	}  	kfree(worker);  	return NULL; @@ -1845,10 +1768,10 @@ fail:   * start_worker - start a newly created worker   * @worker: worker to start   * - * Make the gcwq aware of @worker and start it. 
+ * Make the pool aware of @worker and start it.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock).   */  static void start_worker(struct worker *worker)  { @@ -1862,15 +1785,14 @@ static void start_worker(struct worker *worker)   * destroy_worker - destroy a workqueue worker   * @worker: worker to be destroyed   * - * Destroy @worker and adjust @gcwq stats accordingly. + * Destroy @worker and adjust @pool stats accordingly.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock) which is released and regrabbed.   */  static void destroy_worker(struct worker *worker)  {  	struct worker_pool *pool = worker->pool; -	struct global_cwq *gcwq = pool->gcwq;  	int id = worker->id;  	/* sanity check frenzy */ @@ -1885,21 +1807,20 @@ static void destroy_worker(struct worker *worker)  	list_del_init(&worker->entry);  	worker->flags |= WORKER_DIE; -	spin_unlock_irq(&gcwq->lock); +	spin_unlock_irq(&pool->lock);  	kthread_stop(worker->task);  	kfree(worker); -	spin_lock_irq(&gcwq->lock); +	spin_lock_irq(&pool->lock);  	ida_remove(&pool->worker_ida, id);  }  static void idle_worker_timeout(unsigned long __pool)  {  	struct worker_pool *pool = (void *)__pool; -	struct global_cwq *gcwq = pool->gcwq; -	spin_lock_irq(&gcwq->lock); +	spin_lock_irq(&pool->lock);  	if (too_many_workers(pool)) {  		struct worker *worker; @@ -1918,20 +1839,20 @@ static void idle_worker_timeout(unsigned long __pool)  		}  	} -	spin_unlock_irq(&gcwq->lock); +	spin_unlock_irq(&pool->lock);  }  static bool send_mayday(struct work_struct *work)  { -	struct cpu_workqueue_struct *cwq = get_work_cwq(work); -	struct workqueue_struct *wq = cwq->wq; +	struct pool_workqueue *pwq = get_work_pwq(work); +	struct workqueue_struct *wq = pwq->wq;  	unsigned int cpu;  	if (!(wq->flags & WQ_RESCUER))  		return false;  	/* mayday mayday mayday */ -	cpu = cwq->pool->gcwq->cpu; +	cpu = pwq->pool->cpu;  	/* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */  	if (cpu == WORK_CPU_UNBOUND)  		cpu = 0; @@ -1940,13 +1861,12 @@ static bool send_mayday(struct work_struct *work)  	return true;  } -static void gcwq_mayday_timeout(unsigned long __pool) +static void pool_mayday_timeout(unsigned long __pool)  {  	struct worker_pool *pool = (void *)__pool; -	struct global_cwq *gcwq = pool->gcwq;  	struct work_struct *work; -	spin_lock_irq(&gcwq->lock); +	spin_lock_irq(&pool->lock);  	if (need_to_create_worker(pool)) {  		/* @@ -1959,7 +1879,7 @@ static void gcwq_mayday_timeout(unsigned long __pool)  			send_mayday(work);  	} -	spin_unlock_irq(&gcwq->lock); +	spin_unlock_irq(&pool->lock);  	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);  } @@ -1978,24 +1898,22 @@ static void gcwq_mayday_timeout(unsigned long __pool)   * may_start_working() true.   *   * LOCKING: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed   * multiple times.  Does GFP_KERNEL allocations.  Called only from   * manager.   *   * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true + * false if no action was taken and pool->lock stayed locked, true   * otherwise.   
*/  static bool maybe_create_worker(struct worker_pool *pool) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) +__releases(&pool->lock) +__acquires(&pool->lock)  { -	struct global_cwq *gcwq = pool->gcwq; -  	if (!need_to_create_worker(pool))  		return false;  restart: -	spin_unlock_irq(&gcwq->lock); +	spin_unlock_irq(&pool->lock);  	/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */  	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); @@ -2006,7 +1924,7 @@ restart:  		worker = create_worker(pool);  		if (worker) {  			del_timer_sync(&pool->mayday_timer); -			spin_lock_irq(&gcwq->lock); +			spin_lock_irq(&pool->lock);  			start_worker(worker);  			BUG_ON(need_to_create_worker(pool));  			return true; @@ -2023,7 +1941,7 @@ restart:  	}  	del_timer_sync(&pool->mayday_timer); -	spin_lock_irq(&gcwq->lock); +	spin_lock_irq(&pool->lock);  	if (need_to_create_worker(pool))  		goto restart;  	return true; @@ -2037,11 +1955,11 @@ restart:   * IDLE_WORKER_TIMEOUT.   *   * LOCKING: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed   * multiple times.  Called only from manager.   *   * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true + * false if no action was taken and pool->lock stayed locked, true   * otherwise.   */  static bool maybe_destroy_workers(struct worker_pool *pool) @@ -2071,21 +1989,21 @@ static bool maybe_destroy_workers(struct worker_pool *pool)   * manage_workers - manage worker pool   * @worker: self   * - * Assume the manager role and manage gcwq worker pool @worker belongs + * Assume the manager role and manage the worker pool @worker belongs   * to.  At any given time, there can be only zero or one manager per - * gcwq.  The exclusion is handled automatically by this function. + * pool.  The exclusion is handled automatically by this function.   *   * The caller can safely start processing works on false return.  On   * true return, it's guaranteed that need_to_create_worker() is false   * and may_start_working() is true.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed   * multiple times.  Does GFP_KERNEL allocations.   *   * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true if - * some action was taken. + * spin_lock_irq(pool->lock) which may be released and regrabbed + * multiple times.  Does GFP_KERNEL allocations.   */  static bool manage_workers(struct worker *worker)  { @@ -2107,20 +2025,20 @@ static bool manage_workers(struct worker *worker)  	 * manager against CPU hotplug.  	 *  	 * assoc_mutex would always be free unless CPU hotplug is in -	 * progress.  trylock first without dropping @gcwq->lock. +	 * progress.  trylock first without dropping @pool->lock.  	 */  	if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { -		spin_unlock_irq(&pool->gcwq->lock); +		spin_unlock_irq(&pool->lock);  		mutex_lock(&pool->assoc_mutex);  		/*  		 * CPU hotplug could have happened while we were waiting  		 * for assoc_mutex.  Hotplug itself can't handle us  		 * because manager isn't either on idle or busy list, and -		 * @gcwq's state and ours could have deviated. +		 * @pool's state and ours could have deviated.  		 *  		 * As hotplug is now excluded via assoc_mutex, we can  		 * simply try to bind.  It will succeed or fail depending -		 * on @gcwq's current state.  Try it and adjust +		 * on @pool's current state.  
Try it and adjust  		 * %WORKER_UNBOUND accordingly.  		 */  		if (worker_maybe_bind_and_lock(worker)) @@ -2157,18 +2075,15 @@ static bool manage_workers(struct worker *worker)   * call this function to process a work.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock) which is released and regrabbed.   */  static void process_one_work(struct worker *worker, struct work_struct *work) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) +__releases(&pool->lock) +__acquires(&pool->lock)  { -	struct cpu_workqueue_struct *cwq = get_work_cwq(work); +	struct pool_workqueue *pwq = get_work_pwq(work);  	struct worker_pool *pool = worker->pool; -	struct global_cwq *gcwq = pool->gcwq; -	struct hlist_head *bwh = busy_worker_head(gcwq, work); -	bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; -	work_func_t f = work->func; +	bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;  	int work_color;  	struct worker *collision;  #ifdef CONFIG_LOCKDEP @@ -2186,11 +2101,11 @@ __acquires(&gcwq->lock)  	/*  	 * Ensure we're on the correct CPU.  DISASSOCIATED test is  	 * necessary to avoid spurious warnings from rescuers servicing the -	 * unbound or a disassociated gcwq. +	 * unbound or a disassociated pool.  	 */  	WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && -		     !(gcwq->flags & GCWQ_DISASSOCIATED) && -		     raw_smp_processor_id() != gcwq->cpu); +		     !(pool->flags & POOL_DISASSOCIATED) && +		     raw_smp_processor_id() != pool->cpu);  	/*  	 * A single work shouldn't be executed concurrently by @@ -2198,7 +2113,7 @@ __acquires(&gcwq->lock)  	 * already processing the work.  If so, defer the work to the  	 * currently executing one.  	 */ -	collision = __find_worker_executing_work(gcwq, bwh, work); +	collision = find_worker_executing_work(pool, work);  	if (unlikely(collision)) {  		move_linked_works(work, &collision->scheduled, NULL);  		return; @@ -2206,9 +2121,10 @@ __acquires(&gcwq->lock)  	/* claim and dequeue */  	debug_work_deactivate(work); -	hlist_add_head(&worker->hentry, bwh); +	hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);  	worker->current_work = work; -	worker->current_cwq = cwq; +	worker->current_func = work->func; +	worker->current_pwq = pwq;  	work_color = get_work_color(work);  	list_del_init(&work->entry); @@ -2221,53 +2137,55 @@ __acquires(&gcwq->lock)  		worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);  	/* -	 * Unbound gcwq isn't concurrency managed and work items should be +	 * Unbound pool isn't concurrency managed and work items should be  	 * executed ASAP.  Wake up another worker if necessary.  	 */  	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))  		wake_up_worker(pool);  	/* -	 * Record the last CPU and clear PENDING which should be the last -	 * update to @work.  Also, do this inside @gcwq->lock so that +	 * Record the last pool and clear PENDING which should be the last +	 * update to @work.  Also, do this inside @pool->lock so that  	 * PENDING and queued state changes happen together while IRQ is  	 * disabled.  	 
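set_work_pool_and_clear_pending() below records the pool id in work->data and clears PENDING in one step, mirroring get_work_pool_id() earlier in the patch: while queued the word carries a pwq pointer, and off-queue it carries the last pool id above the flag bits. A rough userspace model of that tagged word (bit layout and constants invented for the sketch; the real word also keeps PENDING and CANCELING bits):

#include <stdint.h>
#include <stdio.h>

#define MODEL_STRUCT_PWQ      0x1UL  /* low flag bit: word holds a pwq pointer */
#define MODEL_OFFQ_POOL_SHIFT 5      /* pool id sits above the low flag bits */

/* off-queue: store the last pool id, pointer and pending bits clear */
static uintptr_t model_set_pool(unsigned long pool_id)
{
	return (uintptr_t)pool_id << MODEL_OFFQ_POOL_SHIFT;
}

static unsigned long model_get_pool_id(uintptr_t data)
{
	if (data & MODEL_STRUCT_PWQ)     /* queued: would chase pwq->pool->id */
		return 0;
	return data >> MODEL_OFFQ_POOL_SHIFT;
}

int main(void)
{
	uintptr_t data = model_set_pool(3);

	printf("off-queue word 0x%lx encodes pool id %lu\n",
	       (unsigned long)data, model_get_pool_id(data));
	return 0;
}
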
*/ -	set_work_cpu_and_clear_pending(work, gcwq->cpu); +	set_work_pool_and_clear_pending(work, pool->id); -	spin_unlock_irq(&gcwq->lock); +	spin_unlock_irq(&pool->lock); -	lock_map_acquire_read(&cwq->wq->lockdep_map); +	lock_map_acquire_read(&pwq->wq->lockdep_map);  	lock_map_acquire(&lockdep_map);  	trace_workqueue_execute_start(work); -	f(work); +	worker->current_func(work);  	/*  	 * While we must be careful to not use "work" after this, the trace  	 * point will only record its address.  	 */  	trace_workqueue_execute_end(work);  	lock_map_release(&lockdep_map); -	lock_map_release(&cwq->wq->lockdep_map); +	lock_map_release(&pwq->wq->lockdep_map);  	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {  		pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"  		       "     last function: %pf\n", -		       current->comm, preempt_count(), task_pid_nr(current), f); +		       current->comm, preempt_count(), task_pid_nr(current), +		       worker->current_func);  		debug_show_held_locks(current);  		dump_stack();  	} -	spin_lock_irq(&gcwq->lock); +	spin_lock_irq(&pool->lock);  	/* clear cpu intensive status */  	if (unlikely(cpu_intensive))  		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);  	/* we're done with it, release */ -	hlist_del_init(&worker->hentry); +	hash_del(&worker->hentry);  	worker->current_work = NULL; -	worker->current_cwq = NULL; -	cwq_dec_nr_in_flight(cwq, work_color); +	worker->current_func = NULL; +	worker->current_pwq = NULL; +	pwq_dec_nr_in_flight(pwq, work_color);  }  /** @@ -2279,7 +2197,7 @@ __acquires(&gcwq->lock)   * fetches a work from the top and executes it.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed   * multiple times.   */  static void process_scheduled_works(struct worker *worker) @@ -2295,8 +2213,8 @@ static void process_scheduled_works(struct worker *worker)   * worker_thread - the worker thread function   * @__worker: self   * - * The gcwq worker thread function.  There's a single dynamic pool of - * these per each cpu.  These workers process all works regardless of + * The worker thread function.  There are NR_CPU_WORKER_POOLS dynamic pools + * of these per each cpu.  These workers process all works regardless of   * their specific target workqueue.  The only exception is works which   * belong to workqueues with a rescuer which will be explained in   * rescuer_thread(). @@ -2305,16 +2223,15 @@ static int worker_thread(void *__worker)  {  	struct worker *worker = __worker;  	struct worker_pool *pool = worker->pool; -	struct global_cwq *gcwq = pool->gcwq;  	/* tell the scheduler that this is a workqueue worker */  	worker->task->flags |= PF_WQ_WORKER;  woke_up: -	spin_lock_irq(&gcwq->lock); +	spin_lock_irq(&pool->lock);  	/* we are off idle list if destruction or rebind is requested */  	if (unlikely(list_empty(&worker->entry))) { -		spin_unlock_irq(&gcwq->lock); +		spin_unlock_irq(&pool->lock);  		/* if DIE is set, destruction is requested */  		if (worker->flags & WORKER_DIE) { @@ -2373,52 +2290,59 @@ sleep:  		goto recheck;  	/* -	 * gcwq->lock is held and there's no work to process and no -	 * need to manage, sleep.  Workers are woken up only while -	 * holding gcwq->lock or from local cpu, so setting the -	 * current state before releasing gcwq->lock is enough to -	 * prevent losing any event. +	 * pool->lock is held and there's no work to process and no need to +	 * manage, sleep.  
Workers are woken up only while holding +	 * pool->lock or from local cpu, so setting the current state +	 * before releasing pool->lock is enough to prevent losing any +	 * event.  	 */  	worker_enter_idle(worker);  	__set_current_state(TASK_INTERRUPTIBLE); -	spin_unlock_irq(&gcwq->lock); +	spin_unlock_irq(&pool->lock);  	schedule();  	goto woke_up;  }  /**   * rescuer_thread - the rescuer thread function - * @__wq: the associated workqueue + * @__rescuer: self   *   * Workqueue rescuer thread function.  There's one rescuer for each   * workqueue which has WQ_RESCUER set.   * - * Regular work processing on a gcwq may block trying to create a new + * Regular work processing on a pool may block trying to create a new   * worker which uses GFP_KERNEL allocation which has slight chance of   * developing into deadlock if some works currently on the same queue   * need to be processed to satisfy the GFP_KERNEL allocation.  This is   * the problem rescuer solves.   * - * When such condition is possible, the gcwq summons rescuers of all - * workqueues which have works queued on the gcwq and let them process + * When such condition is possible, the pool summons rescuers of all + * workqueues which have works queued on the pool and let them process   * those works so that forward progress can be guaranteed.   *   * This should happen rarely.   */ -static int rescuer_thread(void *__wq) +static int rescuer_thread(void *__rescuer)  { -	struct workqueue_struct *wq = __wq; -	struct worker *rescuer = wq->rescuer; +	struct worker *rescuer = __rescuer; +	struct workqueue_struct *wq = rescuer->rescue_wq;  	struct list_head *scheduled = &rescuer->scheduled;  	bool is_unbound = wq->flags & WQ_UNBOUND;  	unsigned int cpu;  	set_user_nice(current, RESCUER_NICE_LEVEL); + +	/* +	 * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it +	 * doesn't participate in concurrency management. +	 */ +	rescuer->task->flags |= PF_WQ_WORKER;  repeat:  	set_current_state(TASK_INTERRUPTIBLE);  	if (kthread_should_stop()) {  		__set_current_state(TASK_RUNNING); +		rescuer->task->flags &= ~PF_WQ_WORKER;  		return 0;  	} @@ -2428,9 +2352,8 @@ repeat:  	 */  	for_each_mayday_cpu(cpu, wq->mayday_mask) {  		unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; -		struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); -		struct worker_pool *pool = cwq->pool; -		struct global_cwq *gcwq = pool->gcwq; +		struct pool_workqueue *pwq = get_pwq(tcpu, wq); +		struct worker_pool *pool = pwq->pool;  		struct work_struct *work, *n;  		__set_current_state(TASK_RUNNING); @@ -2446,22 +2369,24 @@ repeat:  		 */  		BUG_ON(!list_empty(&rescuer->scheduled));  		list_for_each_entry_safe(work, n, &pool->worklist, entry) -			if (get_work_cwq(work) == cwq) +			if (get_work_pwq(work) == pwq)  				move_linked_works(work, scheduled, &n);  		process_scheduled_works(rescuer);  		/* -		 * Leave this gcwq.  If keep_working() is %true, notify a +		 * Leave this pool.  If keep_working() is %true, notify a  		 * regular worker; otherwise, we end up with 0 concurrency  		 * and stalling the execution.  		 
*/  		if (keep_working(pool))  			wake_up_worker(pool); -		spin_unlock_irq(&gcwq->lock); +		spin_unlock_irq(&pool->lock);  	} +	/* rescuers should never participate in concurrency management */ +	WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));  	schedule();  	goto repeat;  } @@ -2479,7 +2404,7 @@ static void wq_barrier_func(struct work_struct *work)  /**   * insert_wq_barrier - insert a barrier work - * @cwq: cwq to insert barrier into + * @pwq: pwq to insert barrier into   * @barr: wq_barrier to insert   * @target: target work to attach @barr to   * @worker: worker currently executing @target, NULL if @target is not executing @@ -2496,12 +2421,12 @@ static void wq_barrier_func(struct work_struct *work)   * after a work with LINKED flag set.   *   * Note that when @worker is non-NULL, @target may be modified - * underneath us, so we can't reliably determine cwq from @target. + * underneath us, so we can't reliably determine pwq from @target.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock).   */ -static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, +static void insert_wq_barrier(struct pool_workqueue *pwq,  			      struct wq_barrier *barr,  			      struct work_struct *target, struct worker *worker)  { @@ -2509,7 +2434,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,  	unsigned int linked = 0;  	/* -	 * debugobject calls are safe here even with gcwq->lock locked +	 * debugobject calls are safe here even with pool->lock locked  	 * as we know for sure that this will not trigger any of the  	 * checks and call back into the fixup functions where we  	 * might deadlock. @@ -2534,23 +2459,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,  	}  	debug_work_activate(&barr->work); -	insert_work(cwq, &barr->work, head, +	insert_work(pwq, &barr->work, head,  		    work_color_to_flags(WORK_NO_COLOR) | linked);  }  /** - * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing + * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing   * @wq: workqueue being flushed   * @flush_color: new flush color, < 0 for no-op   * @work_color: new work color, < 0 for no-op   * - * Prepare cwqs for workqueue flushing. + * Prepare pwqs for workqueue flushing.   * - * If @flush_color is non-negative, flush_color on all cwqs should be - * -1.  If no cwq has in-flight commands at the specified color, all - * cwq->flush_color's stay at -1 and %false is returned.  If any cwq - * has in flight commands, its cwq->flush_color is set to - * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq + * If @flush_color is non-negative, flush_color on all pwqs should be + * -1.  If no pwq has in-flight commands at the specified color, all + * pwq->flush_color's stay at -1 and %false is returned.  If any pwq + * has in flight commands, its pwq->flush_color is set to + * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq   * wakeup logic is armed and %true is returned.   *   * The caller should have initialized @wq->first_flusher prior to @@ -2558,7 +2483,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,   * @flush_color is negative, no flush color update is done and %false   * is returned.   * - * If @work_color is non-negative, all cwqs should have the same + * If @work_color is non-negative, all pwqs should have the same   * work_color which is previous to @work_color and all will be   * advanced to @work_color.   
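Under the color scheme described above, each queued work is stamped with the pwq's current work_color, nr_in_flight[] counts works per color, and a flusher waits for its flush_color to drain while new work keeps arriving under the advanced color. A rough standalone model of that bookkeeping (constants and names invented for the sketch, no locking or wait queues):

#include <stdio.h>

#define MODEL_NR_COLORS 15   /* assumption: a small fixed number of colors */

struct model_pwq {
	int work_color;                   /* color stamped on newly queued work */
	int flush_color;                  /* color being flushed, -1 if none */
	int nr_in_flight[MODEL_NR_COLORS];
};

static int model_next_color(int color)
{
	return (color + 1) % MODEL_NR_COLORS;
}

static void model_queue_one(struct model_pwq *pwq)
{
	pwq->nr_in_flight[pwq->work_color]++;
}

static void model_finish_one(struct model_pwq *pwq, int color)
{
	if (--pwq->nr_in_flight[color] == 0 && pwq->flush_color == color)
		printf("color %d drained, wake the flusher\n", color);
}

int main(void)
{
	struct model_pwq pwq = { .flush_color = -1 };

	model_queue_one(&pwq);
	model_queue_one(&pwq);

	/* a flush starts: freeze the current color and advance work_color */
	pwq.flush_color = pwq.work_color;
	pwq.work_color = model_next_color(pwq.work_color);

	model_finish_one(&pwq, pwq.flush_color);   /* one still in flight */
	model_finish_one(&pwq, pwq.flush_color);   /* now drained */
	return 0;
}
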
* @@ -2569,42 +2494,42 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,   * %true if @flush_color >= 0 and there's something to flush.  %false   * otherwise.   */ -static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, +static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,  				      int flush_color, int work_color)  {  	bool wait = false;  	unsigned int cpu;  	if (flush_color >= 0) { -		BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); -		atomic_set(&wq->nr_cwqs_to_flush, 1); +		BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); +		atomic_set(&wq->nr_pwqs_to_flush, 1);  	} -	for_each_cwq_cpu(cpu, wq) { -		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); -		struct global_cwq *gcwq = cwq->pool->gcwq; +	for_each_pwq_cpu(cpu, wq) { +		struct pool_workqueue *pwq = get_pwq(cpu, wq); +		struct worker_pool *pool = pwq->pool; -		spin_lock_irq(&gcwq->lock); +		spin_lock_irq(&pool->lock);  		if (flush_color >= 0) { -			BUG_ON(cwq->flush_color != -1); +			BUG_ON(pwq->flush_color != -1); -			if (cwq->nr_in_flight[flush_color]) { -				cwq->flush_color = flush_color; -				atomic_inc(&wq->nr_cwqs_to_flush); +			if (pwq->nr_in_flight[flush_color]) { +				pwq->flush_color = flush_color; +				atomic_inc(&wq->nr_pwqs_to_flush);  				wait = true;  			}  		}  		if (work_color >= 0) { -			BUG_ON(work_color != work_next_color(cwq->work_color)); -			cwq->work_color = work_color; +			BUG_ON(work_color != work_next_color(pwq->work_color)); +			pwq->work_color = work_color;  		} -		spin_unlock_irq(&gcwq->lock); +		spin_unlock_irq(&pool->lock);  	} -	if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) +	if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))  		complete(&wq->first_flusher->done);  	return wait; @@ -2655,7 +2580,7 @@ void flush_workqueue(struct workqueue_struct *wq)  			wq->first_flusher = &this_flusher; -			if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, +			if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,  						       wq->work_color)) {  				/* nothing to flush, done */  				wq->flush_color = next_color; @@ -2666,7 +2591,7 @@ void flush_workqueue(struct workqueue_struct *wq)  			/* wait in queue */  			BUG_ON(wq->flush_color == this_flusher.flush_color);  			list_add_tail(&this_flusher.list, &wq->flusher_queue); -			flush_workqueue_prep_cwqs(wq, -1, wq->work_color); +			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);  		}  	} else {  		/* @@ -2733,7 +2658,7 @@ void flush_workqueue(struct workqueue_struct *wq)  			list_splice_tail_init(&wq->flusher_overflow,  					      &wq->flusher_queue); -			flush_workqueue_prep_cwqs(wq, -1, wq->work_color); +			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);  		}  		if (list_empty(&wq->flusher_queue)) { @@ -2743,7 +2668,7 @@ void flush_workqueue(struct workqueue_struct *wq)  		/*  		 * Need to flush more colors.  Make the next flusher -		 * the new first flusher and arm cwqs. +		 * the new first flusher and arm pwqs.  		 
*/  		BUG_ON(wq->flush_color == wq->work_color);  		BUG_ON(wq->flush_color != next->flush_color); @@ -2751,7 +2676,7 @@ void flush_workqueue(struct workqueue_struct *wq)  		list_del_init(&next->list);  		wq->first_flusher = next; -		if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) +		if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))  			break;  		/* @@ -2794,13 +2719,13 @@ void drain_workqueue(struct workqueue_struct *wq)  reflush:  	flush_workqueue(wq); -	for_each_cwq_cpu(cpu, wq) { -		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); +	for_each_pwq_cpu(cpu, wq) { +		struct pool_workqueue *pwq = get_pwq(cpu, wq);  		bool drained; -		spin_lock_irq(&cwq->pool->gcwq->lock); -		drained = !cwq->nr_active && list_empty(&cwq->delayed_works); -		spin_unlock_irq(&cwq->pool->gcwq->lock); +		spin_lock_irq(&pwq->pool->lock); +		drained = !pwq->nr_active && list_empty(&pwq->delayed_works); +		spin_unlock_irq(&pwq->pool->lock);  		if (drained)  			continue; @@ -2822,34 +2747,29 @@ EXPORT_SYMBOL_GPL(drain_workqueue);  static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)  {  	struct worker *worker = NULL; -	struct global_cwq *gcwq; -	struct cpu_workqueue_struct *cwq; +	struct worker_pool *pool; +	struct pool_workqueue *pwq;  	might_sleep(); -	gcwq = get_work_gcwq(work); -	if (!gcwq) +	pool = get_work_pool(work); +	if (!pool)  		return false; -	spin_lock_irq(&gcwq->lock); -	if (!list_empty(&work->entry)) { -		/* -		 * See the comment near try_to_grab_pending()->smp_rmb(). -		 * If it was re-queued to a different gcwq under us, we -		 * are not going to wait. -		 */ -		smp_rmb(); -		cwq = get_work_cwq(work); -		if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) +	spin_lock_irq(&pool->lock); +	/* see the comment in try_to_grab_pending() with the same code */ +	pwq = get_work_pwq(work); +	if (pwq) { +		if (unlikely(pwq->pool != pool))  			goto already_gone;  	} else { -		worker = find_worker_executing_work(gcwq, work); +		worker = find_worker_executing_work(pool, work);  		if (!worker)  			goto already_gone; -		cwq = worker->current_cwq; +		pwq = worker->current_pwq;  	} -	insert_wq_barrier(cwq, barr, work, worker); -	spin_unlock_irq(&gcwq->lock); +	insert_wq_barrier(pwq, barr, work, worker); +	spin_unlock_irq(&pool->lock);  	/*  	 * If @max_active is 1 or rescuer is in use, flushing another work @@ -2857,15 +2777,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)  	 * flusher is not running on the same workqueue by verifying write  	 * access.  	 
*/ -	if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) -		lock_map_acquire(&cwq->wq->lockdep_map); +	if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) +		lock_map_acquire(&pwq->wq->lockdep_map);  	else -		lock_map_acquire_read(&cwq->wq->lockdep_map); -	lock_map_release(&cwq->wq->lockdep_map); +		lock_map_acquire_read(&pwq->wq->lockdep_map); +	lock_map_release(&pwq->wq->lockdep_map);  	return true;  already_gone: -	spin_unlock_irq(&gcwq->lock); +	spin_unlock_irq(&pool->lock);  	return false;  } @@ -2961,8 +2881,7 @@ bool flush_delayed_work(struct delayed_work *dwork)  {  	local_irq_disable();  	if (del_timer_sync(&dwork->timer)) -		__queue_work(dwork->cpu, -			     get_work_cwq(&dwork->work)->wq, &dwork->work); +		__queue_work(dwork->cpu, dwork->wq, &dwork->work);  	local_irq_enable();  	return flush_work(&dwork->work);  } @@ -2992,7 +2911,8 @@ bool cancel_delayed_work(struct delayed_work *dwork)  	if (unlikely(ret < 0))  		return false; -	set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); +	set_work_pool_and_clear_pending(&dwork->work, +					get_work_pool_id(&dwork->work));  	local_irq_restore(flags);  	return ret;  } @@ -3171,46 +3091,46 @@ int keventd_up(void)  	return system_wq != NULL;  } -static int alloc_cwqs(struct workqueue_struct *wq) +static int alloc_pwqs(struct workqueue_struct *wq)  {  	/* -	 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. +	 * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.  	 * Make sure that the alignment isn't lower than that of  	 * unsigned long long.  	 */ -	const size_t size = sizeof(struct cpu_workqueue_struct); +	const size_t size = sizeof(struct pool_workqueue);  	const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,  				   __alignof__(unsigned long long));  	if (!(wq->flags & WQ_UNBOUND)) -		wq->cpu_wq.pcpu = __alloc_percpu(size, align); +		wq->pool_wq.pcpu = __alloc_percpu(size, align);  	else {  		void *ptr;  		/* -		 * Allocate enough room to align cwq and put an extra +		 * Allocate enough room to align pwq and put an extra  		 * pointer at the end pointing back to the originally  		 * allocated pointer which will be used for free.  		 */  		ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);  		if (ptr) { -			wq->cpu_wq.single = PTR_ALIGN(ptr, align); -			*(void **)(wq->cpu_wq.single + 1) = ptr; +			wq->pool_wq.single = PTR_ALIGN(ptr, align); +			*(void **)(wq->pool_wq.single + 1) = ptr;  		}  	}  	/* just in case, make sure it's actually aligned */ -	BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); -	return wq->cpu_wq.v ? 0 : -ENOMEM; +	BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); +	return wq->pool_wq.v ? 
0 : -ENOMEM;  } -static void free_cwqs(struct workqueue_struct *wq) +static void free_pwqs(struct workqueue_struct *wq)  {  	if (!(wq->flags & WQ_UNBOUND)) -		free_percpu(wq->cpu_wq.pcpu); -	else if (wq->cpu_wq.single) { -		/* the pointer to free is stored right after the cwq */ -		kfree(*(void **)(wq->cpu_wq.single + 1)); +		free_percpu(wq->pool_wq.pcpu); +	else if (wq->pool_wq.single) { +		/* the pointer to free is stored right after the pwq */ +		kfree(*(void **)(wq->pool_wq.single + 1));  	}  } @@ -3264,27 +3184,25 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,  	wq->flags = flags;  	wq->saved_max_active = max_active;  	mutex_init(&wq->flush_mutex); -	atomic_set(&wq->nr_cwqs_to_flush, 0); +	atomic_set(&wq->nr_pwqs_to_flush, 0);  	INIT_LIST_HEAD(&wq->flusher_queue);  	INIT_LIST_HEAD(&wq->flusher_overflow);  	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);  	INIT_LIST_HEAD(&wq->list); -	if (alloc_cwqs(wq) < 0) +	if (alloc_pwqs(wq) < 0)  		goto err; -	for_each_cwq_cpu(cpu, wq) { -		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); -		struct global_cwq *gcwq = get_gcwq(cpu); -		int pool_idx = (bool)(flags & WQ_HIGHPRI); +	for_each_pwq_cpu(cpu, wq) { +		struct pool_workqueue *pwq = get_pwq(cpu, wq); -		BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); -		cwq->pool = &gcwq->pools[pool_idx]; -		cwq->wq = wq; -		cwq->flush_color = -1; -		cwq->max_active = max_active; -		INIT_LIST_HEAD(&cwq->delayed_works); +		BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); +		pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); +		pwq->wq = wq; +		pwq->flush_color = -1; +		pwq->max_active = max_active; +		INIT_LIST_HEAD(&pwq->delayed_works);  	}  	if (flags & WQ_RESCUER) { @@ -3297,7 +3215,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,  		if (!rescuer)  			goto err; -		rescuer->task = kthread_create(rescuer_thread, wq, "%s", +		rescuer->rescue_wq = wq; +		rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",  					       wq->name);  		if (IS_ERR(rescuer->task))  			goto err; @@ -3314,8 +3233,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,  	spin_lock(&workqueue_lock);  	if (workqueue_freezing && wq->flags & WQ_FREEZABLE) -		for_each_cwq_cpu(cpu, wq) -			get_cwq(cpu, wq)->max_active = 0; +		for_each_pwq_cpu(cpu, wq) +			get_pwq(cpu, wq)->max_active = 0;  	list_add(&wq->list, &workqueues); @@ -3324,7 +3243,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,  	return wq;  err:  	if (wq) { -		free_cwqs(wq); +		free_pwqs(wq);  		free_mayday_mask(wq->mayday_mask);  		kfree(wq->rescuer);  		kfree(wq); @@ -3355,14 +3274,14 @@ void destroy_workqueue(struct workqueue_struct *wq)  	spin_unlock(&workqueue_lock);  	/* sanity check */ -	for_each_cwq_cpu(cpu, wq) { -		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); +	for_each_pwq_cpu(cpu, wq) { +		struct pool_workqueue *pwq = get_pwq(cpu, wq);  		int i;  		for (i = 0; i < WORK_NR_COLORS; i++) -			BUG_ON(cwq->nr_in_flight[i]); -		BUG_ON(cwq->nr_active); -		BUG_ON(!list_empty(&cwq->delayed_works)); +			BUG_ON(pwq->nr_in_flight[i]); +		BUG_ON(pwq->nr_active); +		BUG_ON(!list_empty(&pwq->delayed_works));  	}  	if (wq->flags & WQ_RESCUER) { @@ -3371,29 +3290,29 @@ void destroy_workqueue(struct workqueue_struct *wq)  		kfree(wq->rescuer);  	} -	free_cwqs(wq); +	free_pwqs(wq);  	kfree(wq);  }  EXPORT_SYMBOL_GPL(destroy_workqueue);  /** - * cwq_set_max_active - adjust max_active of a cwq - * @cwq: target cpu_workqueue_struct + * pwq_set_max_active - adjust max_active of 
a pwq + * @pwq: target pool_workqueue   * @max_active: new max_active value.   * - * Set @cwq->max_active to @max_active and activate delayed works if + * Set @pwq->max_active to @max_active and activate delayed works if   * increased.   *   * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock).   */ -static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) +static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)  { -	cwq->max_active = max_active; +	pwq->max_active = max_active; -	while (!list_empty(&cwq->delayed_works) && -	       cwq->nr_active < cwq->max_active) -		cwq_activate_first_delayed(cwq); +	while (!list_empty(&pwq->delayed_works) && +	       pwq->nr_active < pwq->max_active) +		pwq_activate_first_delayed(pwq);  }  /** @@ -3416,16 +3335,17 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)  	wq->saved_max_active = max_active; -	for_each_cwq_cpu(cpu, wq) { -		struct global_cwq *gcwq = get_gcwq(cpu); +	for_each_pwq_cpu(cpu, wq) { +		struct pool_workqueue *pwq = get_pwq(cpu, wq); +		struct worker_pool *pool = pwq->pool; -		spin_lock_irq(&gcwq->lock); +		spin_lock_irq(&pool->lock);  		if (!(wq->flags & WQ_FREEZABLE) || -		    !(gcwq->flags & GCWQ_FREEZING)) -			cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); +		    !(pool->flags & POOL_FREEZING)) +			pwq_set_max_active(pwq, max_active); -		spin_unlock_irq(&gcwq->lock); +		spin_unlock_irq(&pool->lock);  	}  	spin_unlock(&workqueue_lock); @@ -3446,57 +3366,38 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);   */  bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)  { -	struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); +	struct pool_workqueue *pwq = get_pwq(cpu, wq); -	return !list_empty(&cwq->delayed_works); +	return !list_empty(&pwq->delayed_works);  }  EXPORT_SYMBOL_GPL(workqueue_congested);  /** - * work_cpu - return the last known associated cpu for @work - * @work: the work of interest - * - * RETURNS: - * CPU number if @work was ever queued.  WORK_CPU_NONE otherwise. - */ -unsigned int work_cpu(struct work_struct *work) -{ -	struct global_cwq *gcwq = get_work_gcwq(work); - -	return gcwq ? gcwq->cpu : WORK_CPU_NONE; -} -EXPORT_SYMBOL_GPL(work_cpu); - -/**   * work_busy - test whether a work is currently pending or running   * @work: the work to be tested   *   * Test whether @work is currently pending or running.  There is no   * synchronization around this function and the test result is   * unreliable and only useful as advisory hints or for debugging. - * Especially for reentrant wqs, the pending state might hide the - * running state.   *   * RETURNS:   * OR'd bitmask of WORK_BUSY_* bits.   */  unsigned int work_busy(struct work_struct *work)  { -	struct global_cwq *gcwq = get_work_gcwq(work); +	struct worker_pool *pool = get_work_pool(work);  	unsigned long flags;  	unsigned int ret = 0; -	if (!gcwq) -		return 0; - -	spin_lock_irqsave(&gcwq->lock, flags); -  	if (work_pending(work))  		ret |= WORK_BUSY_PENDING; -	if (find_worker_executing_work(gcwq, work)) -		ret |= WORK_BUSY_RUNNING; -	spin_unlock_irqrestore(&gcwq->lock, flags); +	if (pool) { +		spin_lock_irqsave(&pool->lock, flags); +		if (find_worker_executing_work(pool, work)) +			ret |= WORK_BUSY_RUNNING; +		spin_unlock_irqrestore(&pool->lock, flags); +	}  	return ret;  } @@ -3506,65 +3407,49 @@ EXPORT_SYMBOL_GPL(work_busy);   * CPU hotplug.   *   * There are two challenges in supporting CPU hotplug.  
Firstly, there - * are a lot of assumptions on strong associations among work, cwq and - * gcwq which make migrating pending and scheduled works very + * are a lot of assumptions on strong associations among work, pwq and + * pool which make migrating pending and scheduled works very   * difficult to implement without impacting hot paths.  Secondly, - * gcwqs serve mix of short, long and very long running works making + * worker pools serve mix of short, long and very long running works making   * blocked draining impractical.   * - * This is solved by allowing a gcwq to be disassociated from the CPU + * This is solved by allowing the pools to be disassociated from the CPU   * running as an unbound one and allowing it to be reattached later if the   * cpu comes back online.   */ -/* claim manager positions of all pools */ -static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) +static void wq_unbind_fn(struct work_struct *work)  { -	struct worker_pool *pool; - -	for_each_worker_pool(pool, gcwq) -		mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); -	spin_lock_irq(&gcwq->lock); -} - -/* release manager positions */ -static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) -{ -	struct worker_pool *pool; - -	spin_unlock_irq(&gcwq->lock); -	for_each_worker_pool(pool, gcwq) -		mutex_unlock(&pool->assoc_mutex); -} - -static void gcwq_unbind_fn(struct work_struct *work) -{ -	struct global_cwq *gcwq = get_gcwq(smp_processor_id()); +	int cpu = smp_processor_id();  	struct worker_pool *pool;  	struct worker *worker;  	struct hlist_node *pos;  	int i; -	BUG_ON(gcwq->cpu != smp_processor_id()); +	for_each_std_worker_pool(pool, cpu) { +		BUG_ON(cpu != smp_processor_id()); -	gcwq_claim_assoc_and_lock(gcwq); +		mutex_lock(&pool->assoc_mutex); +		spin_lock_irq(&pool->lock); -	/* -	 * We've claimed all manager positions.  Make all workers unbound -	 * and set DISASSOCIATED.  Before this, all workers except for the -	 * ones which are still executing works from before the last CPU -	 * down must be on the cpu.  After this, they may become diasporas. -	 */ -	for_each_worker_pool(pool, gcwq) +		/* +		 * We've claimed all manager positions.  Make all workers +		 * unbound and set DISASSOCIATED.  Before this, all workers +		 * except for the ones which are still executing works from +		 * before the last CPU down must be on the cpu.  After +		 * this, they may become diasporas. +		 */  		list_for_each_entry(worker, &pool->idle_list, entry)  			worker->flags |= WORKER_UNBOUND; -	for_each_busy_worker(worker, i, pos, gcwq) -		worker->flags |= WORKER_UNBOUND; +		for_each_busy_worker(worker, i, pos, pool) +			worker->flags |= WORKER_UNBOUND; -	gcwq->flags |= GCWQ_DISASSOCIATED; +		pool->flags |= POOL_DISASSOCIATED; -	gcwq_release_assoc_and_unlock(gcwq); +		spin_unlock_irq(&pool->lock); +		mutex_unlock(&pool->assoc_mutex); +	}  	/*  	 * Call schedule() so that we cross rq->lock and thus can guarantee @@ -3576,16 +3461,16 @@ static void gcwq_unbind_fn(struct work_struct *work)  	/*  	 * Sched callbacks are disabled now.  Zap nr_running.  After this,  	 * nr_running stays zero and need_more_worker() and keep_working() -	 * are always true as long as the worklist is not empty.  @gcwq now -	 * behaves as unbound (in terms of concurrency management) gcwq -	 * which is served by workers tied to the CPU. +	 * are always true as long as the worklist is not empty.  Pools on +	 * @cpu now behave as unbound (in terms of concurrency management) +	 * pools which are served by workers tied to the CPU.  	 
*  	 * On return from this function, the current worker would trigger  	 * unbound chain execution of pending work items if other workers  	 * didn't already.  	 */ -	for_each_worker_pool(pool, gcwq) -		atomic_set(get_pool_nr_running(pool), 0); +	for_each_std_worker_pool(pool, cpu) +		atomic_set(&pool->nr_running, 0);  }  /* @@ -3597,12 +3482,11 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,  					       void *hcpu)  {  	unsigned int cpu = (unsigned long)hcpu; -	struct global_cwq *gcwq = get_gcwq(cpu);  	struct worker_pool *pool;  	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_UP_PREPARE: -		for_each_worker_pool(pool, gcwq) { +		for_each_std_worker_pool(pool, cpu) {  			struct worker *worker;  			if (pool->nr_workers) @@ -3612,18 +3496,24 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,  			if (!worker)  				return NOTIFY_BAD; -			spin_lock_irq(&gcwq->lock); +			spin_lock_irq(&pool->lock);  			start_worker(worker); -			spin_unlock_irq(&gcwq->lock); +			spin_unlock_irq(&pool->lock);  		}  		break;  	case CPU_DOWN_FAILED:  	case CPU_ONLINE: -		gcwq_claim_assoc_and_lock(gcwq); -		gcwq->flags &= ~GCWQ_DISASSOCIATED; -		rebind_workers(gcwq); -		gcwq_release_assoc_and_unlock(gcwq); +		for_each_std_worker_pool(pool, cpu) { +			mutex_lock(&pool->assoc_mutex); +			spin_lock_irq(&pool->lock); + +			pool->flags &= ~POOL_DISASSOCIATED; +			rebind_workers(pool); + +			spin_unlock_irq(&pool->lock); +			mutex_unlock(&pool->assoc_mutex); +		}  		break;  	}  	return NOTIFY_OK; @@ -3643,7 +3533,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,  	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_DOWN_PREPARE:  		/* unbinding should happen on the local CPU */ -		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); +		INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);  		queue_work_on(cpu, system_highpri_wq, &unbind_work);  		flush_work(&unbind_work);  		break; @@ -3696,10 +3586,10 @@ EXPORT_SYMBOL_GPL(work_on_cpu);   *   * Start freezing workqueues.  After this function returns, all freezable   * workqueues will queue new works to their frozen_works list instead of - * gcwq->worklist. + * pool->worklist.   *   * CONTEXT: - * Grabs and releases workqueue_lock and gcwq->lock's. + * Grabs and releases workqueue_lock and pool->lock's.   
*/  void freeze_workqueues_begin(void)  { @@ -3710,23 +3600,26 @@ void freeze_workqueues_begin(void)  	BUG_ON(workqueue_freezing);  	workqueue_freezing = true; -	for_each_gcwq_cpu(cpu) { -		struct global_cwq *gcwq = get_gcwq(cpu); +	for_each_wq_cpu(cpu) { +		struct worker_pool *pool;  		struct workqueue_struct *wq; -		spin_lock_irq(&gcwq->lock); +		for_each_std_worker_pool(pool, cpu) { +			spin_lock_irq(&pool->lock); -		BUG_ON(gcwq->flags & GCWQ_FREEZING); -		gcwq->flags |= GCWQ_FREEZING; +			WARN_ON_ONCE(pool->flags & POOL_FREEZING); +			pool->flags |= POOL_FREEZING; -		list_for_each_entry(wq, &workqueues, list) { -			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); +			list_for_each_entry(wq, &workqueues, list) { +				struct pool_workqueue *pwq = get_pwq(cpu, wq); -			if (cwq && wq->flags & WQ_FREEZABLE) -				cwq->max_active = 0; -		} +				if (pwq && pwq->pool == pool && +				    (wq->flags & WQ_FREEZABLE)) +					pwq->max_active = 0; +			} -		spin_unlock_irq(&gcwq->lock); +			spin_unlock_irq(&pool->lock); +		}  	}  	spin_unlock(&workqueue_lock); @@ -3754,20 +3647,20 @@ bool freeze_workqueues_busy(void)  	BUG_ON(!workqueue_freezing); -	for_each_gcwq_cpu(cpu) { +	for_each_wq_cpu(cpu) {  		struct workqueue_struct *wq;  		/*  		 * nr_active is monotonically decreasing.  It's safe  		 * to peek without lock.  		 */  		list_for_each_entry(wq, &workqueues, list) { -			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); +			struct pool_workqueue *pwq = get_pwq(cpu, wq); -			if (!cwq || !(wq->flags & WQ_FREEZABLE)) +			if (!pwq || !(wq->flags & WQ_FREEZABLE))  				continue; -			BUG_ON(cwq->nr_active < 0); -			if (cwq->nr_active) { +			BUG_ON(pwq->nr_active < 0); +			if (pwq->nr_active) {  				busy = true;  				goto out_unlock;  			} @@ -3782,10 +3675,10 @@ out_unlock:   * thaw_workqueues - thaw workqueues   *   * Thaw workqueues.  Normal queueing is restored and all collected - * frozen works are transferred to their respective gcwq worklists. + * frozen works are transferred to their respective pool worklists.   *   * CONTEXT: - * Grabs and releases workqueue_lock and gcwq->lock's. + * Grabs and releases workqueue_lock and pool->lock's.   
*/  void thaw_workqueues(void)  { @@ -3796,30 +3689,31 @@ void thaw_workqueues(void)  	if (!workqueue_freezing)  		goto out_unlock; -	for_each_gcwq_cpu(cpu) { -		struct global_cwq *gcwq = get_gcwq(cpu); +	for_each_wq_cpu(cpu) {  		struct worker_pool *pool;  		struct workqueue_struct *wq; -		spin_lock_irq(&gcwq->lock); +		for_each_std_worker_pool(pool, cpu) { +			spin_lock_irq(&pool->lock); -		BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); -		gcwq->flags &= ~GCWQ_FREEZING; +			WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); +			pool->flags &= ~POOL_FREEZING; -		list_for_each_entry(wq, &workqueues, list) { -			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); +			list_for_each_entry(wq, &workqueues, list) { +				struct pool_workqueue *pwq = get_pwq(cpu, wq); -			if (!cwq || !(wq->flags & WQ_FREEZABLE)) -				continue; +				if (!pwq || pwq->pool != pool || +				    !(wq->flags & WQ_FREEZABLE)) +					continue; -			/* restore max_active and repopulate worklist */ -			cwq_set_max_active(cwq, wq->saved_max_active); -		} +				/* restore max_active and repopulate worklist */ +				pwq_set_max_active(pwq, wq->saved_max_active); +			} -		for_each_worker_pool(pool, gcwq)  			wake_up_worker(pool); -		spin_unlock_irq(&gcwq->lock); +			spin_unlock_irq(&pool->lock); +		}  	}  	workqueue_freezing = false; @@ -3831,60 +3725,56 @@ out_unlock:  static int __init init_workqueues(void)  {  	unsigned int cpu; -	int i; -	/* make sure we have enough bits for OFFQ CPU number */ -	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < -		     WORK_CPU_LAST); +	/* make sure we have enough bits for OFFQ pool ID */ +	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < +		     WORK_CPU_END * NR_STD_WORKER_POOLS);  	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);  	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); -	/* initialize gcwqs */ -	for_each_gcwq_cpu(cpu) { -		struct global_cwq *gcwq = get_gcwq(cpu); +	/* initialize CPU pools */ +	for_each_wq_cpu(cpu) {  		struct worker_pool *pool; -		spin_lock_init(&gcwq->lock); -		gcwq->cpu = cpu; -		gcwq->flags |= GCWQ_DISASSOCIATED; - -		for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) -			INIT_HLIST_HEAD(&gcwq->busy_hash[i]); - -		for_each_worker_pool(pool, gcwq) { -			pool->gcwq = gcwq; +		for_each_std_worker_pool(pool, cpu) { +			spin_lock_init(&pool->lock); +			pool->cpu = cpu; +			pool->flags |= POOL_DISASSOCIATED;  			INIT_LIST_HEAD(&pool->worklist);  			INIT_LIST_HEAD(&pool->idle_list); +			hash_init(pool->busy_hash);  			init_timer_deferrable(&pool->idle_timer);  			pool->idle_timer.function = idle_worker_timeout;  			pool->idle_timer.data = (unsigned long)pool; -			setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, +			setup_timer(&pool->mayday_timer, pool_mayday_timeout,  				    (unsigned long)pool);  			mutex_init(&pool->assoc_mutex);  			ida_init(&pool->worker_ida); + +			/* alloc pool ID */ +			BUG_ON(worker_pool_assign_id(pool));  		}  	}  	/* create the initial worker */ -	for_each_online_gcwq_cpu(cpu) { -		struct global_cwq *gcwq = get_gcwq(cpu); +	for_each_online_wq_cpu(cpu) {  		struct worker_pool *pool; -		if (cpu != WORK_CPU_UNBOUND) -			gcwq->flags &= ~GCWQ_DISASSOCIATED; - -		for_each_worker_pool(pool, gcwq) { +		for_each_std_worker_pool(pool, cpu) {  			struct worker *worker; +			if (cpu != WORK_CPU_UNBOUND) +				pool->flags &= ~POOL_DISASSOCIATED; +  			worker = create_worker(pool);  			BUG_ON(!worker); -			spin_lock_irq(&gcwq->lock); +			spin_lock_irq(&pool->lock);  			start_worker(worker); -			
spin_unlock_irq(&gcwq->lock); +			spin_unlock_irq(&pool->lock);  		}  	} diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h new file mode 100644 index 00000000000..07650264ec1 --- /dev/null +++ b/kernel/workqueue_internal.h @@ -0,0 +1,65 @@ +/* + * kernel/workqueue_internal.h + * + * Workqueue internal header file.  Only to be included by workqueue and + * core kernel subsystems. + */ +#ifndef _KERNEL_WORKQUEUE_INTERNAL_H +#define _KERNEL_WORKQUEUE_INTERNAL_H + +#include <linux/workqueue.h> +#include <linux/kthread.h> + +struct worker_pool; + +/* + * The poor guys doing the actual heavy lifting.  All on-duty workers are + * either serving the manager role, on idle list or on busy hash.  For + * details on the locking annotation (L, I, X...), refer to workqueue.c. + * + * Only to be used in workqueue and async. + */ +struct worker { +	/* on idle list while idle, on busy hash table while busy */ +	union { +		struct list_head	entry;	/* L: while idle */ +		struct hlist_node	hentry;	/* L: while busy */ +	}; + +	struct work_struct	*current_work;	/* L: work being processed */ +	work_func_t		current_func;	/* L: current_work's fn */ +	struct pool_workqueue	*current_pwq; /* L: current_work's pwq */ +	struct list_head	scheduled;	/* L: scheduled works */ +	struct task_struct	*task;		/* I: worker task */ +	struct worker_pool	*pool;		/* I: the associated pool */ +	/* 64 bytes boundary on 64bit, 32 on 32bit */ +	unsigned long		last_active;	/* L: last active timestamp */ +	unsigned int		flags;		/* X: flags */ +	int			id;		/* I: worker id */ + +	/* for rebinding worker to CPU */ +	struct work_struct	rebind_work;	/* L: for busy worker */ + +	/* used only by rescuers to point to the target workqueue */ +	struct workqueue_struct	*rescue_wq;	/* I: the workqueue to rescue */ +}; + +/** + * current_wq_worker - return struct worker if %current is a workqueue worker + */ +static inline struct worker *current_wq_worker(void) +{ +	if (current->flags & PF_WQ_WORKER) +		return kthread_data(current); +	return NULL; +} + +/* + * Scheduler hooks for concurrency managed workqueue.  Only to be used from + * sched.c and workqueue.c. + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, +				       unsigned int cpu); + +#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h deleted file mode 100644 index 2d10fc98dc7..00000000000 --- a/kernel/workqueue_sched.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * kernel/workqueue_sched.h - * - * Scheduler hooks for concurrency managed workqueue.  Only to be - * included from sched.c and workqueue.c. - */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, -				       unsigned int cpu);
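
The new kernel/workqueue_internal.h above is the piece that lets core-kernel code peek at a worker's state without pulling in all of workqueue.c. Below is a minimal sketch of how such a caller might use it; is_running_work_item() is a hypothetical helper invented purely for illustration, while current_wq_worker(), struct worker and work_func_t are the names declared by the header as added in this patch.

#include <linux/sched.h>
#include <linux/workqueue.h>
#include "workqueue_internal.h"

/*
 * Hypothetical example, not part of the patch: return true if %current is
 * a workqueue worker currently executing a work item whose function is
 * @fn.  current_wq_worker() yields NULL for tasks without PF_WQ_WORKER,
 * so ordinary kernel threads and user tasks fall straight through to
 * "false".
 */
static bool is_running_work_item(work_func_t fn)
{
	struct worker *worker = current_wq_worker();

	return worker && worker->current_func == fn;
}

The wq_worker_waking_up()/wq_worker_sleeping() prototypes at the bottom of the header are the same scheduler hooks that the deleted kernel/workqueue_sched.h used to declare; only their home changed.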