Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/events/core.c          |  64
-rw-r--r-- | kernel/events/hw_breakpoint.c |  11
-rw-r--r-- | kernel/sched/core.c           |  73
-rw-r--r-- | kernel/sched/fair.c           |  37
-rw-r--r-- | kernel/sched/rt.c             |   1
-rw-r--r-- | kernel/sched/sched.h          |   1
-rw-r--r-- | kernel/time/tick-sched.c      |   1
-rw-r--r-- | kernel/workqueue.c            | 110
8 files changed, 172 insertions, 126 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index b7935fcec7d..7fee567153f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1253,7 +1253,7 @@ retry:  /*   * Cross CPU call to disable a performance event   */ -static int __perf_event_disable(void *info) +int __perf_event_disable(void *info)  {  	struct perf_event *event = info;  	struct perf_event_context *ctx = event->ctx; @@ -2935,12 +2935,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);  /*   * Called when the last reference to the file is gone.   */ -static int perf_release(struct inode *inode, struct file *file) +static void put_event(struct perf_event *event)  { -	struct perf_event *event = file->private_data;  	struct task_struct *owner; -	file->private_data = NULL; +	if (!atomic_long_dec_and_test(&event->refcount)) +		return;  	rcu_read_lock();  	owner = ACCESS_ONCE(event->owner); @@ -2975,7 +2975,13 @@ static int perf_release(struct inode *inode, struct file *file)  		put_task_struct(owner);  	} -	return perf_event_release_kernel(event); +	perf_event_release_kernel(event); +} + +static int perf_release(struct inode *inode, struct file *file) +{ +	put_event(file->private_data); +	return 0;  }  u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) @@ -3227,7 +3233,7 @@ unlock:  static const struct file_operations perf_fops; -static struct perf_event *perf_fget_light(int fd, int *fput_needed) +static struct file *perf_fget_light(int fd, int *fput_needed)  {  	struct file *file; @@ -3241,7 +3247,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed)  		return ERR_PTR(-EBADF);  	} -	return file->private_data; +	return file;  }  static int perf_event_set_output(struct perf_event *event, @@ -3273,19 +3279,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)  	case PERF_EVENT_IOC_SET_OUTPUT:  	{ +		struct file *output_file = NULL;  		struct perf_event *output_event = NULL;  		int fput_needed = 0;  		int ret;  		if (arg != -1) { -			output_event = perf_fget_light(arg, &fput_needed); -			if (IS_ERR(output_event)) -				return PTR_ERR(output_event); +			output_file = perf_fget_light(arg, &fput_needed); +			if (IS_ERR(output_file)) +				return PTR_ERR(output_file); +			output_event = output_file->private_data;  		}  		ret = perf_event_set_output(event, output_event);  		if (output_event) -			fput_light(output_event->filp, fput_needed); +			fput_light(output_file, fput_needed);  		return ret;  	} @@ -5950,6 +5958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  	mutex_init(&event->mmap_mutex); +	atomic_long_set(&event->refcount, 1);  	event->cpu		= cpu;  	event->attr		= *attr;  	event->group_leader	= group_leader; @@ -6260,12 +6269,12 @@ SYSCALL_DEFINE5(perf_event_open,  		return event_fd;  	if (group_fd != -1) { -		group_leader = perf_fget_light(group_fd, &fput_needed); -		if (IS_ERR(group_leader)) { -			err = PTR_ERR(group_leader); +		group_file = perf_fget_light(group_fd, &fput_needed); +		if (IS_ERR(group_file)) { +			err = PTR_ERR(group_file);  			goto err_fd;  		} -		group_file = group_leader->filp; +		group_leader = group_file->private_data;  		if (flags & PERF_FLAG_FD_OUTPUT)  			output_event = group_leader;  		if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6402,7 +6411,6 @@ SYSCALL_DEFINE5(perf_event_open,  		put_ctx(gctx);  	} -	event->filp = event_file;  	WARN_ON_ONCE(ctx->parent_ctx);  	mutex_lock(&ctx->mutex); @@ -6496,7 +6504,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,  		goto err_free;  	
} -	event->filp = NULL;  	WARN_ON_ONCE(ctx->parent_ctx);  	mutex_lock(&ctx->mutex);  	perf_install_in_context(ctx, event, cpu); @@ -6578,7 +6585,7 @@ static void sync_child_event(struct perf_event *child_event,  	 * Release the parent event, if this was the last  	 * reference to it.  	 */ -	fput(parent_event->filp); +	put_event(parent_event);  }  static void @@ -6654,9 +6661,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)  	 *  	 *   __perf_event_exit_task()  	 *     sync_child_event() -	 *       fput(parent_event->filp) -	 *         perf_release() -	 *           mutex_lock(&ctx->mutex) +	 *       put_event() +	 *         mutex_lock(&ctx->mutex)  	 *  	 * But since its the parent context it won't be the same instance.  	 */ @@ -6724,7 +6730,7 @@ static void perf_free_event(struct perf_event *event,  	list_del_init(&event->child_list);  	mutex_unlock(&parent->child_mutex); -	fput(parent->filp); +	put_event(parent);  	perf_group_detach(event);  	list_del_event(event, ctx); @@ -6804,6 +6810,12 @@ inherit_event(struct perf_event *parent_event,  				           NULL, NULL);  	if (IS_ERR(child_event))  		return child_event; + +	if (!atomic_long_inc_not_zero(&parent_event->refcount)) { +		free_event(child_event); +		return NULL; +	} +  	get_ctx(child_ctx);  	/* @@ -6845,14 +6857,6 @@ inherit_event(struct perf_event *parent_event,  	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);  	/* -	 * Get a reference to the parent filp - we will fput it -	 * when the child event exits. This is safe to do because -	 * we are in the parent and we know that the filp still -	 * exists and has a nonzero count: -	 */ -	atomic_long_inc(&parent_event->filp->f_count); - -	/*  	 * Link this into the parent event's child list  	 */  	WARN_ON_ONCE(parent_event->ctx->parent_ctx); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bb38c4d3ee1..9a7b487c6fe 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att  	int old_type = bp->attr.bp_type;  	int err = 0; -	perf_event_disable(bp); +	/* +	 * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it +	 * will not be possible to raise IPIs that invoke __perf_event_disable. +	 * So call the function directly after making sure we are targeting the +	 * current task. +	 */ +	if (irqs_disabled() && bp->ctx && bp->ctx->task == current) +		__perf_event_disable(bp); +	else +		perf_event_disable(bp);  	bp->attr.bp_addr = attr->bp_addr;  	bp->attr.bp_type = attr->bp_type; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbf1fd098dc..649c9f876cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5304,27 +5304,17 @@ void idle_task_exit(void)  }  /* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: - */ -static void migrate_nr_uninterruptible(struct rq *rq_src) -{ -	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - -	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; -	rq_src->nr_uninterruptible = 0; -} - -/* - * remove the tasks which were accounted by rq from calc_load_tasks. 
+ * Since this CPU is going 'away' for a while, fold any nr_active delta + * we might have. Assumes we're called after migrate_tasks() so that the + * nr_active count is stable. + * + * Also see the comment "Global load-average calculations".   */ -static void calc_global_load_remove(struct rq *rq) +static void calc_load_migrate(struct rq *rq)  { -	atomic_long_sub(rq->calc_load_active, &calc_load_tasks); -	rq->calc_load_active = 0; +	long delta = calc_load_fold_active(rq); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks);  }  /* @@ -5352,9 +5342,6 @@ static void migrate_tasks(unsigned int dead_cpu)  	 */  	rq->stop = NULL; -	/* Ensure any throttled groups are reachable by pick_next_task */ -	unthrottle_offline_cfs_rqs(rq); -  	for ( ; ; ) {  		/*  		 * There's this thread running, bail when that's the only @@ -5618,8 +5605,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  		BUG_ON(rq->nr_running != 1); /* the migration thread */  		raw_spin_unlock_irqrestore(&rq->lock, flags); -		migrate_nr_uninterruptible(rq); -		calc_global_load_remove(rq); +		calc_load_migrate(rq);  		break;  #endif  	} @@ -6028,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)   * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this   * allows us to avoid some pointer chasing select_idle_sibling().   * - * Iterate domains and sched_groups downward, assigning CPUs to be - * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing - * due to random perturbation self canceling, ie sw buddies pull - * their counterpart to their CPU's hw counterpart. - *   * Also keep a unique ID per domain (we use the first cpu number in   * the cpumask of the domain), this allows us to quickly tell if   * two cpus are in the same cache domain, see cpus_share_cache(). @@ -6046,40 +6027,8 @@ static void update_top_cache_domain(int cpu)  	int id = cpu;  	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); -	if (sd) { -		struct sched_domain *tmp = sd; -		struct sched_group *sg, *prev; -		bool right; - -		/* -		 * Traverse to first CPU in group, and count hops -		 * to cpu from there, switching direction on each -		 * hop, never ever pointing the last CPU rightward. -		 */ -		do { -			id = cpumask_first(sched_domain_span(tmp)); -			prev = sg = tmp->groups; -			right = 1; - -			while (cpumask_first(sched_group_cpus(sg)) != id) -				sg = sg->next; - -			while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { -				prev = sg; -				sg = sg->next; -				right = !right; -			} - -			/* A CPU went down, never point back to domain start. */ -			if (right && cpumask_first(sched_group_cpus(sg->next)) == id) -				right = false; - -			sg = right ? 
sg->next : prev; -			tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); -		} while ((tmp = tmp->child)); - +	if (sd)  		id = cpumask_first(sched_domain_span(sd)); -	}  	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);  	per_cpu(sd_llc_id, cpu) = id; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c219bf8d704..96e2b18b628 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)  	hrtimer_cancel(&cfs_b->slack_timer);  } -void unthrottle_offline_cfs_rqs(struct rq *rq) +static void unthrottle_offline_cfs_rqs(struct rq *rq)  {  	struct cfs_rq *cfs_rq; @@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)  	return NULL;  }  static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} -void unthrottle_offline_cfs_rqs(struct rq *rq) {} +static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}  #endif /* CONFIG_CFS_BANDWIDTH */ @@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target)  	int cpu = smp_processor_id();  	int prev_cpu = task_cpu(p);  	struct sched_domain *sd; +	struct sched_group *sg; +	int i;  	/*  	 * If the task is going to be woken-up on this cpu and if it is @@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target)  		return prev_cpu;  	/* -	 * Otherwise, check assigned siblings to find an elegible idle cpu. +	 * Otherwise, iterate the domains and find an elegible idle cpu.  	 */  	sd = rcu_dereference(per_cpu(sd_llc, target)); -  	for_each_lower_domain(sd) { -		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) -			continue; -		if (idle_cpu(sd->idle_buddy)) -			return sd->idle_buddy; -	} +		sg = sd->groups; +		do { +			if (!cpumask_intersects(sched_group_cpus(sg), +						tsk_cpus_allowed(p))) +				goto next; +			for_each_cpu(i, sched_group_cpus(sg)) { +				if (!idle_cpu(i)) +					goto next; +			} + +			target = cpumask_first_and(sched_group_cpus(sg), +					tsk_cpus_allowed(p)); +			goto done; +next: +			sg = sg->next; +		} while (sg != sd->groups); +	} +done:  	return target;  } @@ -3658,7 +3672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)   * @group: sched_group whose statistics are to be updated.   * @load_idx: Load index of sched_domain of this_cpu for load calc.   * @local_group: Does group contain this_cpu. - * @cpus: Set of cpus considered for load balancing.   * @balance: Should we balance.   * @sgs: variable to hold the statistics for this group.   */ @@ -3805,7 +3818,6 @@ static bool update_sd_pick_busiest(struct lb_env *env,  /**   * update_sd_lb_stats - Update sched_domain's statistics for load balancing.   * @env: The load balancing environment. - * @cpus: Set of cpus considered for load balancing.   * @balance: Should we balance.   * @sds: variable to hold the statistics for this sched_domain.   */ @@ -4956,6 +4968,9 @@ static void rq_online_fair(struct rq *rq)  static void rq_offline_fair(struct rq *rq)  {  	update_sysctl(); + +	/* Ensure any throttled groups are reachable by pick_next_task */ +	unthrottle_offline_cfs_rqs(rq);  }  #endif /* CONFIG_SMP */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 944cb68420e..e0b7ba9c040 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -691,6 +691,7 @@ balanced:  		 * runtime - in which case borrowing doesn't make sense.  		 
*/  		rt_rq->rt_runtime = RUNTIME_INF; +		rt_rq->rt_throttled = 0;  		raw_spin_unlock(&rt_rq->rt_runtime_lock);  		raw_spin_unlock(&rt_b->rt_runtime_lock);  	} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f6714d009e7..0848fa36c38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1144,7 +1144,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);  extern void init_cfs_rq(struct cfs_rq *cfs_rq);  extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void unthrottle_offline_cfs_rqs(struct rq *rq);  extern void account_cfs_bandwidth_used(int enabled, int was_enabled); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 024540f97f7..3a9e5d5c109 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)  	tick_do_update_jiffies64(now);  	update_cpu_load_nohz(); +	calc_load_exit_idle();  	touch_softlockup_watchdog();  	/*  	 * Cancel the scheduled timer and restore the tick diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 692d97628a1..1e1373bcb3e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -66,6 +66,7 @@ enum {  	/* pool flags */  	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */ +	POOL_MANAGING_WORKERS   = 1 << 1,       /* managing workers */  	/* worker flags */  	WORKER_STARTED		= 1 << 0,	/* started */ @@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)  /* Do we have too many workers and should some go away? */  static bool too_many_workers(struct worker_pool *pool)  { -	bool managing = mutex_is_locked(&pool->manager_mutex); +	bool managing = pool->flags & POOL_MANAGING_WORKERS;  	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */  	int nr_busy = pool->nr_workers - nr_idle; @@ -1326,6 +1327,15 @@ static void idle_worker_rebind(struct worker *worker)  	/* we did our part, wait for rebind_workers() to finish up */  	wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); + +	/* +	 * rebind_workers() shouldn't finish until all workers passed the +	 * above WORKER_REBIND wait.  Tell it when done. +	 */ +	spin_lock_irq(&worker->pool->gcwq->lock); +	if (!--worker->idle_rebind->cnt) +		complete(&worker->idle_rebind->done); +	spin_unlock_irq(&worker->pool->gcwq->lock);  }  /* @@ -1396,12 +1406,15 @@ retry:  	/* set REBIND and kick idle ones, we'll wait for these later */  	for_each_worker_pool(pool, gcwq) {  		list_for_each_entry(worker, &pool->idle_list, entry) { +			unsigned long worker_flags = worker->flags; +  			if (worker->flags & WORKER_REBIND)  				continue; -			/* morph UNBOUND to REBIND */ -			worker->flags &= ~WORKER_UNBOUND; -			worker->flags |= WORKER_REBIND; +			/* morph UNBOUND to REBIND atomically */ +			worker_flags &= ~WORKER_UNBOUND; +			worker_flags |= WORKER_REBIND; +			ACCESS_ONCE(worker->flags) = worker_flags;  			idle_rebind.cnt++;  			worker->idle_rebind = &idle_rebind; @@ -1419,25 +1432,15 @@ retry:  		goto retry;  	} -	/* -	 * All idle workers are rebound and waiting for %WORKER_REBIND to -	 * be cleared inside idle_worker_rebind().  Clear and release. -	 * Clearing %WORKER_REBIND from this foreign context is safe -	 * because these workers are still guaranteed to be idle. 
-	 */ -	for_each_worker_pool(pool, gcwq) -		list_for_each_entry(worker, &pool->idle_list, entry) -			worker->flags &= ~WORKER_REBIND; - -	wake_up_all(&gcwq->rebind_hold); - -	/* rebind busy workers */ +	/* all idle workers are rebound, rebind busy workers */  	for_each_busy_worker(worker, i, pos, gcwq) {  		struct work_struct *rebind_work = &worker->rebind_work; +		unsigned long worker_flags = worker->flags; -		/* morph UNBOUND to REBIND */ -		worker->flags &= ~WORKER_UNBOUND; -		worker->flags |= WORKER_REBIND; +		/* morph UNBOUND to REBIND atomically */ +		worker_flags &= ~WORKER_UNBOUND; +		worker_flags |= WORKER_REBIND; +		ACCESS_ONCE(worker->flags) = worker_flags;  		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,  				     work_data_bits(rebind_work))) @@ -1449,6 +1452,34 @@ retry:  			    worker->scheduled.next,  			    work_color_to_flags(WORK_NO_COLOR));  	} + +	/* +	 * All idle workers are rebound and waiting for %WORKER_REBIND to +	 * be cleared inside idle_worker_rebind().  Clear and release. +	 * Clearing %WORKER_REBIND from this foreign context is safe +	 * because these workers are still guaranteed to be idle. +	 * +	 * We need to make sure all idle workers passed WORKER_REBIND wait +	 * in idle_worker_rebind() before returning; otherwise, workers can +	 * get stuck at the wait if hotplug cycle repeats. +	 */ +	idle_rebind.cnt = 1; +	INIT_COMPLETION(idle_rebind.done); + +	for_each_worker_pool(pool, gcwq) { +		list_for_each_entry(worker, &pool->idle_list, entry) { +			worker->flags &= ~WORKER_REBIND; +			idle_rebind.cnt++; +		} +	} + +	wake_up_all(&gcwq->rebind_hold); + +	if (--idle_rebind.cnt) { +		spin_unlock_irq(&gcwq->lock); +		wait_for_completion(&idle_rebind.done); +		spin_lock_irq(&gcwq->lock); +	}  }  static struct worker *alloc_worker(void) @@ -1794,9 +1825,45 @@ static bool manage_workers(struct worker *worker)  	struct worker_pool *pool = worker->pool;  	bool ret = false; -	if (!mutex_trylock(&pool->manager_mutex)) +	if (pool->flags & POOL_MANAGING_WORKERS)  		return ret; +	pool->flags |= POOL_MANAGING_WORKERS; + +	/* +	 * To simplify both worker management and CPU hotplug, hold off +	 * management while hotplug is in progress.  CPU hotplug path can't +	 * grab %POOL_MANAGING_WORKERS to achieve this because that can +	 * lead to idle worker depletion (all become busy thinking someone +	 * else is managing) which in turn can result in deadlock under +	 * extreme circumstances.  Use @pool->manager_mutex to synchronize +	 * manager against CPU hotplug. +	 * +	 * manager_mutex would always be free unless CPU hotplug is in +	 * progress.  trylock first without dropping @gcwq->lock. +	 */ +	if (unlikely(!mutex_trylock(&pool->manager_mutex))) { +		spin_unlock_irq(&pool->gcwq->lock); +		mutex_lock(&pool->manager_mutex); +		/* +		 * CPU hotplug could have happened while we were waiting +		 * for manager_mutex.  Hotplug itself can't handle us +		 * because manager isn't either on idle or busy list, and +		 * @gcwq's state and ours could have deviated. +		 * +		 * As hotplug is now excluded via manager_mutex, we can +		 * simply try to bind.  It will succeed or fail depending +		 * on @gcwq's current state.  Try it and adjust +		 * %WORKER_UNBOUND accordingly. 
+		 */ +		if (worker_maybe_bind_and_lock(worker)) +			worker->flags &= ~WORKER_UNBOUND; +		else +			worker->flags |= WORKER_UNBOUND; + +		ret = true; +	} +  	pool->flags &= ~POOL_MANAGE_WORKERS;  	/* @@ -1806,6 +1873,7 @@ static bool manage_workers(struct worker *worker)  	ret |= maybe_destroy_workers(pool);  	ret |= maybe_create_worker(pool); +	pool->flags &= ~POOL_MANAGING_WORKERS;  	mutex_unlock(&pool->manager_mutex);  	return ret;  }  |
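
The kernel/events/core.c hunks above replace the old trick of pinning a parent event through its file's f_count with a dedicated refcount on the event itself: perf_event_alloc() starts the count at 1, put_event() tears the event down on the final drop, and inherit_event() only takes a reference while the count is still non-zero. Below is a minimal user-space sketch of that get-unless-zero / put-and-free lifetime pattern using C11 atomics; struct event, event_get_unless_zero() and event_put() are illustrative names, not the kernel's API.

```c
/* Sketch of the get-unless-zero / put-and-free refcount pattern the
 * patch introduces for struct perf_event, using C11 atomics in place
 * of the kernel's atomic_long_t.  Names are illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct event {
	atomic_long refcount;
	/* ... event state ... */
};

static struct event *event_alloc(void)
{
	struct event *ev = calloc(1, sizeof(*ev));

	if (ev)
		atomic_init(&ev->refcount, 1);	/* creator holds one reference */
	return ev;
}

/* Mirrors atomic_long_inc_not_zero(): only succeeds while the object
 * is still alive; a zero count means teardown has already started. */
static bool event_get_unless_zero(struct event *ev)
{
	long old = atomic_load(&ev->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&ev->refcount, &old, old + 1))
			return true;
	}
	return false;
}

/* Mirrors put_event(): the last reference frees the object. */
static void event_put(struct event *ev)
{
	if (atomic_fetch_sub(&ev->refcount, 1) == 1) {
		/* release-style teardown would go here */
		free(ev);
	}
}

int main(void)
{
	struct event *ev = event_alloc();

	if (event_get_unless_zero(ev))	/* e.g. a child inheriting the parent */
		event_put(ev);		/* child exit drops its reference */
	event_put(ev);			/* final put frees the event */
	return 0;
}
```

The point of the conditional get is visible in the inherit_event() hunk: a child being cloned off an exiting parent cannot resurrect an event whose last reference is already gone.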
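
In kernel/sched/core.c, calc_load_migrate() replaces migrate_nr_uninterruptible() and calc_global_load_remove(): rather than zeroing the dying runqueue's counters, it folds the runqueue's outstanding nr_active delta into the global calc_load_tasks sum. The sketch below shows that fold in isolation with simplified stand-in types; struct rq and its fields here are toy versions, not the kernel's.

```c
/* Sketch of folding a per-CPU "active tasks" delta into a global
 * counter, mirroring calc_load_fold_active()/calc_load_migrate().
 * Types and fields are simplified stand-ins, not the kernel's.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long calc_load_tasks;	/* global load-average input */

struct rq {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;		/* contribution last reported globally */
};

/* How much did this runqueue's contribution change since the last
 * fold?  Remember the new baseline and return the delta. */
static long calc_load_fold_active(struct rq *rq)
{
	long nr_active = rq->nr_running + rq->nr_uninterruptible;
	long delta = nr_active - rq->calc_load_active;

	rq->calc_load_active = nr_active;
	return delta;
}

/* On CPU offline: publish any outstanding delta so the global sum
 * stays consistent after the runqueue goes away. */
static void calc_load_migrate(struct rq *rq)
{
	long delta = calc_load_fold_active(rq);

	if (delta)
		atomic_fetch_add(&calc_load_tasks, delta);
}

int main(void)
{
	struct rq rq = { .nr_running = 1, .nr_uninterruptible = 2 };

	calc_load_migrate(&rq);
	printf("calc_load_tasks = %ld\n", atomic_load(&calc_load_tasks));
	return 0;
}
```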
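
The rewritten select_idle_sibling() in kernel/sched/fair.c drops the per-domain idle_buddy and instead walks the circular sched_group list of every domain below the LLC domain, looking for a group that intersects the task's allowed CPUs and whose CPUs are all idle. The sketch below models that walk with plain 64-bit masks; struct sched_domain/sched_group and the masks are simplified stand-ins for the kernel's cpumask machinery.

```c
/* Sketch of the new select_idle_sibling() group walk: find a
 * sched_group whose CPUs are all idle and intersect the task's
 * allowed mask, then pick the first such CPU.  All types are toy
 * stand-ins; the real kernel structures are considerably richer.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sched_group {
	uint64_t cpus;			/* bitmask of CPUs in this group */
	struct sched_group *next;	/* circular list */
};

struct sched_domain {
	struct sched_group *groups;
	struct sched_domain *child;
};

static bool idle_cpu(int cpu, uint64_t idle_mask)
{
	return idle_mask & (1ULL << cpu);
}

static int select_idle_sibling(struct sched_domain *llc, uint64_t allowed,
			       uint64_t idle_mask, int target)
{
	struct sched_domain *sd;
	struct sched_group *sg;
	int i;

	for (sd = llc; sd; sd = sd->child) {
		sg = sd->groups;
		do {
			if (!(sg->cpus & allowed))
				goto next;		/* task may not run here */

			for (i = 0; i < 64; i++) {
				if ((sg->cpus & (1ULL << i)) &&
				    !idle_cpu(i, idle_mask))
					goto next;	/* group not fully idle */
			}

			/* first allowed CPU in the fully idle group */
			for (i = 0; i < 64; i++) {
				if (sg->cpus & allowed & (1ULL << i))
					return i;
			}
next:
			sg = sg->next;
		} while (sg != sd->groups);
	}
	return target;	/* nothing better found, keep the original target */
}

int main(void)
{
	/* Two sibling groups under one domain: CPUs {0,1} and {2,3}. */
	struct sched_group g0 = { .cpus = 0x3 }, g1 = { .cpus = 0xc };
	struct sched_domain llc = { .groups = &g0 };

	g0.next = &g1;
	g1.next = &g0;

	/* CPUs 2 and 3 are idle, the task may run anywhere, target is 0. */
	printf("picked CPU %d\n", select_idle_sibling(&llc, ~0ULL, 0xc, 0));
	return 0;
}
```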
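
In kernel/workqueue.c, rebind_workers() now morphs WORKER_UNBOUND into WORKER_REBIND by editing a local copy of worker->flags and publishing it with a single ACCESS_ONCE() store, so nothing watching the flags can observe a transient word with neither bit set. Below is a hedged sketch of the same publish-in-one-store idiom, with C11 atomics standing in for ACCESS_ONCE() and illustrative bit values.

```c
/* Sketch of the "morph UNBOUND to REBIND atomically" idiom: compute
 * the new flag word on a private copy, then publish it with one
 * store, so a lockless reader never sees an intermediate state with
 * both bits clear.  C11 atomics stand in for the kernel's
 * ACCESS_ONCE() store; the bit values are illustrative.
 */
#include <stdatomic.h>
#include <stdio.h>

#define WORKER_REBIND	(1U << 5)
#define WORKER_UNBOUND	(1U << 7)

struct worker {
	atomic_uint flags;
};

static void morph_unbound_to_rebind(struct worker *worker)
{
	unsigned int worker_flags =
		atomic_load_explicit(&worker->flags, memory_order_relaxed);

	/* modify a private copy ... */
	worker_flags &= ~WORKER_UNBOUND;
	worker_flags |= WORKER_REBIND;

	/* ... and publish it with a single store */
	atomic_store_explicit(&worker->flags, worker_flags,
			      memory_order_relaxed);
}

int main(void)
{
	struct worker w;

	atomic_init(&w.flags, WORKER_UNBOUND);
	morph_unbound_to_rebind(&w);
	printf("flags = %#x\n", atomic_load(&w.flags));
	return 0;
}
```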
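
The other rebind_workers() change makes the manager wait until every idle worker has actually passed its WORKER_REBIND wait in idle_worker_rebind(): the counter is re-armed, the workers are woken, and each one drops the counter as it gets past the wait, the last one completing idle_rebind.done. Below is a pthread sketch of that counted rendezvous; a condition variable plays the role of the kernel completion, and all names are illustrative.

```c
/* Sketch of the counted rendezvous rebind_workers() uses: the manager
 * arms a counter, wakes all idle workers, and each worker decrements
 * the counter as it passes the barrier; the last one signals the
 * manager.  A pthread condition variable stands in for the kernel
 * completion.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_WORKERS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int rebind_cnt;

static void *idle_worker_rebind(void *arg)
{
	/* ... the worker would re-bind itself to its CPU here ... */

	pthread_mutex_lock(&lock);
	if (--rebind_cnt == 0)		/* last worker past the barrier */
		pthread_cond_signal(&done);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t workers[NR_WORKERS];
	int i;

	/* manager: arm the counter before waking anyone */
	rebind_cnt = NR_WORKERS;

	for (i = 0; i < NR_WORKERS; i++)
		pthread_create(&workers[i], NULL, idle_worker_rebind, NULL);

	/* wait until every worker has passed the rebind barrier */
	pthread_mutex_lock(&lock);
	while (rebind_cnt)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);

	printf("all idle workers rebound\n");

	for (i = 0; i < NR_WORKERS; i++)
		pthread_join(workers[i], NULL);
	return 0;
}
```

Without this rendezvous, a repeated hotplug cycle could re-set WORKER_REBIND before a slow idle worker ever saw it cleared, leaving that worker stuck in its wait, which is the scenario the comment in the hunk describes.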
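
Finally, manage_workers() separates "a manager is running" (the new POOL_MANAGING_WORKERS flag, checked under the gcwq lock) from "exclude CPU hotplug" (pool->manager_mutex): the mutex is tried first without dropping the spinlock, and only when hotplug actually holds it does the manager drop the spinlock, sleep on the mutex and re-bind itself. The sketch below shows that trylock-then-sleep dance with pthread primitives; the pool layout, lock types and the omitted re-bind step are simplifications, not the kernel's code.

```c
/* Sketch of the manage_workers() locking dance: the "a manager is
 * running" flag lives under the spinlock, while the hotplug-exclusion
 * mutex is taken with a trylock fast path, dropping the spinlock only
 * when the mutex is actually contended.  pthread locks stand in for
 * gcwq->lock and pool->manager_mutex.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define POOL_MANAGING_WORKERS	(1U << 1)

struct pool {
	pthread_spinlock_t gcwq_lock;	/* stands in for gcwq->lock */
	pthread_mutex_t manager_mutex;	/* excludes CPU hotplug */
	unsigned int flags;
};

/* Called with pool->gcwq_lock held; returns with it still held. */
static bool manage_workers(struct pool *pool)
{
	bool ret = false;

	if (pool->flags & POOL_MANAGING_WORKERS)
		return false;		/* someone else is already managing */
	pool->flags |= POOL_MANAGING_WORKERS;

	if (pthread_mutex_trylock(&pool->manager_mutex) != 0) {
		/* hotplug holds the mutex: sleep for it without the spinlock */
		pthread_spin_unlock(&pool->gcwq_lock);
		pthread_mutex_lock(&pool->manager_mutex);
		pthread_spin_lock(&pool->gcwq_lock);
		/* the kernel re-binds the worker here before going on */
		ret = true;
	}

	/* maybe_destroy_workers()/maybe_create_worker() would run here */

	pool->flags &= ~POOL_MANAGING_WORKERS;
	pthread_mutex_unlock(&pool->manager_mutex);
	return ret;
}

int main(void)
{
	struct pool pool = { .flags = 0 };

	pthread_spin_init(&pool.gcwq_lock, PTHREAD_PROCESS_PRIVATE);
	pthread_mutex_init(&pool.manager_mutex, NULL);

	pthread_spin_lock(&pool.gcwq_lock);
	printf("manage_workers contended: %d\n", manage_workers(&pool));
	pthread_spin_unlock(&pool.gcwq_lock);
	return 0;
}
```

Keeping the flag and the mutex separate is what lets too_many_workers() stop peeking at mutex_is_locked(): the flag says whether a manager exists, while the mutex only serializes the manager against hotplug.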