Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c      | 525
-rw-r--r--  kernel/sched/fair.c      |  71
-rw-r--r--  kernel/sched/idle_task.c |   1
-rw-r--r--  kernel/sched/rt.c        |  53
-rw-r--r--  kernel/sched/sched.h     |   4
5 files changed, 479 insertions(+), 175 deletions(-)
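The largest part of the core.c diff below is a new comment block documenting the global load average as a fixed-point, exponentially decaying average of nr_running + nr_uninterruptible, refreshed once per LOAD_FREQ window as avenrun[n] = avenrun[n] * exp_n + nr_active * (1 - exp_n). As a rough userspace sketch of that arithmetic — the FSHIFT/FIXED_1/EXP_* constants are meant to mirror the kernel's definitions but are reproduced here from memory, and the fixed four-task workload is invented:

/*
 * Illustrative userspace sketch of the fixed-point averaging documented in
 * the comment block added to kernel/sched/core.c below:
 *	a1 = a0 * e + a * (1 - e)
 * Constants are assumed to match the kernel's; the workload is made up.
 */
#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1UL << FSHIFT)		/* 1.0 in fixed-point */
#define EXP_1	1884UL			/* 1/exp(5s/1min) in fixed-point */
#define EXP_5	2014UL			/* 1/exp(5s/5min) */
#define EXP_15	2037UL			/* 1/exp(5s/15min) */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	unsigned long nr_active = 4 * FIXED_1;	/* pretend 4 tasks stay runnable */
	int sample;

	/* one iteration per LOAD_FREQ (~5s) sample window */
	for (sample = 1; sample <= 12; sample++) {
		avenrun[0] = calc_load(avenrun[0], EXP_1, nr_active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, nr_active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, nr_active);
	}

	printf("loadavg after ~1 minute: %lu.%02lu %lu.%02lu %lu.%02lu\n",
	       avenrun[0] >> FSHIFT, ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[1] >> FSHIFT, ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[2] >> FSHIFT, ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}

After twelve 5-second samples the 1-minute figure reaches roughly 2.53 of the steady-state 4.00 — the expected 1 - 1/e ≈ 63% convergence — while the 5- and 15-minute figures lag behind.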
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39eb6011bc3..468bdd44c1b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =  #define SCHED_FEAT(name, enabled)	\  	#name , -static __read_mostly char *sched_feat_names[] = { +static const char * const sched_feat_names[] = {  #include "features.h" -	NULL  };  #undef SCHED_FEAT @@ -2082,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,  #endif  	/* Here we just switch the register state and the stack. */ -	rcu_switch_from(prev);  	switch_to(prev, next, prev);  	barrier(); @@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void)  } +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + *   nr_active = 0; + *   for_each_possible_cpu(cpu) + *   	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + *  - for_each_possible_cpu() is prohibitively expensive on machines with + *    serious number of cpus, therefore we need to take a distributed approach + *    to calculating nr_active. + * + *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + *    So assuming nr_active := 0 when we start out -- true per definition, we + *    can simply take per-cpu deltas and fold those into a global accumulate + *    to obtain the same result. See calc_load_fold_active(). + * + *    Furthermore, in order to avoid synchronizing all per-cpu delta folding + *    across the machine, we assume 10 ticks is sufficient time for every + *    cpu to have completed this task. + * + *    This places an upper-bound on the IRQ-off latency of the machine. Then + *    again, being late doesn't loose the delta, just wrecks the sample. + * + *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + *    this would add another cross-cpu cacheline miss and atomic operation + *    to the wakeup path. Instead we increment on whatever cpu the task ran + *    when it went into uninterruptible state and decrement on whatever cpu + *    did the wakeup. This means that only the sum of nr_uninterruptible over + *    all cpus yields the correct result. + * + *  This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ +  /* Variables and functions for calc_load */  static atomic_long_t calc_load_tasks;  static unsigned long calc_load_update;  unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads:	pointer to dest load array + * @offset:	offset to add + * @shift:	shift count to shift the result left + * + * These values are estimates at best, so no need for locking. 
+ */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ +	loads[0] = (avenrun[0] + offset) << shift; +	loads[1] = (avenrun[1] + offset) << shift; +	loads[2] = (avenrun[2] + offset) << shift; +}  static long calc_load_fold_active(struct rq *this_rq)  { @@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)  	return delta;  } +/* + * a1 = a0 * e + a * (1 - e) + */  static unsigned long  calc_load(unsigned long load, unsigned long exp, unsigned long active)  { @@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)  #ifdef CONFIG_NO_HZ  /* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + *  - When we go NO_HZ idle during the window, we can negate our sample + *    contribution, causing under-accounting. + * + *    We avoid this by keeping two idle-delta counters and flipping them + *    when the window starts, thus separating old and new NO_HZ load. + * + *    The only trick is the slight shift in index flip for read vs write. + * + *        0s            5s            10s           15s + *          +10           +10           +10           +10 + *        |-|-----------|-|-----------|-|-----------|-| + *    r:0 0 1           1 0           0 1           1 0 + *    w:0 1 1           0 0           1 1           0 0 + * + *    This ensures we'll fold the old idle contribution in this window while + *    accumlating the new one. + * + *  - When we wake up from NO_HZ idle during the window, we push up our + *    contribution, since we effectively move our sample point to a known + *    busy state. + * + *    This is solved by pushing the window forward, and thus skipping the + *    sample, for this cpu (effectively using the idle-delta for this cpu which + *    was in effect at the time the window opened). This also solves the issue + *    of having to deal with a cpu having been in NOHZ idle for multiple + *    LOAD_FREQ intervals.   *   * When making the ILB scale, we should try to pull this in as well.   */ -static atomic_long_t calc_load_tasks_idle; +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ +	int idx = calc_load_idx; + +	/* +	 * See calc_global_nohz(), if we observe the new index, we also +	 * need to observe the new update time. +	 */ +	smp_rmb(); -void calc_load_account_idle(struct rq *this_rq) +	/* +	 * If the folding window started, make sure we start writing in the +	 * next idle-delta. +	 */ +	if (!time_before(jiffies, calc_load_update)) +		idx++; + +	return idx & 1; +} + +static inline int calc_load_read_idx(void)  { +	return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ +	struct rq *this_rq = this_rq();  	long delta; +	/* +	 * We're going into NOHZ mode, if there's any pending delta, fold it +	 * into the pending idle delta. 
+	 */  	delta = calc_load_fold_active(this_rq); -	if (delta) -		atomic_long_add(delta, &calc_load_tasks_idle); +	if (delta) { +		int idx = calc_load_write_idx(); +		atomic_long_add(delta, &calc_load_idle[idx]); +	}  } -static long calc_load_fold_idle(void) +void calc_load_exit_idle(void)  { -	long delta = 0; +	struct rq *this_rq = this_rq();  	/* -	 * Its got a race, we don't care... +	 * If we're still before the sample window, we're done.  	 */ -	if (atomic_long_read(&calc_load_tasks_idle)) -		delta = atomic_long_xchg(&calc_load_tasks_idle, 0); +	if (time_before(jiffies, this_rq->calc_load_update)) +		return; + +	/* +	 * We woke inside or after the sample window, this means we're already +	 * accounted through the nohz accounting, so skip the entire deal and +	 * sync up for the next window. +	 */ +	this_rq->calc_load_update = calc_load_update; +	if (time_before(jiffies, this_rq->calc_load_update + 10)) +		this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ +	int idx = calc_load_read_idx(); +	long delta = 0; + +	if (atomic_long_read(&calc_load_idle[idx])) +		delta = atomic_long_xchg(&calc_load_idle[idx], 0);  	return delta;  } @@ -2303,66 +2454,39 @@ static void calc_global_nohz(void)  {  	long delta, active, n; -	/* -	 * If we crossed a calc_load_update boundary, make sure to fold -	 * any pending idle changes, the respective CPUs might have -	 * missed the tick driven calc_load_account_active() update -	 * due to NO_HZ. -	 */ -	delta = calc_load_fold_idle(); -	if (delta) -		atomic_long_add(delta, &calc_load_tasks); - -	/* -	 * It could be the one fold was all it took, we done! -	 */ -	if (time_before(jiffies, calc_load_update + 10)) -		return; - -	/* -	 * Catch-up, fold however many we are behind still -	 */ -	delta = jiffies - calc_load_update - 10; -	n = 1 + (delta / LOAD_FREQ); +	if (!time_before(jiffies, calc_load_update + 10)) { +		/* +		 * Catch-up, fold however many we are behind still +		 */ +		delta = jiffies - calc_load_update - 10; +		n = 1 + (delta / LOAD_FREQ); -	active = atomic_long_read(&calc_load_tasks); -	active = active > 0 ? active * FIXED_1 : 0; +		active = atomic_long_read(&calc_load_tasks); +		active = active > 0 ? active * FIXED_1 : 0; -	avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); -	avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); -	avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); +		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); +		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); +		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); -	calc_load_update += n * LOAD_FREQ; -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ -} +		calc_load_update += n * LOAD_FREQ; +	} -static inline long calc_load_fold_idle(void) -{ -	return 0; +	/* +	 * Flip the idle index... +	 * +	 * Make sure we first write the new time then flip the index, so that +	 * calc_load_write_idx() will see the new time when it reads the new +	 * index, this avoids a double flip messing things up. +	 */ +	smp_wmb(); +	calc_load_idx++;  } +#else /* !CONFIG_NO_HZ */ -static void calc_global_nohz(void) -{ -} -#endif +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } -/** - * get_avenrun - get the load average array - * @loads:	pointer to dest load array - * @offset:	offset to add - * @shift:	shift count to shift the result left - * - * These values are estimates at best, so no need for locking. 
- */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ -	loads[0] = (avenrun[0] + offset) << shift; -	loads[1] = (avenrun[1] + offset) << shift; -	loads[2] = (avenrun[2] + offset) << shift; -} +#endif /* CONFIG_NO_HZ */  /*   * calc_load - update the avenrun load estimates 10 ticks after the @@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)   */  void calc_global_load(unsigned long ticks)  { -	long active; +	long active, delta;  	if (time_before(jiffies, calc_load_update + 10))  		return; +	/* +	 * Fold the 'old' idle-delta to include all NO_HZ cpus. +	 */ +	delta = calc_load_fold_idle(); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks); +  	active = atomic_long_read(&calc_load_tasks);  	active = active > 0 ? active * FIXED_1 : 0; @@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks)  	calc_load_update += LOAD_FREQ;  	/* -	 * Account one period with whatever state we found before -	 * folding in the nohz state and ageing the entire idle period. -	 * -	 * This avoids loosing a sample when we go idle between  -	 * calc_load_account_active() (10 ticks ago) and now and thus -	 * under-accounting. +	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.  	 */  	calc_global_nohz();  } @@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)  		return;  	delta  = calc_load_fold_active(this_rq); -	delta += calc_load_fold_idle();  	if (delta)  		atomic_long_add(delta, &calc_load_tasks); @@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)  }  /* + * End of global load-average stuff + */ + +/*   * The exact cpuload at various idx values, calculated at every tick would be   * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load   * @@ -2517,25 +2646,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,  	sched_avg_update(this_rq);  } +#ifdef CONFIG_NO_HZ +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ +  /*   * Called from nohz_idle_balance() to update the load ratings before doing the   * idle balance.   */  void update_idle_cpu_load(struct rq *this_rq)  { -	unsigned long curr_jiffies = jiffies; +	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);  	unsigned long load = this_rq->load.weight;  	unsigned long pending_updates;  	/* -	 * Bloody broken means of dealing with nohz, but better than nothing.. -	 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy -	 * update and see 0 difference the one time and 2 the next, even though -	 * we ticked at roughtly the same rate. -	 * -	 * Hence we only use this from nohz_idle_balance() and skip this -	 * nonsense when called from the scheduler_tick() since that's -	 * guaranteed a stable rate. +	 * bail if there's load or we're actually up-to-date.  	 
*/  	if (load || curr_jiffies == this_rq->last_load_update_tick)  		return; @@ -2547,12 +2683,38 @@ void update_idle_cpu_load(struct rq *this_rq)  }  /* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ +	struct rq *this_rq = this_rq(); +	unsigned long curr_jiffies = ACCESS_ONCE(jiffies); +	unsigned long pending_updates; + +	if (curr_jiffies == this_rq->last_load_update_tick) +		return; + +	raw_spin_lock(&this_rq->lock); +	pending_updates = curr_jiffies - this_rq->last_load_update_tick; +	if (pending_updates) { +		this_rq->last_load_update_tick = curr_jiffies; +		/* +		 * We were idle, this means load 0, the current load might be +		 * !0 due to remote wakeups and the sort. +		 */ +		__update_cpu_load(this_rq, 0, pending_updates); +	} +	raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/*   * Called from scheduler_tick()   */  static void update_cpu_load_active(struct rq *this_rq)  {  	/* -	 * See the mess in update_idle_cpu_load(). +	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().  	 */  	this_rq->last_load_update_tick = jiffies;  	__update_cpu_load(this_rq, this_rq->load.weight, 1); @@ -4982,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)  		p->sched_class->set_cpus_allowed(p, new_mask);  	cpumask_copy(&p->cpus_allowed, new_mask); -	p->rt.nr_cpus_allowed = cpumask_weight(new_mask); +	p->nr_cpus_allowed = cpumask_weight(new_mask);  }  /* @@ -5524,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */  #ifdef CONFIG_SCHED_DEBUG -static __read_mostly int sched_domain_debug_enabled; +static __read_mostly int sched_debug_enabled; -static int __init sched_domain_debug_setup(char *str) +static int __init sched_debug_setup(char *str)  { -	sched_domain_debug_enabled = 1; +	sched_debug_enabled = 1;  	return 0;  } -early_param("sched_debug", sched_domain_debug_setup); +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ +	return sched_debug_enabled; +}  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,  				  struct cpumask *groupmask) @@ -5572,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,  			break;  		} -		if (!group->sgp->power) { +		/* +		 * Even though we initialize ->power to something semi-sane, +		 * we leave power_orig unset. This allows us to detect if +		 * domain iteration is still funny without causing /0 traps. +		 */ +		if (!group->sgp->power_orig) {  			printk(KERN_CONT "\n");  			printk(KERN_ERR "ERROR: domain->cpu_power not "  					"set\n"); @@ -5620,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)  {  	int level = 0; -	if (!sched_domain_debug_enabled) +	if (!sched_debug_enabled)  		return;  	if (!sd) { @@ -5641,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)  }  #else /* !CONFIG_SCHED_DEBUG */  # define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ +	return false; +}  #endif /* CONFIG_SCHED_DEBUG */  static int sd_degenerate(struct sched_domain *sd) @@ -5962,6 +6138,44 @@ struct sched_domain_topology_level {  	struct sd_data      data;  }; +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. 
+ * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * cpu they're built on, so check that. + * + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ +	const struct cpumask *span = sched_domain_span(sd); +	struct sd_data *sdd = sd->private; +	struct sched_domain *sibling; +	int i; + +	for_each_cpu(i, span) { +		sibling = *per_cpu_ptr(sdd->sd, i); +		if (!cpumask_test_cpu(i, sched_domain_span(sibling))) +			continue; + +		cpumask_set_cpu(i, sched_group_mask(sg)); +	} +} + +/* + * Return the canonical balance cpu for this group, this is the first cpu + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ +	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} +  static int  build_overlap_sched_groups(struct sched_domain *sd, int cpu)  { @@ -5980,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)  		if (cpumask_test_cpu(i, covered))  			continue; +		child = *per_cpu_ptr(sdd->sd, i); + +		/* See the comment near build_group_mask(). */ +		if (!cpumask_test_cpu(i, sched_domain_span(child))) +			continue; +  		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),  				GFP_KERNEL, cpu_to_node(cpu)); @@ -5987,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)  			goto fail;  		sg_span = sched_group_cpus(sg); - -		child = *per_cpu_ptr(sdd->sd, i);  		if (child->child) {  			child = child->child;  			cpumask_copy(sg_span, sched_domain_span(child)); @@ -5997,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)  		cpumask_or(covered, covered, sg_span); -		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); -		atomic_inc(&sg->sgp->ref); +		sg->sgp = *per_cpu_ptr(sdd->sgp, i); +		if (atomic_inc_return(&sg->sgp->ref) == 1) +			build_group_mask(sd, sg); + +		/* +		 * Initialize sgp->power such that even if we mess up the +		 * domains and no possible iteration will get us here, we won't +		 * die on a /0 trap. +		 */ +		sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); -		if (cpumask_test_cpu(cpu, sg_span)) +		/* +		 * Make sure the first group of this domain contains the +		 * canonical balance cpu. Otherwise the sched_domain iteration +		 * breaks. See update_sg_lb_stats(). 
+		 */ +		if ((!groups && cpumask_test_cpu(cpu, sg_span)) || +		    group_balance_cpu(sg) == cpu)  			groups = sg;  		if (!first) @@ -6074,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)  		cpumask_clear(sched_group_cpus(sg));  		sg->sgp->power = 0; +		cpumask_setall(sched_group_mask(sg));  		for_each_cpu(j, span) {  			if (get_group(j, sdd, NULL) != group) @@ -6115,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)  		sg = sg->next;  	} while (sg != sd->groups); -	if (cpu != group_first_cpu(sg)) +	if (cpu != group_balance_cpu(sg))  		return;  	update_group_power(sd, cpu); @@ -6165,11 +6398,8 @@ int sched_domain_level_max;  static int __init setup_relax_domain_level(char *str)  { -	unsigned long val; - -	val = simple_strtoul(str, NULL, 0); -	if (val < sched_domain_level_max) -		default_relax_domain_level = val; +	if (kstrtoint(str, 0, &default_relax_domain_level)) +		pr_warn("Unable to set relax_domain_level\n");  	return 1;  } @@ -6279,14 +6509,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol  #ifdef CONFIG_NUMA  static int sched_domains_numa_levels; -static int sched_domains_numa_scale;  static int *sched_domains_numa_distance;  static struct cpumask ***sched_domains_numa_masks;  static int sched_domains_curr_level;  static inline int sd_local_flags(int level)  { -	if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) +	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)  		return 0;  	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; @@ -6344,6 +6573,42 @@ static const struct cpumask *sd_numa_mask(int cpu)  	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];  } +static void sched_numa_warn(const char *str) +{ +	static int done = false; +	int i,j; + +	if (done) +		return; + +	done = true; + +	printk(KERN_WARNING "ERROR: %s\n\n", str); + +	for (i = 0; i < nr_node_ids; i++) { +		printk(KERN_WARNING "  "); +		for (j = 0; j < nr_node_ids; j++) +			printk(KERN_CONT "%02d ", node_distance(i,j)); +		printk(KERN_CONT "\n"); +	} +	printk(KERN_WARNING "\n"); +} + +static bool find_numa_distance(int distance) +{ +	int i; + +	if (distance == node_distance(0, 0)) +		return true; + +	for (i = 0; i < sched_domains_numa_levels; i++) { +		if (sched_domains_numa_distance[i] == distance) +			return true; +	} + +	return false; +} +  static void sched_init_numa(void)  {  	int next_distance, curr_distance = node_distance(0, 0); @@ -6351,7 +6616,6 @@ static void sched_init_numa(void)  	int level = 0;  	int i, j, k; -	sched_domains_numa_scale = curr_distance;  	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);  	if (!sched_domains_numa_distance)  		return; @@ -6362,23 +6626,41 @@ static void sched_init_numa(void)  	 *  	 * Assumes node_distance(0,j) includes all distances in  	 * node_distance(i,j) in order to avoid cubic time. 
-	 * -	 * XXX: could be optimized to O(n log n) by using sort()  	 */  	next_distance = curr_distance;  	for (i = 0; i < nr_node_ids; i++) {  		for (j = 0; j < nr_node_ids; j++) { -			int distance = node_distance(0, j); -			if (distance > curr_distance && -					(distance < next_distance || -					 next_distance == curr_distance)) -				next_distance = distance; +			for (k = 0; k < nr_node_ids; k++) { +				int distance = node_distance(i, k); + +				if (distance > curr_distance && +				    (distance < next_distance || +				     next_distance == curr_distance)) +					next_distance = distance; + +				/* +				 * While not a strong assumption it would be nice to know +				 * about cases where if node A is connected to B, B is not +				 * equally connected to A. +				 */ +				if (sched_debug() && node_distance(k, i) != distance) +					sched_numa_warn("Node-distance not symmetric"); + +				if (sched_debug() && i && !find_numa_distance(distance)) +					sched_numa_warn("Node-0 not representative"); +			} +			if (next_distance != curr_distance) { +				sched_domains_numa_distance[level++] = next_distance; +				sched_domains_numa_levels = level; +				curr_distance = next_distance; +			} else break;  		} -		if (next_distance != curr_distance) { -			sched_domains_numa_distance[level++] = next_distance; -			sched_domains_numa_levels = level; -			curr_distance = next_distance; -		} else break; + +		/* +		 * In case of sched_debug() we verify the above assumption. +		 */ +		if (!sched_debug()) +			break;  	}  	/*  	 * 'level' contains the number of unique distances, excluding the @@ -6403,7 +6685,7 @@ static void sched_init_numa(void)  			return;  		for (j = 0; j < nr_node_ids; j++) { -			struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); +			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);  			if (!mask)  				return; @@ -6490,7 +6772,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)  			*per_cpu_ptr(sdd->sg, j) = sg; -			sgp = kzalloc_node(sizeof(struct sched_group_power), +			sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),  					GFP_KERNEL, cpu_to_node(j));  			if (!sgp)  				return -ENOMEM; @@ -6543,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  	if (!sd)  		return child; -	set_domain_attribute(sd, attr);  	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));  	if (child) {  		sd->level = child->level + 1; @@ -6551,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  		child->parent = sd;  	}  	sd->child = child; +	set_domain_attribute(sd, attr);  	return sd;  } @@ -6691,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)  	if (!doms_cur)  		doms_cur = &fallback_doms;  	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); -	dattr_cur = NULL;  	err = build_sched_domains(doms_cur[0], NULL);  	register_sched_domain_sysctl(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 940e6d17cf9..c099cc6eebe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)  	int want_sd = 1;  	int sync = wake_flags & WF_SYNC; -	if (p->rt.nr_cpus_allowed == 1) +	if (p->nr_cpus_allowed == 1)  		return prev_cpu;  	if (sd_flag & SD_BALANCE_WAKE) { @@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)  unsigned long scale_rt_power(int cpu)  {  	struct rq *rq = cpu_rq(cpu); -	u64 total, available; +	u64 
total, available, age_stamp, avg; -	total = sched_avg_period() + (rq->clock - rq->age_stamp); +	/* +	 * Since we're reading these variables without serialization make sure +	 * we read them once before doing sanity checks on them. +	 */ +	age_stamp = ACCESS_ONCE(rq->age_stamp); +	avg = ACCESS_ONCE(rq->rt_avg); + +	total = sched_avg_period() + (rq->clock - age_stamp); -	if (unlikely(total < rq->rt_avg)) { +	if (unlikely(total < avg)) {  		/* Ensures that power won't end up being negative */  		available = 0;  	} else { -		available = total - rq->rt_avg; +		available = total - avg;  	}  	if (unlikely((s64)total < SCHED_POWER_SCALE)) @@ -3574,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu)  	power = 0; -	group = child->groups; -	do { -		power += group->sgp->power; -		group = group->next; -	} while (group != child->groups); +	if (child->flags & SD_OVERLAP) { +		/* +		 * SD_OVERLAP domains cannot assume that child groups +		 * span the current group. +		 */ -	sdg->sgp->power = power; +		for_each_cpu(cpu, sched_group_cpus(sdg)) +			power += power_of(cpu); +	} else  { +		/* +		 * !SD_OVERLAP domains can assume that child groups +		 * span the current group. +		 */  + +		group = child->groups; +		do { +			power += group->sgp->power; +			group = group->next; +		} while (group != child->groups); +	} + +	sdg->sgp->power_orig = sdg->sgp->power = power;  }  /* @@ -3610,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)  /**   * update_sg_lb_stats - Update sched_group's statistics for load balancing. - * @sd: The sched_domain whose statistics are to be updated. + * @env: The load balancing environment.   * @group: sched_group whose statistics are to be updated.   * @load_idx: Load index of sched_domain of this_cpu for load calc.   * @local_group: Does group contain this_cpu. @@ -3630,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,  	int i;  	if (local_group) -		balance_cpu = group_first_cpu(group); +		balance_cpu = group_balance_cpu(group);  	/* Tally up the load of all CPUs in the group */  	max_cpu_load = 0; @@ -3645,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,  		/* Bias balancing toward cpus of our domain */  		if (local_group) { -			if (idle_cpu(i) && !first_idle_cpu) { +			if (idle_cpu(i) && !first_idle_cpu && +					cpumask_test_cpu(i, sched_group_mask(group))) {  				first_idle_cpu = 1;  				balance_cpu = i;  			} @@ -3719,11 +3742,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,  /**   * update_sd_pick_busiest - return 1 on busiest group - * @sd: sched_domain whose statistics are to be checked + * @env: The load balancing environment.   * @sds: sched_domain statistics   * @sg: sched_group candidate to be checked for being the busiest   * @sgs: sched_group statistics - * @this_cpu: the current cpu   *   * Determine if @sg is a busier group than the previously selected   * busiest group. @@ -3761,9 +3783,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,  /**   * update_sd_lb_stats - Update sched_domain's statistics for load balancing. - * @sd: sched_domain whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu + * @env: The load balancing environment.   * @cpus: Set of cpus considered for load balancing.   * @balance: Should we balance.   * @sds: variable to hold the statistics for this sched_domain. 
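The scale_rt_power() hunk above switches to reading rq->age_stamp and rq->rt_avg exactly once, via ACCESS_ONCE(), before comparing them, so the total < avg sanity check cannot observe two different values of the same field. A compilable sketch of that pattern in isolation — struct fake_rq, its field values and the period argument are invented, and ACCESS_ONCE() is spelled out the way the kernel of this era defines it:

/*
 * Sketch of the "snapshot racy fields once" pattern used by the
 * scale_rt_power() change above.  Everything here is illustrative.
 */
#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

struct fake_rq {
	unsigned long long clock;
	unsigned long long age_stamp;	/* may be updated by other cpus */
	unsigned long long rt_avg;	/* may be updated by other cpus */
};

static unsigned long long scale_rt_available(struct fake_rq *rq,
					     unsigned long long period)
{
	/* read both fields once before doing sanity checks on them */
	unsigned long long age_stamp = ACCESS_ONCE(rq->age_stamp);
	unsigned long long avg = ACCESS_ONCE(rq->rt_avg);
	unsigned long long total = period + (rq->clock - age_stamp);

	/* with a single snapshot, this comparison cannot race with itself */
	return total < avg ? 0 : total - avg;
}

int main(void)
{
	struct fake_rq rq = {
		.clock = 1000000, .age_stamp = 900000, .rt_avg = 30000,
	};

	printf("available: %llu\n", scale_rt_available(&rq, 500000));
	return 0;
}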
@@ -3852,10 +3872,8 @@ static inline void update_sd_lb_stats(struct lb_env *env,   * Returns 1 when packing is required and a task should be moved to   * this CPU.  The amount of the imbalance is returned in *imbalance.   * - * @sd: The sched_domain whose packing is to be checked. + * @env: The load balancing environment.   * @sds: Statistics of the sched_domain which is to be packed - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: returns amount of imbalanced due to packing.   */  static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)  { @@ -3881,9 +3899,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)   * fix_small_imbalance - Calculate the minor imbalance that exists   *			amongst the groups of a sched_domain, during   *			load balancing. + * @env: The load balancing environment.   * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: Variable to store the imbalance.   */  static inline  void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) @@ -4026,11 +4043,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s   * Also calculates the amount of weighted load which should be moved   * to restore balance.   * - * @sd: The sched_domain whose busiest group is to be returned. - * @this_cpu: The cpu for which load balancing is currently being performed. - * @imbalance: Variable which stores amount of weighted load which should - *		be moved to restore balance/put a group to idle. - * @idle: The idle status of this_cpu. + * @env: The load balancing environment.   * @cpus: The set of CPUs under consideration for load-balancing.   * @balance: Pointer to a variable indicating if this_cpu   *	is the appropriate cpu to perform load balancing at this_level. 
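The build_group_mask()/group_balance_cpu() pair added to core.c above, and used by the update_sg_lb_stats() and init_sched_groups_power() hunks, picks a canonical balance cpu as the first cpu that sits in both the group and the group's iteration mask. A toy model with plain bitmasks in place of struct cpumask — all names and the example topology are invented for illustration:

/*
 * Toy model of group_balance_cpu(): the first cpu that is in both
 * sched_group_cpus() and sched_group_mask().  Bitmasks stand in for
 * real cpumasks.
 */
#include <stdio.h>

typedef unsigned long toy_mask_t;	/* one bit per cpu */

struct toy_group {
	toy_mask_t cpus;	/* models sched_group_cpus() */
	toy_mask_t mask;	/* models sched_group_mask(), may exclude cpus */
};

static int toy_first_and(toy_mask_t a, toy_mask_t b)
{
	toy_mask_t both = a & b;

	return both ? __builtin_ctzl(both) : -1;	/* -1: empty intersection */
}

static int toy_group_balance_cpu(const struct toy_group *sg)
{
	return toy_first_and(sg->cpus, sg->mask);
}

int main(void)
{
	/* group spans cpus 0-3, but only cpus 2-3 may act as balance cpu */
	struct toy_group sg = { .cpus = 0xf, .mask = 0xc };

	printf("balance cpu: %d\n", toy_group_balance_cpu(&sg));	/* prints 2 */
	return 0;
}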
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d..b6baf370cae 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl  static struct task_struct *pick_next_task_idle(struct rq *rq)  {  	schedstat_inc(rq, sched_goidle); -	calc_load_account_idle(rq);  	return rq->idle;  } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c5565c3c515..573e1ca0110 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)  static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  { +	struct task_struct *p; +  	if (!rt_entity_is_task(rt_se))  		return; +	p = rt_task_of(rt_se);  	rt_rq = &rq_of_rt_rq(rt_rq)->rt;  	rt_rq->rt_nr_total++; -	if (rt_se->nr_cpus_allowed > 1) +	if (p->nr_cpus_allowed > 1)  		rt_rq->rt_nr_migratory++;  	update_rt_migration(rt_rq); @@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  { +	struct task_struct *p; +  	if (!rt_entity_is_task(rt_se))  		return; +	p = rt_task_of(rt_se);  	rt_rq = &rq_of_rt_rq(rt_rq)->rt;  	rt_rq->rt_nr_total--; -	if (rt_se->nr_cpus_allowed > 1) +	if (p->nr_cpus_allowed > 1)  		rt_rq->rt_nr_migratory--;  	update_rt_migration(rt_rq); @@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)  	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); -	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) +	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)  		enqueue_pushable_task(rq, p);  	inc_nr_running(rq); @@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)  	cpu = task_cpu(p); -	if (p->rt.nr_cpus_allowed == 1) +	if (p->nr_cpus_allowed == 1)  		goto out;  	/* For anything but wake ups, just return the task_cpu */ @@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)  	 * will have to sort it out.  	 
*/  	if (curr && unlikely(rt_task(curr)) && -	    (curr->rt.nr_cpus_allowed < 2 || +	    (curr->nr_cpus_allowed < 2 ||  	     curr->prio <= p->prio) && -	    (p->rt.nr_cpus_allowed > 1)) { +	    (p->nr_cpus_allowed > 1)) {  		int target = find_lowest_rq(p);  		if (target != -1) @@ -1276,10 +1282,10 @@ out:  static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)  { -	if (rq->curr->rt.nr_cpus_allowed == 1) +	if (rq->curr->nr_cpus_allowed == 1)  		return; -	if (p->rt.nr_cpus_allowed != 1 +	if (p->nr_cpus_allowed != 1  	    && cpupri_find(&rq->rd->cpupri, p, NULL))  		return; @@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)  	 * The previous task needs to be made eligible for pushing  	 * if it is still active  	 */ -	if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) +	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)  		enqueue_pushable_task(rq, p);  } @@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)  {  	if (!task_running(rq, p) &&  	    (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && -	    (p->rt.nr_cpus_allowed > 1)) +	    (p->nr_cpus_allowed > 1))  		return 1;  	return 0;  } @@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)  	if (unlikely(!lowest_mask))  		return -1; -	if (task->rt.nr_cpus_allowed == 1) +	if (task->nr_cpus_allowed == 1)  		return -1; /* No other targets possible */  	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)  				     task_running(rq, task) ||  				     !task->on_rq)) { -				raw_spin_unlock(&lowest_rq->lock); +				double_unlock_balance(rq, lowest_rq);  				lowest_rq = NULL;  				break;  			} @@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)  	BUG_ON(rq->cpu != task_cpu(p));  	BUG_ON(task_current(rq, p)); -	BUG_ON(p->rt.nr_cpus_allowed <= 1); +	BUG_ON(p->nr_cpus_allowed <= 1);  	BUG_ON(!p->on_rq);  	BUG_ON(!rt_task(p)); @@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)  	if (!task_running(rq, p) &&  	    !test_tsk_need_resched(rq->curr) &&  	    has_pushable_tasks(rq) && -	    p->rt.nr_cpus_allowed > 1 && +	    p->nr_cpus_allowed > 1 &&  	    rt_task(rq->curr) && -	    (rq->curr->rt.nr_cpus_allowed < 2 || +	    (rq->curr->nr_cpus_allowed < 2 ||  	     rq->curr->prio <= p->prio))  		push_rt_tasks(rq);  } @@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,  	 * Only update if the process changes its state from whether it  	 * can migrate or not.  	 
*/ -	if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) +	if ((p->nr_cpus_allowed > 1) == (weight > 1))  		return;  	rq = task_rq(p); @@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)  static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)  { +	struct sched_rt_entity *rt_se = &p->rt; +  	update_curr_rt(rq);  	watchdog(rq, p); @@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)  	p->rt.time_slice = RR_TIMESLICE;  	/* -	 * Requeue to the end of queue if we are not the only element -	 * on the queue: +	 * Requeue to the end of queue if we (and all of our ancestors) are the +	 * only element on the queue  	 */ -	if (p->rt.run_list.prev != p->rt.run_list.next) { -		requeue_task_rt(rq, p, 0); -		set_tsk_need_resched(p); +	for_each_sched_rt_entity(rt_se) { +		if (rt_se->run_list.prev != rt_se->run_list.next) { +			requeue_task_rt(rq, p, 0); +			set_tsk_need_resched(p); +			return; +		}  	}  } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ba9dccfd24c..55844f24435 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)  DECLARE_PER_CPU(struct sched_domain *, sd_llc);  DECLARE_PER_CPU(int, sd_llc_id); +extern int group_balance_cpu(struct sched_group *sg); +  #endif /* CONFIG_SMP */  #include "stats.h" @@ -940,8 +942,6 @@ static inline u64 sched_avg_period(void)  	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;  } -void calc_load_account_idle(struct rq *this_rq); -  #ifdef CONFIG_SCHED_HRTICK  /*  |
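Finally, the NO_HZ comment added to core.c describes keeping two idle-delta slots and flipping an index once per global update, so that load folded while going idle in the current window is not consumed until the next one. The userspace sketch below strips that scheme down to the index arithmetic: the jiffies-based window test becomes a plain flag, and the atomics and smp_wmb()/smp_rmb() pairing the real code relies on are omitted.

/*
 * Userspace sketch of the two-slot idle-delta scheme (calc_load_idle[2] and
 * calc_load_idx) described in the NO_HZ comment above.  No locking or
 * barriers; purely illustrative.
 */
#include <stdio.h>
#include <stdbool.h>

static long calc_load_idle[2];
static int calc_load_idx;

static int write_idx(bool window_open)
{
	int idx = calc_load_idx;

	/* once the window has opened, new idle deltas go to the next slot */
	if (window_open)
		idx++;
	return idx & 1;
}

static int read_idx(void)
{
	return calc_load_idx & 1;
}

static void enter_idle(long delta, bool window_open)
{
	calc_load_idle[write_idx(window_open)] += delta;
}

static long fold_idle(void)
{
	long delta = calc_load_idle[read_idx()];

	calc_load_idle[read_idx()] = 0;
	return delta;
}

int main(void)
{
	enter_idle(-2, false);	/* two tasks went idle before the window */
	enter_idle(-1, true);	/* one more after the window opened */

	printf("folded this window: %ld\n", fold_idle());	/* -2 */
	calc_load_idx++;					/* global update flips */
	printf("folded next window: %ld\n", fold_idle());	/* -1 */
	return 0;
}

A writer that folds its delta after the window has opened lands in the next slot, so the reader drains only the contributions belonging to the window it is closing.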