| | | |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-05-22 18:27:32 -0700 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-05-22 18:27:32 -0700 |
| commit | d79ee93de909dfb252279b9a95978bbda9a814a9 (patch) | |
| tree | bfccca60fd36259ff4bcc5e78a2c272fbd680065 /kernel/sched/core.c | |
| parent | 2ff2b289a695807e291e1ed9f639d8a3ba5f4254 (diff) | |
| parent | 1c2927f18576d65631d8e0ddd19e1d023183222e (diff) | |
| download | olio-linux-3.10-d79ee93de909dfb252279b9a95978bbda9a814a9.tar.xz olio-linux-3.10-d79ee93de909dfb252279b9a95978bbda9a814a9.zip | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The biggest change is the cleanup/simplification of the load-balancer:
  instead of the current practice of architectures twiddling scheduler
  internal data structures and providing the scheduler domains in
  colorfully inconsistent ways, we now have generic scheduler code in
  kernel/sched/core.c:sched_init_numa() that looks at the architecture's
  node_distance() parameters and (while not fully trusting it) deduces a
  NUMA topology from it.
  This inevitably changes balancing behavior - hopefully for the better.
  There are various smaller optimizations, cleanups and fixlets as well"
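As a rough illustration of the topology deduction described above, here is a small, self-contained userspace sketch. It is hypothetical (the four-node distance table and all names are made up; it mirrors the idea behind sched_init_numa(), not the kernel's data structures): it collects the unique distances reported by a node_distance()-style table into levels, then lists which nodes fall within each level's distance of every node -- the same information the kernel turns into per-level cpumasks.

```c
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical node_distance() table: 10 = local, 20 = one hop, 30 = two hops. */
static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int distances[NR_NODES];	/* unique non-local distances, ascending */
	int base = node_distance[0][0];	/* the local distance */
	int levels = 0;
	int i, j, k;

	/*
	 * Deduplicate the distances seen from node 0, smallest first --
	 * relying on the same assumption the new kernel code documents,
	 * namely that row 0 already contains every distance in the table.
	 */
	for (;;) {
		int prev = levels ? distances[levels - 1] : base;
		int next = -1;

		for (j = 0; j < NR_NODES; j++) {
			int d = node_distance[0][j];

			if (d > prev && (next < 0 || d < next))
				next = d;
		}
		if (next < 0)
			break;
		distances[levels++] = next;
	}

	/* For each level, list the nodes within that distance of every node. */
	for (i = 0; i < levels; i++) {
		printf("level %d (distance <= %d):\n", i, distances[i]);
		for (j = 0; j < NR_NODES; j++) {
			printf("  node %d spans nodes:", j);
			for (k = 0; k < NR_NODES; k++)
				if (node_distance[j][k] <= distances[i])
					printf(" %d", k);
			printf("\n");
		}
	}
	return 0;
}
```

On this sample table the sketch finds two levels (distance 20 and 30), i.e. a two-level NUMA hierarchy stacked on top of the regular SMT/MC/CPU domains.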
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Taint kernel with TAINT_WARN after sleep-in-atomic bug
  sched: Remove stale power aware scheduling remnants and dysfunctional knobs
  sched/debug: Fix printing large integers on 32-bit platforms
  sched/fair: Improve the ->group_imb logic
  sched/nohz: Fix rq->cpu_load[] calculations
  sched/numa: Don't scale the imbalance
  sched/fair: Revert sched-domain iteration breakage
  sched/x86: Rewrite set_cpu_sibling_map()
  sched/numa: Fix the new NUMA topology bits
  sched/numa: Rewrite the CONFIG_NUMA sched domain support
  sched/fair: Propagate 'struct lb_env' usage into find_busiest_group
  sched/fair: Add some serialization to the sched_domain load-balance walk
  sched/fair: Let minimally loaded cpu balance the group
  sched: Change rq->nr_running to unsigned int
  x86/numa: Check for nonsensical topologies on real hw as well
  x86/numa: Hard partition cpu topology masks on node boundaries
  x86/numa: Allow specifying node_distance() for numa=fake
  x86/sched: Make mwait_usable() heed to "idle=" kernel parameters properly
  sched: Update documentation and comments
  sched_rt: Avoid unnecessary dequeue and enqueue of pushable tasks in set_cpus_allowed_rt()
Diffstat (limited to 'kernel/sched/core.c')
| -rw-r--r-- | kernel/sched/core.c | 420 | 
1 file changed, 218 insertions(+), 202 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2e2e173d8f..d833cc94eed 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -693,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data)
 }
 #endif
 
-void update_cpu_load(struct rq *this_rq);
-
 static void set_load_weight(struct task_struct *p)
 {
 	int prio = p->static_prio - MAX_RT_PRIO;
@@ -2481,22 +2479,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
  * every tick. We fix it up based on jiffies.
  */
-void update_cpu_load(struct rq *this_rq)
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+			      unsigned long pending_updates)
 {
-	unsigned long this_load = this_rq->load.weight;
-	unsigned long curr_jiffies = jiffies;
-	unsigned long pending_updates;
 	int i, scale;
 
 	this_rq->nr_load_updates++;
 
-	/* Avoid repeated calls on same jiffy, when moving in and out of idle */
-	if (curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	this_rq->last_load_update_tick = curr_jiffies;
-
 	/* Update our load: */
 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2521,9 +2510,45 @@ void update_cpu_load(struct rq *this_rq)
 	sched_avg_update(this_rq);
 }
 
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+	unsigned long curr_jiffies = jiffies;
+	unsigned long load = this_rq->load.weight;
+	unsigned long pending_updates;
+
+	/*
+	 * Bloody broken means of dealing with nohz, but better than nothing..
+	 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
+	 * update and see 0 difference the one time and 2 the next, even though
+	 * we ticked at roughtly the same rate.
+	 *
+	 * Hence we only use this from nohz_idle_balance() and skip this
+	 * nonsense when called from the scheduler_tick() since that's
+	 * guaranteed a stable rate.
+	 */
+	if (load || curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = curr_jiffies;
+
+	__update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from scheduler_tick()
+ */
 static void update_cpu_load_active(struct rq *this_rq)
 {
-	update_cpu_load(this_rq);
+	/*
+	 * See the mess in update_idle_cpu_load().
+	 */
+	this_rq->last_load_update_tick = jiffies;
+	__update_cpu_load(this_rq, this_rq->load.weight, 1);
 
 	calc_load_account_active(this_rq);
 }
@@ -3108,6 +3133,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
 	dump_stack();
+	add_taint(TAINT_WARN);
 }
 
 /*
@@ -5555,7 +5581,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+		if (!(sd->flags & SD_OVERLAP) &&
+		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
 			break;
@@ -5893,99 +5920,11 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-	int i, n, val, min_val, best_node = -1;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < nr_node_ids; i++) {
-		/* Start at @node */
-		n = (node + i) % nr_node_ids;
-
-		if (!nr_cpus_node(n))
-			continue;
-
-		/* Skip already used nodes */
-		if (node_isset(n, *used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, n);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	if (best_node != -1)
-		node_set(best_node, *used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-	nodemask_t used_nodes;
-	int i;
-
-	cpumask_clear(span);
-	nodes_clear(used_nodes);
-
-	cpumask_or(span, span, cpumask_of_node(node));
-	node_set(node, used_nodes);
-
-	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-		int next_node = find_next_best_node(node, &used_nodes);
-		if (next_node < 0)
-			break;
-		cpumask_or(span, span, cpumask_of_node(next_node));
-	}
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-	lockdep_assert_held(&sched_domains_mutex);
-
-	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-	return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-	return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
 	return cpumask_of_node(cpu_to_node(cpu));
 }
 
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
@@ -6015,6 +5954,7 @@ struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
 	int		    flags;
+	int		    numa_level;
 	struct sd_data      data;
 };
 
@@ -6206,10 +6146,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\
 }
 
 SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
@@ -6331,15 +6267,184 @@ static struct sched_domain_topology_level default_topology[] = {
 	{ sd_init_BOOK, cpu_book_mask, },
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-	{ sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
 	{ NULL, },
 };
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline int sd_local_flags(int level)
+{
+	if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+		return 0;
+
+	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+	int level = tl->numa_level;
+	int sd_weight = cpumask_weight(
+			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+	*sd = (struct sched_domain){
+		.min_interval		= sd_weight,
+		.max_interval		= 2*sd_weight,
+		.busy_factor		= 32,
+		.imbalance_pct		= 125,
+		.cache_nice_tries	= 2,
+		.busy_idx		= 3,
+		.idle_idx		= 2,
+		.newidle_idx		= 0,
+		.wake_idx		= 0,
+		.forkexec_idx		= 0,
+
+		.flags			= 1*SD_LOAD_BALANCE
+					| 1*SD_BALANCE_NEWIDLE
+					| 0*SD_BALANCE_EXEC
+					| 0*SD_BALANCE_FORK
+					| 0*SD_BALANCE_WAKE
+					| 0*SD_WAKE_AFFINE
+					| 0*SD_PREFER_LOCAL
+					| 0*SD_SHARE_CPUPOWER
+					| 0*SD_SHARE_PKG_RESOURCES
+					| 1*SD_SERIALIZE
+					| 0*SD_PREFER_SIBLING
+					| sd_local_flags(level)
+					,
+		.last_balance		= jiffies,
+		.balance_interval	= sd_weight,
+	};
+	SD_INIT_NAME(sd, NUMA);
+	sd->private = &tl->data;
+
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+
+	return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+	int next_distance, curr_distance = node_distance(0, 0);
+	struct sched_domain_topology_level *tl;
+	int level = 0;
+	int i, j, k;
+
+	sched_domains_numa_scale = curr_distance;
+	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+	if (!sched_domains_numa_distance)
+		return;
+
+	/*
+	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+	 * unique distances in the node_distance() table.
+	 *
+	 * Assumes node_distance(0,j) includes all distances in
+	 * node_distance(i,j) in order to avoid cubic time.
+	 *
+	 * XXX: could be optimized to O(n log n) by using sort()
+	 */
+	next_distance = curr_distance;
+	for (i = 0; i < nr_node_ids; i++) {
+		for (j = 0; j < nr_node_ids; j++) {
+			int distance = node_distance(0, j);
+			if (distance > curr_distance &&
+					(distance < next_distance ||
+					 next_distance == curr_distance))
+				next_distance = distance;
+		}
+		if (next_distance != curr_distance) {
+			sched_domains_numa_distance[level++] = next_distance;
+			sched_domains_numa_levels = level;
+			curr_distance = next_distance;
+		} else break;
+	}
+	/*
+	 * 'level' contains the number of unique distances, excluding the
+	 * identity distance node_distance(i,i).
+	 *
+	 * The sched_domains_nume_distance[] array includes the actual distance
+	 * numbers.
+	 */
+
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	if (!sched_domains_numa_masks)
+		return;
+
+	/*
+	 * Now for each level, construct a mask per node which contains all
+	 * cpus of nodes that are that many hops away from us.
+	 */
+	for (i = 0; i < level; i++) {
+		sched_domains_numa_masks[i] =
+			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+		if (!sched_domains_numa_masks[i])
+			return;
+
+		for (j = 0; j < nr_node_ids; j++) {
+			struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+			if (!mask)
+				return;
+
+			sched_domains_numa_masks[i][j] = mask;
+
+			for (k = 0; k < nr_node_ids; k++) {
+				if (node_distance(j, k) > sched_domains_numa_distance[i])
+					continue;
+
+				cpumask_or(mask, mask, cpumask_of_node(k));
+			}
+		}
+	}
+
+	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+	if (!tl)
+		return;
+
+	/*
+	 * Copy the default topology bits..
+	 */
+	for (i = 0; default_topology[i].init; i++)
+		tl[i] = default_topology[i];
+
+	/*
+	 * .. and append 'j' levels of NUMA goodness.
+	 */
+	for (j = 0; j < level; i++, j++) {
+		tl[i] = (struct sched_domain_topology_level){
+			.init = sd_numa_init,
+			.mask = sd_numa_mask,
+			.flags = SDTL_OVERLAP,
+			.numa_level = j,
+		};
+	}
+
+	sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
 	struct sched_domain_topology_level *tl;
@@ -6707,97 +6812,6 @@ match2:
 	mutex_unlock(&sched_domains_mutex);
 }
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
-	get_online_cpus();
-
-	/* Destroy domains first to force the rebuild */
-	partition_sched_domains(0, NULL, NULL);
-
-	rebuild_sched_domains();
-	put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
-	unsigned int level = 0;
-
-	if (sscanf(buf, "%u", &level) != 1)
-		return -EINVAL;
-
-	/*
-	 * level is always be positive so don't check for
-	 * level < POWERSAVINGS_BALANCE_NONE which is 0
-	 * What happens on 0 or 1 byte write,
-	 * need to check for count as well?
-	 */
-
-	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
-		return -EINVAL;
-
-	if (smt)
-		sched_smt_power_savings = level;
-	else
-		sched_mc_power_savings = level;
-
-	reinit_sched_domains();
-
-	return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct device *dev,
-					   struct device_attribute *attr,
-					   char *buf)
-{
-	return sprintf(buf, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct device *dev,
-					    struct device_attribute *attr,
-					    const char *buf, size_t count)
-{
-	return sched_power_savings_store(buf, count, 0);
-}
-static DEVICE_ATTR(sched_mc_power_savings, 0644,
-		   sched_mc_power_savings_show,
-		   sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct device *dev,
-					    struct device_attribute *attr,
-					    char *buf)
-{
-	return sprintf(buf, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct device *dev,
-					    struct device_attribute *attr,
-					     const char *buf, size_t count)
-{
-	return sched_power_savings_store(buf, count, 1);
-}
-static DEVICE_ATTR(sched_smt_power_savings, 0644,
-		   sched_smt_power_savings_show,
-		   sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct device *dev)
-{
-	int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
-	if (smt_capable())
-		err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
-#endif
-#ifdef CONFIG_SCHED_MC
-	if (!err && mc_capable())
-		err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
-#endif
-	return err;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
 /*
  * Update cpusets according to cpu_active mask.  If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6835,6 +6849,8 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
+	sched_init_numa();
+
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
 	init_sched_domains(cpu_active_mask);