Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c | 113
1 file changed, 86 insertions, 27 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe..22321db6495 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
-	struct sched_group *sg;
-	int i;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		return prev_cpu;
 
 	/*
-	 * Otherwise, iterate the domains and find an elegible idle cpu.
+	 * Otherwise, check assigned siblings to find an elegible idle cpu.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
-
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (!idle_cpu(i))
-					goto next;
-			}
 
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
-next:
-			sg = sg->next;
-		} while (sg != sd->groups);
+	for_each_lower_domain(sd) {
+		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(sd->idle_buddy))
+			return sd->idle_buddy;
 	}
-done:
+
 	return target;
 }
 
@@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
+#define LBF_SOME_PINNED 0x04
 
 struct lb_env {
 	struct sched_domain	*sd;
 
-	int			src_cpu;
 	struct rq		*src_rq;
+	int			src_cpu;
 
 	int			dst_cpu;
 	struct rq		*dst_rq;
 
+	struct cpumask		*dst_grpmask;
+	int			new_dst_cpu;
 	enum cpu_idle_type	idle;
 	long			imbalance;
 	unsigned int		flags;
@@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 3) are cache-hot on their current CPU.
 	 */
 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+		int new_dst_cpu;
+
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+		/*
+		 * Remember if this task can be migrated to any other cpu in
+		 * our sched_group. We may want to revisit it if we couldn't
+		 * meet load balance goals by pulling other tasks on src_cpu.
+		 *
+		 * Also avoid computing new_dst_cpu if we have already computed
+		 * one in current iteration.
+		 */
+		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+			return 0;
+
+		new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+						tsk_cpus_allowed(p));
+		if (new_dst_cpu < nr_cpu_ids) {
+			env->flags |= LBF_SOME_PINNED;
+			env->new_dst_cpu = new_dst_cpu;
+		}
 		return 0;
 	}
+
+	/* Record that we found atleast one task that could run on dst_cpu */
 	env->flags &= ~LBF_ALL_PINNED;
 
 	if (task_running(env->src_rq, p)) {
@@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, active_balance = 0;
+	int ld_moved, cur_ld_moved, active_balance = 0;
+	int lb_iterations, max_lb_iterations;
 	struct sched_group *group;
 	struct rq *busiest;
 	unsigned long flags;
@@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.sd		= sd,
 		.dst_cpu	= this_cpu,
 		.dst_rq		= this_rq,
+		.dst_grpmask    = sched_group_cpus(sd->groups),
 		.idle		= idle,
 		.loop_break	= sched_nr_migrate_break,
 	};
 
 	cpumask_copy(cpus, cpu_active_mask);
+	max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
 	schedstat_inc(sd, lb_count[idle]);
 
@@ -4267,6 +4281,7 @@ redo:
 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
 	ld_moved = 0;
+	lb_iterations = 1;
 	if (busiest->nr_running > 1) {
 		/*
 		 * Attempt to move tasks. If find_busiest_group has found
@@ -4284,7 +4299,13 @@ more_balance:
 		double_rq_lock(this_rq, busiest);
 		if (!env.loop)
 			update_h_load(env.src_cpu);
-		ld_moved += move_tasks(&env);
+
+		/*
+		 * cur_ld_moved - load moved in current iteration
+		 * ld_moved     - cumulative load moved across iterations
+		 */
+		cur_ld_moved = move_tasks(&env);
+		ld_moved += cur_ld_moved;
 
 		double_rq_unlock(this_rq, busiest);
 		local_irq_restore(flags);
@@ -4296,14 +4317,52 @@ more_balance:
 		/*
 		 * some other cpu did the load balance for us.
 		 */
-		if (ld_moved && this_cpu != smp_processor_id())
-			resched_cpu(this_cpu);
+		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+			resched_cpu(env.dst_cpu);
+
+		/*
+		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
+		 * us and move them to an alternate dst_cpu in our sched_group
+		 * where they can run. The upper limit on how many times we
+		 * iterate on same src_cpu is dependent on number of cpus in our
+		 * sched_group.
+		 *
+		 * This changes load balance semantics a bit on who can move
+		 * load to a given_cpu. In addition to the given_cpu itself
+		 * (or a ilb_cpu acting on its behalf where given_cpu is
+		 * nohz-idle), we now have balance_cpu in a position to move
+		 * load to given_cpu. In rare situations, this may cause
+		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+		 * _independently_ and at _same_ time to move some load to
+		 * given_cpu) causing exceess load to be moved to given_cpu.
+		 * This however should not happen so much in practice and
+		 * moreover subsequent load balance cycles should correct the
+		 * excess load moved.
+		 */
+		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+				lb_iterations++ < max_lb_iterations) {
+
+			this_rq		 = cpu_rq(env.new_dst_cpu);
+			env.dst_rq	 = this_rq;
+			env.dst_cpu	 = env.new_dst_cpu;
+			env.flags	&= ~LBF_SOME_PINNED;
+			env.loop	 = 0;
+			env.loop_break	 = sched_nr_migrate_break;
+			/*
+			 * Go back to "more_balance" rather than "redo" since we
+			 * need to continue with same src_cpu.
+			 */
+			goto more_balance;
+		}
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
-			if (!cpumask_empty(cpus))
+			if (!cpumask_empty(cpus)) {
+				env.loop = 0;
+				env.loop_break = sched_nr_migrate_break;
 				goto redo;
+			}
 			goto out_balanced;
 		}
 	}
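Note on the select_idle_sibling() hunk: the per-group scan of every lower domain is replaced by a single test of a precomputed "idle buddy" CPU at each domain level. The following is a minimal userspace sketch of that lookup, not kernel code; struct toy_domain, toy_select_idle_sibling() and the bitmask stand-ins for tsk_cpus_allowed()/idle_cpu() are invented here purely for illustration.

/*
 * Standalone model of the new fast path: walk the lower domain levels and
 * return the first idle buddy the task is allowed on, else keep target.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

struct toy_domain {
	int idle_buddy;              /* candidate sibling picked at domain build time */
	struct toy_domain *child;    /* next lower domain level, NULL at the bottom */
};

static bool cpu_idle[NR_CPUS];      /* stand-in for idle_cpu() */
static unsigned long cpus_allowed;  /* stand-in for tsk_cpus_allowed(p) */

static bool cpu_allowed(int cpu)
{
	return cpus_allowed & (1UL << cpu);
}

static int toy_select_idle_sibling(struct toy_domain *sd, int target)
{
	for (; sd; sd = sd->child) {
		if (!cpu_allowed(sd->idle_buddy))
			continue;
		if (cpu_idle[sd->idle_buddy])
			return sd->idle_buddy;
	}
	return target;
}

int main(void)
{
	struct toy_domain core = { .idle_buddy = 1, .child = NULL };   /* SMT sibling */
	struct toy_domain llc  = { .idle_buddy = 4, .child = &core };  /* cache sibling */

	cpus_allowed = 0xffUL;   /* task may run anywhere */
	cpu_idle[4] = true;      /* only CPU 4 is idle */

	printf("picked CPU %d\n", toy_select_idle_sibling(&llc, 0));   /* prints 4 */
	return 0;
}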
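Note on the LBF_SOME_PINNED hunks: when can_migrate_task() rejects a task that is pinned away from dst_cpu but could run on another CPU of the destination group, it records that CPU in env->new_dst_cpu, and load_balance() reruns the pull with it as the new destination, at most cpumask_weight(dst_grpmask) passes per src_cpu. The sketch below models that retry in plain userspace C under simplified assumptions; toy_env, toy_task and toy_move_tasks() are made-up stand-ins, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

#define LBF_SOME_PINNED 0x04

struct toy_task {
	int load;
	unsigned long allowed;   /* bitmask of CPUs the task may run on */
	bool moved;
};

struct toy_env {
	int dst_cpu;
	unsigned long dst_grpmask;
	int new_dst_cpu;
	unsigned int flags;
	int imbalance;
};

static bool allowed_on(const struct toy_task *t, int cpu)
{
	return t->allowed & (1UL << cpu);
}

/* One pull pass: move what fits on dst_cpu, note pinned-but-movable tasks. */
static void toy_move_tasks(struct toy_env *env, struct toy_task *tasks, int nr)
{
	for (int i = 0; i < nr && env->imbalance > 0; i++) {
		struct toy_task *t = &tasks[i];

		if (t->moved)
			continue;
		if (!allowed_on(t, env->dst_cpu)) {
			/* can this task go to some other CPU of our group? */
			for (int cpu = 0; cpu < 8; cpu++) {
				if ((env->dst_grpmask & (1UL << cpu)) &&
				    allowed_on(t, cpu)) {
					env->flags |= LBF_SOME_PINNED;
					env->new_dst_cpu = cpu;
					break;
				}
			}
			continue;
		}
		t->moved = true;
		env->imbalance -= t->load;
	}
}

int main(void)
{
	struct toy_task tasks[] = {
		{ .load = 4, .allowed = 0x4 },   /* pinned to CPU 2 only */
		{ .load = 4, .allowed = 0xff },  /* runs anywhere */
	};
	struct toy_env env = {
		.dst_cpu = 1, .dst_grpmask = 0x6 /* CPUs 1-2 */, .imbalance = 8,
	};
	int iterations = 1, max_iterations = 2;  /* cpumask_weight(dst_grpmask) */

more_balance:
	toy_move_tasks(&env, tasks, 2);
	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
	    iterations++ < max_iterations) {
		env.dst_cpu = env.new_dst_cpu;   /* retry on behalf of CPU 2 */
		env.flags &= ~LBF_SOME_PINNED;
		goto more_balance;
	}

	printf("remaining imbalance: %d\n", env.imbalance);  /* prints 0 */
	return 0;
}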