Diffstat (limited to 'kernel/sched_fair.c')
| -rw-r--r-- | kernel/sched_fair.c | 322 |
1 file changed, 265 insertions(+), 57 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd768667..c62ebae65cf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
 static const struct sched_class fair_sched_class;
 
 /**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 	return cfs_rq->tg->cfs_rq[this_cpu];
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_rq->on_list) {
+		/*
+		 * Ensure we either appear before our parent (if already
+		 * enqueued) or force our parent to appear after us when it is
+		 * enqueued.  The fact that we always enqueue bottom-up
+		 * reduces this to two cases.
+		 */
+		if (cfs_rq->tg->parent &&
+		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+		} else {
+			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+		}
+
+		cfs_rq->on_list = 1;
+	}
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->on_list) {
+		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+		cfs_rq->on_list = 0;
+	}
+}
+
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 	return &cpu_rq(this_cpu)->cfs;
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 	WRT_SYSCTL(sched_min_granularity);
 	WRT_SYSCTL(sched_latency);
 	WRT_SYSCTL(sched_wakeup_granularity);
-	WRT_SYSCTL(sched_shares_ratelimit);
 #undef WRT_SYSCTL
 
 	return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->vruntime += delta_exec_weighted;
 	update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_add(&se->group_node, &cfs_rq->tasks);
 	}
 	cfs_rq->nr_running++;
-	se->on_rq = 1;
 }
 
 static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_del_init(&se->group_node);
 	}
 	cfs_rq->nr_running--;
-	se->on_rq = 0;
 }
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+					    int global_update)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long load_avg;
+
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	load_avg -= cfs_rq->load_contribution;
+
+	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+		atomic_add(load_avg, &tg->load_weight);
+		cfs_rq->load_contribution += load_avg;
+	}
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+	u64 period = sysctl_sched_shares_window;
+	u64 now, delta;
+	unsigned long load = cfs_rq->load.weight;
+
+	if (!cfs_rq)
+		return;
+
+	now = rq_of(cfs_rq)->clock;
+	delta = now - cfs_rq->load_stamp;
+
+	/* truncate load history at 4 idle periods */
+	if (cfs_rq->load_stamp > cfs_rq->load_last &&
+	    now - cfs_rq->load_last > 4 * period) {
+		cfs_rq->load_period = 0;
+		cfs_rq->load_avg = 0;
+	}
+
+	cfs_rq->load_stamp = now;
+	cfs_rq->load_unacc_exec_time = 0;
+	cfs_rq->load_period += delta;
+	if (load) {
+		cfs_rq->load_last = now;
+		cfs_rq->load_avg += delta * load;
+	}
+
+	/* consider updating load contribution on each fold or truncate */
+	if (global_update || cfs_rq->load_period > period
+	    || !cfs_rq->load_period)
+		update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+	while (cfs_rq->load_period > period) {
+		/*
+		 * Inline assembly required to prevent the compiler
+		 * optimising this loop into a divmod call.
+		 * See __iter_div_u64_rem() for another example of this.
+		 */
+		asm("" : "+rm" (cfs_rq->load_period));
+		cfs_rq->load_period /= 2;
+		cfs_rq->load_avg /= 2;
+	}
+
+	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+		list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			    unsigned long weight)
+{
+	if (se->on_rq) {
+		/* commit outstanding execution time */
+		if (cfs_rq->curr == se)
+			update_curr(cfs_rq);
+		account_entity_dequeue(cfs_rq, se);
+	}
+
+	update_load_set(&se->load, weight);
+
+	if (se->on_rq)
+		account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	long load_weight, load, shares;
+
+	if (!cfs_rq)
+		return;
+
+	tg = cfs_rq->tg;
+	se = tg->se[cpu_of(rq_of(cfs_rq))];
+	if (!se)
+		return;
+
+	load = cfs_rq->load.weight + weight_delta;
+
+	load_weight = atomic_read(&tg->load_weight);
+	load_weight -= cfs_rq->load_contribution;
+	load_weight += load;
+
+	shares = (tg->shares * load);
+	if (load_weight)
+		shares /= load_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	if (shares > tg->shares)
+		shares = tg->shares;
+
+	reweight_entity(cfs_rq_of(se), se, shares);
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_cfs_load(cfs_rq, 0);
+	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
 
 	if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
+	se->on_rq = 1;
+
+	if (cfs_rq->nr_running == 1)
+		list_add_leaf_cfs_rq(cfs_rq);
 }
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
+	se->on_rq = 0;
+	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq, 0);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 */
 	update_curr(cfs_rq);
 
+	/*
+	 * Update share accounting for long-running entities.
+	 */
+	update_entity_shares_tick(cfs_rq);
+
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		flags = ENQUEUE_WAKEUP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight)
 			break;
 		flags |= DEQUEUE_SLEEP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
  */
-static long effective_load(struct task_group *tg, int cpu,
-		long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
 	if (!tg->parent)
 		return wl;
 
-	/*
-	 * By not taking the decrease of shares on the other cpu into
-	 * account our error leans towards reducing the affine wakeups.
-	 */
-	if (!wl && sched_feat(ASYM_EFF_LOAD))
-		return wl;
-
 	for_each_sched_entity(se) {
 		long S, rw, s, a, b;
-		long more_w;
-
-		/*
-		 * Instead of using this increment, also add the difference
-		 * between when the shares were last updated and now.
-		 */
-		more_w = se->my_q->load.weight - se->my_q->rq_weight;
-		wl += more_w;
-		wg += more_w;
 
 		S = se->my_q->tg->shares;
-		s = se->my_q->shares;
-		rw = se->my_q->rq_weight;
+		s = se->load.weight;
+		rw = se->my_q->load.weight;
 
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 			sd = tmp;
 	}
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (sched_feat(LB_SHARES_UPDATE)) {
-		/*
-		 * Pick the largest domain to update shares over
-		 */
-		tmp = sd;
-		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-			tmp = affine_sd;
-
-		if (tmp) {
-			raw_spin_unlock(&rq->lock);
-			update_shares(tmp);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-#endif
-
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
 			return select_idle_sibling(p, cpu);
@@ -1909,6 +2071,48 @@ out:
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (!tg->se[cpu])
+		return 0;
+
+	rq = cpu_rq(cpu);
+	cfs_rq = tg->cfs_rq[cpu];
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	update_rq_clock(rq);
+	update_cfs_load(cfs_rq, 1);
+
+	/*
+	 * We need to update shares after updating tg->load_weight in
+	 * order to adjust the weight of groups with long running tasks.
+	 */
+	update_cfs_shares(cfs_rq, 0);
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return 0;
+}
+
+static void update_shares(int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	struct rq *rq = cpu_rq(cpu);
+
+	rcu_read_lock();
+	for_each_leaf_cfs_rq(rq, cfs_rq)
+		update_shares_cpu(cfs_rq->tg, cpu);
+	rcu_read_unlock();
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return max_load_move - rem_load_move;
 }
 #else
+static inline void update_shares(int cpu)
+{
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3174,8 +3381,6 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (ld_moved)
-		update_shares(sd);
 	return ld_moved;
 }
 
@@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	 */
 	raw_spin_unlock(&this_rq->lock);
 
+	update_shares(this_cpu);
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int balance = 1;
@@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;
 
+	update_shares(cpu);
+
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
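For readers skimming the patch: the core of the new update_cfs_load() above is an exponentially decaying average over sysctl_sched_shares_window. load_avg accumulates weight*time, load_period accumulates elapsed time, and both are halved once per full window. The stand-alone, user-space C sketch below (hypothetical names, not part of the patch) models only that folding; it leaves out the 4-idle-period truncation and the asm() barrier the kernel uses to keep the halving loop from being compiled into a divmod.

/*
 * Minimal user-space model (not kernel code) of the folding scheme in
 * update_cfs_load(): accumulate weight*time, then halve both the sum and
 * the elapsed time once per full window, giving an exponentially decaying
 * average of the queue weight.  All names here are illustrative only.
 */
#include <stdio.h>
#include <stdint.h>

#define SHARES_WINDOW_NS 10000000ULL	/* 10 msec, mirrors the new sysctl */

struct toy_cfs_rq {
	uint64_t load_avg;	/* decayed sum of weight * runtime */
	uint64_t load_period;	/* decayed sum of elapsed time */
	uint64_t load_stamp;	/* last time the average was folded */
	unsigned long weight;	/* current queue weight */
};

/* account the time since load_stamp at the current weight, then fold */
static void toy_update_load(struct toy_cfs_rq *q, uint64_t now)
{
	uint64_t delta = now - q->load_stamp;

	q->load_stamp = now;
	q->load_period += delta;
	if (q->weight)
		q->load_avg += delta * q->weight;

	/* halve history once per full window, as the kernel loop does */
	while (q->load_period > SHARES_WINDOW_NS) {
		q->load_period /= 2;
		q->load_avg /= 2;
	}
}

/* average weight over the window, matching load_avg / (load_period + 1) */
static uint64_t toy_load_contribution(const struct toy_cfs_rq *q)
{
	return q->load_avg / (q->load_period + 1);
}

int main(void)
{
	struct toy_cfs_rq q = { .weight = 1024 };
	uint64_t now = 0;

	/* run fully loaded for 5 windows: the average settles near 1024 */
	for (int i = 0; i < 5; i++) {
		now += SHARES_WINDOW_NS;
		toy_update_load(&q, now);
		printf("after window %d: avg weight ~ %llu\n",
		       i + 1, (unsigned long long)toy_load_contribution(&q));
	}

	/* go idle for 5 windows: the average halves each window */
	q.weight = 0;
	for (int i = 0; i < 5; i++) {
		now += SHARES_WINDOW_NS;
		toy_update_load(&q, now);
		printf("idle window %d: avg weight ~ %llu\n",
		       i + 1, (unsigned long long)toy_load_contribution(&q));
	}

	return 0;
}

Run fully loaded, the average converges on the queue weight; once the queue goes idle it decays by half every window, which is what lets the periodic update_shares() fold shrink a group's contribution soon after its tasks stop running.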
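The new update_cfs_shares() computes a group entity's per-cpu weight as the group's total shares scaled by this cpu's fraction of the group load, clamped to [MIN_SHARES, tg->shares]. A stand-alone sketch of that arithmetic follows (illustrative names only; MIN_SHARES == 2 is assumed here to match the kernel of this period):

/*
 * Stand-alone illustration (not kernel code) of the arithmetic in
 * update_cfs_shares(): scale the group's shares by this cpu's fraction of
 * the group-wide load, clamped to [MIN_SHARES, tg_shares].
 */
#include <stdio.h>

#define MIN_SHARES 2	/* assumed value, matching kernel/sched.c of this era */

static long calc_group_shares(long tg_shares, long tg_load_weight,
			      long cfs_rq_contrib, long cfs_rq_load)
{
	/* replace this cpu's stale contribution with its current load */
	long load_weight = tg_load_weight - cfs_rq_contrib + cfs_rq_load;
	long shares = tg_shares * cfs_rq_load;

	if (load_weight)
		shares /= load_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;

	return shares;
}

int main(void)
{
	/*
	 * A group with 1024 shares spread over two cpus: this cpu carries
	 * 3072 of 4096 total group load, so its entity gets 3/4 of the
	 * shares (768); the quieter cpu's entity gets the remaining 256.
	 */
	printf("busy cpu:  %ld\n", calc_group_shares(1024, 4096, 3072, 3072));
	printf("quiet cpu: %ld\n", calc_group_shares(1024, 4096, 1024, 1024));
	return 0;
}

This proportional split (768 + 256 = 1024) is the quantity the simplified effective_load() then reasons about when it estimates how a wakeup's extra weight would shift shares between cpus.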