Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--   kernel/sched/fair.c   912
1 file changed, 745 insertions(+), 167 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f936552b3db..59e072b2db9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -259,6 +259,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)  	return grp->my_q;  } +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, +				       int force_update); +  static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)  {  	if (!cfs_rq->on_list) { @@ -278,6 +281,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)  		}  		cfs_rq->on_list = 1; +		/* We should have no load, but we need to update last_decay. */ +		update_cfs_rq_blocked_load(cfs_rq, 0);  	}  } @@ -653,9 +658,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)  	return calc_delta_fair(sched_slice(cfs_rq, se), se);  } -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq); -  /*   * Update the current task's runtime statistics. Skip current tasks that   * are not in our scheduling class. @@ -675,10 +677,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,  	curr->vruntime += delta_exec_weighted;  	update_min_vruntime(cfs_rq); - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -	cfs_rq->load_unacc_exec_time += delta_exec; -#endif  }  static void update_curr(struct cfs_rq *cfs_rq) @@ -801,72 +799,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)  }  #ifdef CONFIG_FAIR_GROUP_SCHED -/* we need this in update_cfs_load and load-balance functions below */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);  # ifdef CONFIG_SMP -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, -					    int global_update) -{ -	struct task_group *tg = cfs_rq->tg; -	long load_avg; - -	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); -	load_avg -= cfs_rq->load_contribution; - -	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { -		atomic_add(load_avg, &tg->load_weight); -		cfs_rq->load_contribution += load_avg; -	} -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -	u64 period = sysctl_sched_shares_window; -	u64 now, delta; -	unsigned long load = cfs_rq->load.weight; - -	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) -		return; - -	now = rq_of(cfs_rq)->clock_task; -	delta = now - cfs_rq->load_stamp; - -	/* truncate load history at 4 idle periods */ -	if (cfs_rq->load_stamp > cfs_rq->load_last && -	    now - cfs_rq->load_last > 4 * period) { -		cfs_rq->load_period = 0; -		cfs_rq->load_avg = 0; -		delta = period - 1; -	} - -	cfs_rq->load_stamp = now; -	cfs_rq->load_unacc_exec_time = 0; -	cfs_rq->load_period += delta; -	if (load) { -		cfs_rq->load_last = now; -		cfs_rq->load_avg += delta * load; -	} - -	/* consider updating load contribution on each fold or truncate */ -	if (global_update || cfs_rq->load_period > period -	    || !cfs_rq->load_period) -		update_cfs_rq_load_contribution(cfs_rq, global_update); - -	while (cfs_rq->load_period > period) { -		/* -		 * Inline assembly required to prevent the compiler -		 * optimising this loop into a divmod call. -		 * See __iter_div_u64_rem() for another example of this. 
-		 */ -		asm("" : "+rm" (cfs_rq->load_period)); -		cfs_rq->load_period /= 2; -		cfs_rq->load_avg /= 2; -	} - -	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) -		list_del_leaf_cfs_rq(cfs_rq); -} -  static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)  {  	long tg_weight; @@ -876,8 +809,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)  	 * to gain a more accurate current total weight. See  	 * update_cfs_rq_load_contribution().  	 */ -	tg_weight = atomic_read(&tg->load_weight); -	tg_weight -= cfs_rq->load_contribution; +	tg_weight = atomic64_read(&tg->load_avg); +	tg_weight -= cfs_rq->tg_load_contrib;  	tg_weight += cfs_rq->load.weight;  	return tg_weight; @@ -901,27 +834,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)  	return shares;  } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { -		update_cfs_load(cfs_rq, 0); -		update_cfs_shares(cfs_rq); -	} -}  # else /* CONFIG_SMP */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} -  static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)  {  	return tg->shares;  } - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -}  # endif /* CONFIG_SMP */  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,  			    unsigned long weight) @@ -939,6 +856,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,  		account_entity_enqueue(cfs_rq, se);  } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); +  static void update_cfs_shares(struct cfs_rq *cfs_rq)  {  	struct task_group *tg; @@ -958,18 +877,478 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)  	reweight_entity(cfs_rq_of(se), se, shares);  }  #else /* CONFIG_FAIR_GROUP_SCHED */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq)  {  } +#endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ + +/* Precomputed fixed inverse multiplies for multiplication by y^n */ +static const u32 runnable_avg_yN_inv[] = { +	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, +	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, +	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, +	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, +	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, +	0x85aac367, 0x82cd8698, +}; + +/* + * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent + * over-estimates when re-combining. 
+ */ +static const u32 runnable_avg_yN_sum[] = { +	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, +	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, +	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, +}; + +/* + * Approximate: + *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period) + */ +static __always_inline u64 decay_load(u64 val, u64 n)  { +	unsigned int local_n; + +	if (!n) +		return val; +	else if (unlikely(n > LOAD_AVG_PERIOD * 63)) +		return 0; + +	/* after bounds checking we can collapse to 32-bit */ +	local_n = n; + +	/* +	 * As y^PERIOD = 1/2, we can combine +	 *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) +	 * With a look-up table which covers k^n (n<PERIOD) +	 * +	 * To achieve constant time decay_load. +	 */ +	if (unlikely(local_n >= LOAD_AVG_PERIOD)) { +		val >>= local_n / LOAD_AVG_PERIOD; +		local_n %= LOAD_AVG_PERIOD; +	} + +	val *= runnable_avg_yN_inv[local_n]; +	/* We don't use SRR here since we always want to round down. */ +	return val >> 32;  } -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +/* + * For updates fully spanning n periods, the contribution to runnable + * average will be: \Sum 1024*y^n + * + * We can compute this reasonably efficiently by combining: + *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD} + */ +static u32 __compute_runnable_contrib(u64 n)  { +	u32 contrib = 0; + +	if (likely(n <= LOAD_AVG_PERIOD)) +		return runnable_avg_yN_sum[n]; +	else if (unlikely(n >= LOAD_AVG_MAX_N)) +		return LOAD_AVG_MAX; + +	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ +	do { +		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ +		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; + +		n -= LOAD_AVG_PERIOD; +	} while (n > LOAD_AVG_PERIOD); + +	contrib = decay_load(contrib, n); +	return contrib + runnable_avg_yN_sum[n];  } -#endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* + * We can represent the historical contribution to runnable average as the + * coefficients of a geometric series.  To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + *      p0            p1           p2 + *     (now)       (~1ms ago)  (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + *   y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] + */ +static __always_inline int __update_entity_runnable_avg(u64 now, +							struct sched_avg *sa, +							int runnable) +{ +	u64 delta, periods; +	u32 runnable_contrib; +	int delta_w, decayed = 0; + +	delta = now - sa->last_runnable_update; +	/* +	 * This should only happen when time goes backwards, which it +	 * unfortunately does during sched clock init when we swap over to TSC. 
+	 */ +	if ((s64)delta < 0) { +		sa->last_runnable_update = now; +		return 0; +	} + +	/* +	 * Use 1024ns as the unit of measurement since it's a reasonable +	 * approximation of 1us and fast to compute. +	 */ +	delta >>= 10; +	if (!delta) +		return 0; +	sa->last_runnable_update = now; + +	/* delta_w is the amount already accumulated against our next period */ +	delta_w = sa->runnable_avg_period % 1024; +	if (delta + delta_w >= 1024) { +		/* period roll-over */ +		decayed = 1; + +		/* +		 * Now that we know we're crossing a period boundary, figure +		 * out how much from delta we need to complete the current +		 * period and accrue it. +		 */ +		delta_w = 1024 - delta_w; +		if (runnable) +			sa->runnable_avg_sum += delta_w; +		sa->runnable_avg_period += delta_w; + +		delta -= delta_w; + +		/* Figure out how many additional periods this update spans */ +		periods = delta / 1024; +		delta %= 1024; + +		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, +						  periods + 1); +		sa->runnable_avg_period = decay_load(sa->runnable_avg_period, +						     periods + 1); + +		/* Efficiently calculate \sum (1..n_period) 1024*y^i */ +		runnable_contrib = __compute_runnable_contrib(periods); +		if (runnable) +			sa->runnable_avg_sum += runnable_contrib; +		sa->runnable_avg_period += runnable_contrib; +	} + +	/* Remainder of delta accrued against u_0` */ +	if (runnable) +		sa->runnable_avg_sum += delta; +	sa->runnable_avg_period += delta; + +	return decayed; +} + +/* Synchronize an entity's decay with its parenting cfs_rq.*/ +static inline u64 __synchronize_entity_decay(struct sched_entity *se) +{ +	struct cfs_rq *cfs_rq = cfs_rq_of(se); +	u64 decays = atomic64_read(&cfs_rq->decay_counter); + +	decays -= se->avg.decay_count; +	if (!decays) +		return 0; + +	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); +	se->avg.decay_count = 0; + +	return decays; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, +						 int force_update) +{ +	struct task_group *tg = cfs_rq->tg; +	s64 tg_contrib; + +	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; +	tg_contrib -= cfs_rq->tg_load_contrib; + +	if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { +		atomic64_add(tg_contrib, &tg->load_avg); +		cfs_rq->tg_load_contrib += tg_contrib; +	} +} + +/* + * Aggregate cfs_rq runnable averages into an equivalent task_group + * representation for computing load contributions. + */ +static inline void __update_tg_runnable_avg(struct sched_avg *sa, +						  struct cfs_rq *cfs_rq) +{ +	struct task_group *tg = cfs_rq->tg; +	long contrib; + +	/* The fraction of a cpu used by this cfs_rq */ +	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, +			  sa->runnable_avg_period + 1); +	contrib -= cfs_rq->tg_runnable_contrib; + +	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { +		atomic_add(contrib, &tg->runnable_avg); +		cfs_rq->tg_runnable_contrib += contrib; +	} +} + +static inline void __update_group_entity_contrib(struct sched_entity *se) +{ +	struct cfs_rq *cfs_rq = group_cfs_rq(se); +	struct task_group *tg = cfs_rq->tg; +	int runnable_avg; + +	u64 contrib; + +	contrib = cfs_rq->tg_load_contrib * tg->shares; +	se->avg.load_avg_contrib = div64_u64(contrib, +					     atomic64_read(&tg->load_avg) + 1); + +	/* +	 * For group entities we need to compute a correction term in the case +	 * that they are consuming <1 cpu so that we would contribute the same +	 * load as a task of equal weight. 
+	 * +	 * Explicitly co-ordinating this measurement would be expensive, but +	 * fortunately the sum of each cpus contribution forms a usable +	 * lower-bound on the true value. +	 * +	 * Consider the aggregate of 2 contributions.  Either they are disjoint +	 * (and the sum represents true value) or they are disjoint and we are +	 * understating by the aggregate of their overlap. +	 * +	 * Extending this to N cpus, for a given overlap, the maximum amount we +	 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of +	 * cpus that overlap for this interval and w_i is the interval width. +	 * +	 * On a small machine; the first term is well-bounded which bounds the +	 * total error since w_i is a subset of the period.  Whereas on a +	 * larger machine, while this first term can be larger, if w_i is the +	 * of consequential size guaranteed to see n_i*w_i quickly converge to +	 * our upper bound of 1-cpu. +	 */ +	runnable_avg = atomic_read(&tg->runnable_avg); +	if (runnable_avg < NICE_0_LOAD) { +		se->avg.load_avg_contrib *= runnable_avg; +		se->avg.load_avg_contrib >>= NICE_0_SHIFT; +	} +} +#else +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, +						 int force_update) {} +static inline void __update_tg_runnable_avg(struct sched_avg *sa, +						  struct cfs_rq *cfs_rq) {} +static inline void __update_group_entity_contrib(struct sched_entity *se) {} +#endif + +static inline void __update_task_entity_contrib(struct sched_entity *se) +{ +	u32 contrib; + +	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ +	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); +	contrib /= (se->avg.runnable_avg_period + 1); +	se->avg.load_avg_contrib = scale_load(contrib); +} + +/* Compute the current contribution to load_avg by se, return any delta */ +static long __update_entity_load_avg_contrib(struct sched_entity *se) +{ +	long old_contrib = se->avg.load_avg_contrib; + +	if (entity_is_task(se)) { +		__update_task_entity_contrib(se); +	} else { +		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); +		__update_group_entity_contrib(se); +	} + +	return se->avg.load_avg_contrib - old_contrib; +} + +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, +						 long load_contrib) +{ +	if (likely(load_contrib < cfs_rq->blocked_load_avg)) +		cfs_rq->blocked_load_avg -= load_contrib; +	else +		cfs_rq->blocked_load_avg = 0; +} + +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); + +/* Update a sched_entity's runnable average */ +static inline void update_entity_load_avg(struct sched_entity *se, +					  int update_cfs_rq) +{ +	struct cfs_rq *cfs_rq = cfs_rq_of(se); +	long contrib_delta; +	u64 now; + +	/* +	 * For a group entity we need to use their owned cfs_rq_clock_task() in +	 * case they are the parent of a throttled hierarchy. +	 */ +	if (entity_is_task(se)) +		now = cfs_rq_clock_task(cfs_rq); +	else +		now = cfs_rq_clock_task(group_cfs_rq(se)); + +	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) +		return; + +	contrib_delta = __update_entity_load_avg_contrib(se); + +	if (!update_cfs_rq) +		return; + +	if (se->on_rq) +		cfs_rq->runnable_load_avg += contrib_delta; +	else +		subtract_blocked_load_contrib(cfs_rq, -contrib_delta); +} + +/* + * Decay the load contributed by all blocked children and account this so that + * their contribution may appropriately discounted when they wake up. 
+ */ +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +{ +	u64 now = cfs_rq_clock_task(cfs_rq) >> 20; +	u64 decays; + +	decays = now - cfs_rq->last_decay; +	if (!decays && !force_update) +		return; + +	if (atomic64_read(&cfs_rq->removed_load)) { +		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); +		subtract_blocked_load_contrib(cfs_rq, removed_load); +	} + +	if (decays) { +		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, +						      decays); +		atomic64_add(decays, &cfs_rq->decay_counter); +		cfs_rq->last_decay = now; +	} + +	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update); +	update_cfs_shares(cfs_rq); +} + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ +	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); +	__update_tg_runnable_avg(&rq->avg, &rq->cfs); +} + +/* Add the load generated by se into cfs_rq's child load-average */ +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, +						  struct sched_entity *se, +						  int wakeup) +{ +	/* +	 * We track migrations using entity decay_count <= 0, on a wake-up +	 * migration we use a negative decay count to track the remote decays +	 * accumulated while sleeping. +	 */ +	if (unlikely(se->avg.decay_count <= 0)) { +		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; +		if (se->avg.decay_count) { +			/* +			 * In a wake-up migration we have to approximate the +			 * time sleeping.  This is because we can't synchronize +			 * clock_task between the two cpus, and it is not +			 * guaranteed to be read-safe.  Instead, we can +			 * approximate this using our carried decays, which are +			 * explicitly atomically readable. +			 */ +			se->avg.last_runnable_update -= (-se->avg.decay_count) +							<< 20; +			update_entity_load_avg(se, 0); +			/* Indicate that we're now synchronized and on-rq */ +			se->avg.decay_count = 0; +		} +		wakeup = 0; +	} else { +		__synchronize_entity_decay(se); +	} + +	/* migrated tasks did not contribute to our blocked load */ +	if (wakeup) { +		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); +		update_entity_load_avg(se, 0); +	} + +	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; +	/* we force update consideration on load-balancer moves */ +	update_cfs_rq_blocked_load(cfs_rq, !wakeup); +} + +/* + * Remove se's load from this cfs_rq child load-average, if the entity is + * transitioning to a blocked state we track its projected decay using + * blocked_load_avg. + */ +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, +						  struct sched_entity *se, +						  int sleep) +{ +	update_entity_load_avg(se, 1); +	/* we force update consideration on load-balancer moves */ +	update_cfs_rq_blocked_load(cfs_rq, !sleep); + +	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; +	if (sleep) { +		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; +		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); +	} /* migrations, e.g. 
sleep=0 leave decay_count == 0 */ +} +#else +static inline void update_entity_load_avg(struct sched_entity *se, +					  int update_cfs_rq) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, +					   struct sched_entity *se, +					   int wakeup) {} +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, +					   struct sched_entity *se, +					   int sleep) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, +					      int force_update) {} +#endif  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  { @@ -1096,9 +1475,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	 * Update run-time statistics of the 'current'.  	 */  	update_curr(cfs_rq); -	update_cfs_load(cfs_rq, 0);  	account_entity_enqueue(cfs_rq, se); -	update_cfs_shares(cfs_rq); +	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);  	if (flags & ENQUEUE_WAKEUP) {  		place_entity(cfs_rq, se, 0); @@ -1190,9 +1568,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	if (se != cfs_rq->curr)  		__dequeue_entity(cfs_rq, se); -	se->on_rq = 0; -	update_cfs_load(cfs_rq, 0);  	account_entity_dequeue(cfs_rq, se); +	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);  	/*  	 * Normalize the entity after updating the min_vruntime because the @@ -1206,7 +1583,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	return_cfs_rq_runtime(cfs_rq);  	update_min_vruntime(cfs_rq); -	update_cfs_shares(cfs_rq); +	se->on_rq = 0;  }  /* @@ -1340,6 +1717,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)  		update_stats_wait_start(cfs_rq, prev);  		/* Put 'current' back into the tree. */  		__enqueue_entity(cfs_rq, prev); +		/* in !on_rq case, update occurred at dequeue */ +		update_entity_load_avg(prev, 1);  	}  	cfs_rq->curr = NULL;  } @@ -1353,9 +1732,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)  	update_curr(cfs_rq);  	/* -	 * Update share accounting for long-running entities. +	 * Ensure that runnable average is periodically updated.  	 
*/ -	update_entity_shares_tick(cfs_rq); +	update_entity_load_avg(curr, 1); +	update_cfs_rq_blocked_load(cfs_rq, 1);  #ifdef CONFIG_SCHED_HRTICK  	/* @@ -1448,6 +1828,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)  	return &tg->cfs_bandwidth;  } +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ +	if (unlikely(cfs_rq->throttle_count)) +		return cfs_rq->throttled_clock_task; + +	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; +} +  /* returns 0 on failure to allocate runtime */  static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)  { @@ -1592,14 +1981,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)  	cfs_rq->throttle_count--;  #ifdef CONFIG_SMP  	if (!cfs_rq->throttle_count) { -		u64 delta = rq->clock_task - cfs_rq->load_stamp; - -		/* leaving throttled state, advance shares averaging windows */ -		cfs_rq->load_stamp += delta; -		cfs_rq->load_last += delta; - -		/* update entity weight now that we are on_rq again */ -		update_cfs_shares(cfs_rq); +		/* adjust cfs_rq_clock_task() */ +		cfs_rq->throttled_clock_task_time += rq->clock_task - +					     cfs_rq->throttled_clock_task;  	}  #endif @@ -1611,9 +1995,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)  	struct rq *rq = data;  	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; -	/* group is entering throttled state, record last load */ +	/* group is entering throttled state, stop time */  	if (!cfs_rq->throttle_count) -		update_cfs_load(cfs_rq, 0); +		cfs_rq->throttled_clock_task = rq->clock_task;  	cfs_rq->throttle_count++;  	return 0; @@ -1628,7 +2012,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; -	/* account load preceding throttle */ +	/* freeze hierarchy runnable averages while throttled */  	rcu_read_lock();  	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);  	rcu_read_unlock(); @@ -1652,7 +2036,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  		rq->nr_running -= task_delta;  	cfs_rq->throttled = 1; -	cfs_rq->throttled_timestamp = rq->clock; +	cfs_rq->throttled_clock = rq->clock;  	raw_spin_lock(&cfs_b->lock);  	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);  	raw_spin_unlock(&cfs_b->lock); @@ -1670,10 +2054,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  	cfs_rq->throttled = 0;  	raw_spin_lock(&cfs_b->lock); -	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; +	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;  	list_del_rcu(&cfs_rq->throttled_list);  	raw_spin_unlock(&cfs_b->lock); -	cfs_rq->throttled_timestamp = 0;  	update_rq_clock(rq);  	/* update hierarchical throttle state */ @@ -2073,8 +2456,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)  }  #else /* CONFIG_CFS_BANDWIDTH */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ +	return rq_of(cfs_rq)->clock_task; +} + +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, +				     unsigned long delta_exec) {}  static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}  static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -2207,12 +2595,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		if 
(cfs_rq_throttled(cfs_rq))  			break; -		update_cfs_load(cfs_rq, 0); -		update_cfs_shares(cfs_rq); +		update_entity_load_avg(se, 1); +		update_cfs_rq_blocked_load(cfs_rq, 0);  	} -	if (!se) +	if (!se) { +		update_rq_runnable_avg(rq, rq->nr_running);  		inc_nr_running(rq); +	}  	hrtick_update(rq);  } @@ -2266,12 +2656,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		if (cfs_rq_throttled(cfs_rq))  			break; -		update_cfs_load(cfs_rq, 0); -		update_cfs_shares(cfs_rq); +		update_entity_load_avg(se, 1); +		update_cfs_rq_blocked_load(cfs_rq, 0);  	} -	if (!se) +	if (!se) {  		dec_nr_running(rq); +		update_rq_runnable_avg(rq, 1); +	}  	hrtick_update(rq);  } @@ -2781,6 +3173,37 @@ unlock:  	return new_cpu;  } + +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * cfs_rq_of(p) references at time of call are still valid and identify the + * previous cpu.  However, the caller only guarantees p->pi_lock is held; no + * other assumptions, including the state of rq->lock, should be made. + */ +static void +migrate_task_rq_fair(struct task_struct *p, int next_cpu) +{ +	struct sched_entity *se = &p->se; +	struct cfs_rq *cfs_rq = cfs_rq_of(se); + +	/* +	 * Load tracking: accumulate removed load so that it can be processed +	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute +	 * to blocked load iff they have a positive decay-count.  It can never +	 * be negative here since on-rq tasks have decay-count == 0. +	 */ +	if (se->avg.decay_count) { +		se->avg.decay_count = -__synchronize_entity_decay(se); +		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); +	} +} +#endif  #endif /* CONFIG_SMP */  static unsigned long @@ -3033,8 +3456,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp  #ifdef CONFIG_SMP  /************************************************** - * Fair scheduling class load-balancing methods: - */ + * Fair scheduling class load-balancing methods. + * + * BASICS + * + * The purpose of load-balancing is to achieve the same basic fairness the + * per-cpu scheduler provides, namely provide a proportional amount of compute + * time to each task. This is expressed in the following equation: + * + *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1) + * + * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight + * W_i,0 is defined as: + * + *   W_i,0 = \Sum_j w_i,j                                             (2) + * + * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight + * is derived from the nice value as per prio_to_weight[]. + * + * The weight average is an exponential decay average of the instantaneous + * weight: + * + *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3) + * + * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * fraction of 'recent' time available for SCHED_OTHER task execution. But it + * can also include other factors [XXX]. + * + * To achieve this balance we define a measure of imbalance which follows + * directly from (1): + * + *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4) + * + * We them move tasks around to minimize the imbalance. 
In the continuous + * function space it is obvious this converges, in the discrete case we get + * a few fun cases generally called infeasible weight scenarios. + * + * [XXX expand on: + *     - infeasible weights; + *     - local vs global optima in the discrete case. ] + * + * + * SCHED DOMAINS + * + * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) + * for all i,j solution, we create a tree of cpus that follows the hardware + * topology where each level pairs two lower groups (or better). This results + * in O(log n) layers. Furthermore we reduce the number of cpus going up the + * tree to only the first of the previous level and we decrease the frequency + * of load-balance at each level inv. proportional to the number of cpus in + * the groups. + * + * This yields: + * + *     log_2 n     1     n + *   \Sum       { --- * --- * 2^i } = O(n)                            (5) + *     i = 0      2^i   2^i + *                               `- size of each group + *         |         |     `- number of cpus doing load-balance + *         |         `- freq + *         `- sum over all levels + * + * Coupled with a limit on how many tasks we can migrate every balance pass, + * this makes (5) the runtime complexity of the balancer. + * + * An important property here is that each CPU is still (indirectly) connected + * to every other cpu in at most O(log n) steps: + * + * The adjacency matrix of the resulting graph is given by: + * + *             log_2 n      + *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6) + *             k = 0 + * + * And you'll find that: + * + *   A^(log_2 n)_i,j != 0  for all i,j                                (7) + * + * Showing there's indeed a path between every cpu in at most O(log n) steps. + * The task movement gives a factor of O(m), giving a convergence complexity + * of: + * + *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8) + * + * + * WORK CONSERVING + * + * In order to avoid CPUs going idle while there's still work to do, new idle + * balancing is more aggressive and has the newly idle cpu iterate up the domain + * tree itself instead of relying on other CPUs to bring it work. + * + * This adds some complexity to both (5) and (8) but it reduces the total idle + * time. + * + * [XXX more?] + * + * + * CGROUPS + * + * Cgroups make a horror show out of (2), instead of a simple sum we get: + * + *                                s_k,i + *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9) + *                                 S_k + * + * Where + * + *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10) + * + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. + * + * The big problem is S_k, its a global sum needed to compute a local (W_i) + * property. + * + * [XXX write more on how we solve this.. _after_ merging pjt's patches that + *      rewrite all of this once again.] 
+ */   static unsigned long __read_mostly max_load_balance_interval = HZ/10; @@ -3300,52 +3837,58 @@ next:  /*   * update tg->load_weight by folding this cpu's load_avg   */ -static int update_shares_cpu(struct task_group *tg, int cpu) +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)  { -	struct cfs_rq *cfs_rq; -	unsigned long flags; -	struct rq *rq; - -	if (!tg->se[cpu]) -		return 0; - -	rq = cpu_rq(cpu); -	cfs_rq = tg->cfs_rq[cpu]; - -	raw_spin_lock_irqsave(&rq->lock, flags); - -	update_rq_clock(rq); -	update_cfs_load(cfs_rq, 1); +	struct sched_entity *se = tg->se[cpu]; +	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; -	/* -	 * We need to update shares after updating tg->load_weight in -	 * order to adjust the weight of groups with long running tasks. -	 */ -	update_cfs_shares(cfs_rq); +	/* throttled entities do not contribute to load */ +	if (throttled_hierarchy(cfs_rq)) +		return; -	raw_spin_unlock_irqrestore(&rq->lock, flags); +	update_cfs_rq_blocked_load(cfs_rq, 1); -	return 0; +	if (se) { +		update_entity_load_avg(se, 1); +		/* +		 * We pivot on our runnable average having decayed to zero for +		 * list removal.  This generally implies that all our children +		 * have also been removed (modulo rounding error or bandwidth +		 * control); however, such cases are rare and we can fix these +		 * at enqueue. +		 * +		 * TODO: fix up out-of-order children on enqueue. +		 */ +		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) +			list_del_leaf_cfs_rq(cfs_rq); +	} else { +		struct rq *rq = rq_of(cfs_rq); +		update_rq_runnable_avg(rq, rq->nr_running); +	}  } -static void update_shares(int cpu) +static void update_blocked_averages(int cpu)  { -	struct cfs_rq *cfs_rq;  	struct rq *rq = cpu_rq(cpu); +	struct cfs_rq *cfs_rq; +	unsigned long flags; -	rcu_read_lock(); +	raw_spin_lock_irqsave(&rq->lock, flags); +	update_rq_clock(rq);  	/*  	 * Iterates the task_group tree in a bottom up fashion, see  	 * list_add_leaf_cfs_rq() for details.  	 */  	for_each_leaf_cfs_rq(rq, cfs_rq) { -		/* throttled entities do not contribute to load */ -		if (throttled_hierarchy(cfs_rq)) -			continue; - -		update_shares_cpu(cfs_rq->tg, cpu); +		/* +		 * Note: We may want to consider periodically releasing +		 * rq->lock about these updates so that creating many task +		 * groups does not result in continually extending hold time. +		 */ +		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);  	} -	rcu_read_unlock(); + +	raw_spin_unlock_irqrestore(&rq->lock, flags);  }  /* @@ -3397,7 +3940,7 @@ static unsigned long task_h_load(struct task_struct *p)  	return load;  }  #else -static inline void update_shares(int cpu) +static inline void update_blocked_averages(int cpu)  {  } @@ -4457,12 +5000,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)  	if (this_rq->avg_idle < sysctl_sched_migration_cost)  		return; +	update_rq_runnable_avg(this_rq, 1); +  	/*  	 * Drop the rq->lock, but keep IRQ/preempt disabled.  	 
*/  	raw_spin_unlock(&this_rq->lock); -	update_shares(this_cpu); +	update_blocked_averages(this_cpu);  	rcu_read_lock();  	for_each_domain(this_cpu, sd) {  		unsigned long interval; @@ -4717,7 +5262,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)  	int update_next_balance = 0;  	int need_serialize; -	update_shares(cpu); +	update_blocked_averages(cpu);  	rcu_read_lock();  	for_each_domain(cpu, sd) { @@ -4954,6 +5499,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)  		cfs_rq = cfs_rq_of(se);  		entity_tick(cfs_rq, se, queued);  	} + +	update_rq_runnable_avg(rq, 1);  }  /* @@ -5046,6 +5593,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)  		place_entity(cfs_rq, se, 0);  		se->vruntime -= cfs_rq->min_vruntime;  	} + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +	/* +	* Remove our load from contribution when we leave sched_fair +	* and ensure we don't carry in an old decay_count if we +	* switch back. +	*/ +	if (p->se.avg.decay_count) { +		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); +		__synchronize_entity_decay(&p->se); +		subtract_blocked_load_contrib(cfs_rq, +				p->se.avg.load_avg_contrib); +	} +#endif  }  /* @@ -5092,11 +5653,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)  #ifndef CONFIG_64BIT  	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;  #endif +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +	atomic64_set(&cfs_rq->decay_counter, 1); +	atomic64_set(&cfs_rq->removed_load, 0); +#endif  }  #ifdef CONFIG_FAIR_GROUP_SCHED  static void task_move_group_fair(struct task_struct *p, int on_rq)  { +	struct cfs_rq *cfs_rq;  	/*  	 * If the task was not on the rq at the time of this cgroup movement  	 * it must have been asleep, sleeping tasks keep their ->vruntime @@ -5128,8 +5694,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)  	if (!on_rq)  		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;  	set_task_rq(p, task_cpu(p)); -	if (!on_rq) -		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; +	if (!on_rq) { +		cfs_rq = cfs_rq_of(&p->se); +		p->se.vruntime += cfs_rq->min_vruntime; +#ifdef CONFIG_SMP +		/* +		 * migrate_task_rq_fair() will have removed our previous +		 * contribution, but we must synchronize for ongoing future +		 * decay. +		 */ +		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); +		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; +#endif +	}  }  void free_fair_sched_group(struct task_group *tg) @@ -5214,10 +5791,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,  	cfs_rq->tg = tg;  	cfs_rq->rq = rq; -#ifdef CONFIG_SMP -	/* allow initial update_cfs_load() to truncate */ -	cfs_rq->load_stamp = 1; -#endif  	init_cfs_rq_runtime(cfs_rq);  	tg->cfs_rq[cpu] = cfs_rq; @@ -5264,8 +5837,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)  		se = tg->se[i];  		/* Propagate contribution to hierarchy */  		raw_spin_lock_irqsave(&rq->lock, flags); -		for_each_sched_entity(se) +		for_each_sched_entity(se) {  			update_cfs_shares(group_cfs_rq(se)); +			/* update contribution to parent */ +			update_entity_load_avg(se, 1); +		}  		raw_spin_unlock_irqrestore(&rq->lock, flags);  	} @@ -5319,7 +5895,9 @@ const struct sched_class fair_sched_class = {  #ifdef CONFIG_SMP  	.select_task_rq		= select_task_rq_fair, - +#ifdef CONFIG_FAIR_GROUP_SCHED +	.migrate_task_rq	= migrate_task_rq_fair, +#endif  	.rq_online		= rq_online_fair,  	.rq_offline		= rq_offline_fair,  |
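
The decay arithmetic introduced above (runnable_avg_yN_inv[], decay_load() and the LOAD_AVG_* constants) can be exercised outside the kernel. The stand-alone C sketch below is not part of the patch: it copies the inverse-multiplier table from the hunk above, re-implements decay_load() in user space, and iterates the per-period recurrence sum = sum*y + 1024 to show it saturating at LOAD_AVG_MAX. The main() driver and its sample values are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX    47742
#define LOAD_AVG_MAX_N  345

/* copied from the patch: runnable_avg_yN_inv[k] ~= y^k * 2^32, with y^32 = 1/2 */
static const uint32_t runnable_avg_yN_inv[] = {
        0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
        0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
        0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
        0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
        0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
        0x85aac367, 0x82cd8698,
};

/* val * y^n: whole periods become right shifts, the remainder one 32.32 multiply */
static uint64_t decay_load(uint64_t val, uint64_t n)
{
        if (!n)
                return val;
        if (n > LOAD_AVG_PERIOD * 63)
                return 0;

        val >>= n / LOAD_AVG_PERIOD;
        n %= LOAD_AVG_PERIOD;

        return (val * runnable_avg_yN_inv[n]) >> 32;
}

int main(void)
{
        uint64_t sum = 0;
        int i;

        /* one and two half-lives: roughly 1/2 and 1/4 of the input */
        printf("decay_load(47742, 32) = %llu\n",
               (unsigned long long)decay_load(47742, 32));
        printf("decay_load(47742, 64) = %llu\n",
               (unsigned long long)decay_load(47742, 64));

        /*
         * An always-runnable entity accrues 1024 per 1024us period and is
         * decayed by y each period; well past LOAD_AVG_MAX_N periods the sum
         * has saturated at LOAD_AVG_MAX.
         */
        for (i = 0; i < 1000; i++)
                sum = decay_load(sum, 1) + 1024;
        printf("saturated runnable_avg_sum = %llu (LOAD_AVG_MAX = %d)\n",
               (unsigned long long)sum, LOAD_AVG_MAX);

        return 0;
}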
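
__update_task_entity_contrib() above derives a task's load_avg_contrib by scaling its weight with the tracked runnable fraction, runnable_avg_sum / (runnable_avg_period + 1). A minimal sketch of that arithmetic follows, using made-up numbers (a nice-0 task with weight 1024 that has been runnable for roughly half of the tracked window); the figures are hypothetical, not taken from the patch.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* hypothetical figures: nice-0 weight, runnable ~50% of tracked time */
        uint64_t weight = 1024;                 /* scale_load_down(se->load.weight) */
        uint64_t runnable_avg_sum = 23871;      /* ~half of LOAD_AVG_MAX */
        uint64_t runnable_avg_period = 47742;

        /* mirrors __update_task_entity_contrib() from the hunk above */
        uint64_t contrib = runnable_avg_sum * weight / (runnable_avg_period + 1);

        printf("load_avg_contrib = %llu\n", (unsigned long long)contrib); /* ~512 */
        return 0;
}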
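
The blocked-load bookkeeping (dequeue_entity_load_avg(), update_cfs_rq_blocked_load() and __synchronize_entity_decay()) relies on the per-cfs_rq decay_counter so that a sleeping entity's stale contribution is decayed by exactly the periods it missed before being removed from blocked_load_avg. The toy model below is a simplification, not kernel code: decay is rounded to whole half-lives, there are no atomics, and the toy_* structures and helpers are invented for illustration; only the counter handshake mirrors the patch.

#include <stdint.h>
#include <stdio.h>

struct toy_cfs_rq {
        uint64_t blocked_load_avg;
        uint64_t decay_counter;         /* ~1ms decay periods seen so far */
};

struct toy_se {
        uint64_t load_avg_contrib;
        uint64_t decay_count;           /* decay_counter snapshot at dequeue */
};

/* crude stand-in for decay_load(): only whole half-lives (y^32 = 1/2) */
static uint64_t toy_decay(uint64_t val, uint64_t periods)
{
        return val >> (periods / 32);
}

/* like dequeue_entity_load_avg(..., sleep=1): track the sleeper's projected decay */
static void toy_dequeue_sleep(struct toy_cfs_rq *cfs_rq, struct toy_se *se)
{
        cfs_rq->blocked_load_avg += se->load_avg_contrib;
        se->decay_count = cfs_rq->decay_counter;
}

/* like update_cfs_rq_blocked_load(): decay blocked load, advance the counter */
static void toy_advance(struct toy_cfs_rq *cfs_rq, uint64_t periods)
{
        cfs_rq->blocked_load_avg = toy_decay(cfs_rq->blocked_load_avg, periods);
        cfs_rq->decay_counter += periods;
}

/* wake-up enqueue: __synchronize_entity_decay() + subtract_blocked_load_contrib() */
static void toy_enqueue_wakeup(struct toy_cfs_rq *cfs_rq, struct toy_se *se)
{
        uint64_t missed = cfs_rq->decay_counter - se->decay_count;

        se->load_avg_contrib = toy_decay(se->load_avg_contrib, missed);
        se->decay_count = 0;

        if (se->load_avg_contrib < cfs_rq->blocked_load_avg)
                cfs_rq->blocked_load_avg -= se->load_avg_contrib;
        else
                cfs_rq->blocked_load_avg = 0;
}

int main(void)
{
        struct toy_cfs_rq cfs_rq = { 0, 0 };
        struct toy_se se = { 1024, 0 };

        toy_dequeue_sleep(&cfs_rq, &se);        /* sleep with a contribution of 1024 */
        toy_advance(&cfs_rq, 64);               /* ~64ms pass: two half-lives */
        toy_enqueue_wakeup(&cfs_rq, &se);

        /* both sides decayed in lockstep: contrib 256, no stale blocked load left */
        printf("contrib = %llu, blocked_load_avg = %llu\n",
               (unsigned long long)se.load_avg_contrib,
               (unsigned long long)cfs_rq.blocked_load_avg);
        return 0;
}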
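
cfs_rq_clock_task(), together with tg_throttle_down()/tg_unthrottle_up() above, freezes a throttled hierarchy's notion of task time so that runnable averages do not decay while the group cannot run. A small sketch of that clock follows, with invented clock values and a toy structure standing in for struct cfs_rq; it is an illustration of the scheme, not the kernel implementation.

#include <stdint.h>
#include <stdio.h>

struct toy_cfs_rq {
        int throttle_count;
        uint64_t throttled_clock_task;
        uint64_t throttled_clock_task_time;
};

/* mirrors cfs_rq_clock_task(): frozen while throttled, offset afterwards */
static uint64_t toy_clock_task(const struct toy_cfs_rq *cfs_rq, uint64_t rq_clock_task)
{
        if (cfs_rq->throttle_count)
                return cfs_rq->throttled_clock_task;

        return rq_clock_task - cfs_rq->throttled_clock_task_time;
}

int main(void)
{
        struct toy_cfs_rq cfs_rq = { 0, 0, 0 };

        printf("%llu\n", (unsigned long long)toy_clock_task(&cfs_rq, 100)); /* 100 */

        /* tg_throttle_down() at rq->clock_task == 100: stop time */
        cfs_rq.throttled_clock_task = 100;
        cfs_rq.throttle_count++;
        printf("%llu\n", (unsigned long long)toy_clock_task(&cfs_rq, 150)); /* 100, frozen */

        /* tg_unthrottle_up() at rq->clock_task == 160: fold in the throttled span */
        cfs_rq.throttle_count--;
        cfs_rq.throttled_clock_task_time += 160 - cfs_rq.throttled_clock_task;
        printf("%llu\n", (unsigned long long)toy_clock_task(&cfs_rq, 200)); /* 140 */

        return 0;
}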