Diffstat (limited to 'kernel/sched.c')
 -rw-r--r--   kernel/sched.c | 1097
1 file changed, 792 insertions, 305 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 410eec40413..2325db2be31 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)  {  	ktime_t now; -	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) +	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)  		return;  	if (hrtimer_active(&rt_b->rt_period_timer)) @@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;   */  static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ +	return list_empty(&root_task_group.children); +} +#endif +  #ifdef CONFIG_FAIR_GROUP_SCHED  #ifdef CONFIG_USER_SCHED  # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD) @@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)  #else +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ +	return 1; +} +#endif +  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }  static inline struct task_group *task_group(struct task_struct *p)  { @@ -467,11 +481,17 @@ struct rt_rq {  	struct rt_prio_array active;  	unsigned long rt_nr_running;  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED -	int highest_prio; /* highest queued rt task prio */ +	struct { +		int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP +		int next; /* next highest */ +#endif +	} highest_prio;  #endif  #ifdef CONFIG_SMP  	unsigned long rt_nr_migratory;  	int overloaded; +	struct plist_head pushable_tasks;  #endif  	int rt_throttled;  	u64 rt_time; @@ -549,7 +569,6 @@ struct rq {  	unsigned long nr_running;  	#define CPU_LOAD_IDX_MAX 5  	unsigned long cpu_load[CPU_LOAD_IDX_MAX]; -	unsigned char idle_at_tick;  #ifdef CONFIG_NO_HZ  	unsigned long last_tick_seen;  	unsigned char in_nohz_recently; @@ -590,6 +609,7 @@ struct rq {  	struct root_domain *rd;  	struct sched_domain *sd; +	unsigned char idle_at_tick;  	/* For active balancing */  	int active_balance;  	int push_cpu; @@ -618,9 +638,6 @@ struct rq {  	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */  	/* sys_sched_yield() stats */ -	unsigned int yld_exp_empty; -	unsigned int yld_act_empty; -	unsigned int yld_both_empty;  	unsigned int yld_count;  	/* schedule() stats */ @@ -1093,7 +1110,7 @@ static void hrtick_start(struct rq *rq, u64 delay)  	if (rq == this_rq()) {  		hrtimer_restart(timer);  	} else if (!rq->hrtick_csd_pending) { -		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); +		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);  		rq->hrtick_csd_pending = 1;  	}  } @@ -1183,10 +1200,10 @@ static void resched_task(struct task_struct *p)  	assert_spin_locked(&task_rq(p)->lock); -	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) +	if (test_tsk_need_resched(p))  		return; -	set_tsk_thread_flag(p, TIF_NEED_RESCHED); +	set_tsk_need_resched(p);  	cpu = task_cpu(p);  	if (cpu == smp_processor_id()) @@ -1242,7 +1259,7 @@ void wake_up_idle_cpu(int cpu)  	 * lockless. The worst case is that the other CPU runs the  	 * idle task through an additional NOOP schedule()  	 */ -	set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); +	set_tsk_need_resched(rq->idle);  	/* NEED_RESCHED must be visible before we test polling */  	smp_mb(); @@ -1610,21 +1627,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)  #endif +#ifdef CONFIG_PREEMPT +  /* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations.  This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below.  However, it + * also adds more overhead and therefore may reduce throughput.   */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) +	__releases(this_rq->lock) +	__acquires(busiest->lock) +	__acquires(this_rq->lock) +{ +	spin_unlock(&this_rq->lock); +	double_rq_lock(this_rq, busiest); + +	return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry.  This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)  	__releases(this_rq->lock)  	__acquires(busiest->lock)  	__acquires(this_rq->lock)  {  	int ret = 0; -	if (unlikely(!irqs_disabled())) { -		/* printk() doesn't work good under rq->lock */ -		spin_unlock(&this_rq->lock); -		BUG_ON(1); -	}  	if (unlikely(!spin_trylock(&busiest->lock))) {  		if (busiest < this_rq) {  			spin_unlock(&this_rq->lock); @@ -1637,6 +1675,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)  	return ret;  } +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ +	if (unlikely(!irqs_disabled())) { +		/* printk() doesn't work good under rq->lock */ +		spin_unlock(&this_rq->lock); +		BUG_ON(1); +	} + +	return _double_lock_balance(this_rq, busiest); +} +  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)  	__releases(busiest->lock)  { @@ -1705,6 +1759,9 @@ static void update_avg(u64 *avg, u64 sample)  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)  { +	if (wakeup) +		p->se.start_runtime = p->se.sum_exec_runtime; +  	sched_info_queued(p);  	p->sched_class->enqueue_task(rq, p, wakeup);  	p->se.on_rq = 1; @@ -1712,10 +1769,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)  { -	if (sleep && p->se.last_wakeup) { -		update_avg(&p->se.avg_overlap, -			   p->se.sum_exec_runtime - p->se.last_wakeup); -		p->se.last_wakeup = 0; +	if (sleep) { +		if (p->se.last_wakeup) { +			update_avg(&p->se.avg_overlap, +				p->se.sum_exec_runtime - p->se.last_wakeup); +			p->se.last_wakeup = 0; +		} else { +			update_avg(&p->se.avg_wakeup, +				sysctl_sched_wakeup_granularity); +		}  	}  	sched_info_dequeued(p); @@ -2017,7 +2079,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)  		 * it must be off the runqueue _entirely_, and not  		 * preempted!  		 * -		 * So if it wa still runnable (but just not actively +		 * So if it was still runnable (but just not actively  		 * running right now), it's preempted, and we should  		 * yield - it could be a while.  		 
*/ @@ -2267,7 +2329,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)  		sync = 0;  #ifdef CONFIG_SMP -	if (sched_feat(LB_WAKEUP_UPDATE)) { +	if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {  		struct sched_domain *sd;  		this_cpu = raw_smp_processor_id(); @@ -2345,6 +2407,22 @@ out_activate:  	activate_task(rq, p, 1);  	success = 1; +	/* +	 * Only attribute actual wakeups done by this task. +	 */ +	if (!in_interrupt()) { +		struct sched_entity *se = ¤t->se; +		u64 sample = se->sum_exec_runtime; + +		if (se->last_wakeup) +			sample -= se->last_wakeup; +		else +			sample -= se->start_runtime; +		update_avg(&se->avg_wakeup, sample); + +		se->last_wakeup = se->sum_exec_runtime; +	} +  out_running:  	trace_sched_wakeup(rq, p, success);  	check_preempt_curr(rq, p, sync); @@ -2355,8 +2433,6 @@ out_running:  		p->sched_class->task_wake_up(rq, p);  #endif  out: -	current->se.last_wakeup = current->se.sum_exec_runtime; -  	task_rq_unlock(rq, &flags);  	return success; @@ -2386,6 +2462,8 @@ static void __sched_fork(struct task_struct *p)  	p->se.prev_sum_exec_runtime	= 0;  	p->se.last_wakeup		= 0;  	p->se.avg_overlap		= 0; +	p->se.start_runtime		= 0; +	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity;  #ifdef CONFIG_SCHEDSTATS  	p->se.wait_start		= 0; @@ -2448,6 +2526,8 @@ void sched_fork(struct task_struct *p, int clone_flags)  	/* Want to start with kernel preemption disabled. */  	task_thread_info(p)->preempt_count = 1;  #endif +	plist_node_init(&p->pushable_tasks, MAX_PRIO); +  	put_cpu();  } @@ -2491,7 +2571,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)  #ifdef CONFIG_PREEMPT_NOTIFIERS  /** - * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * preempt_notifier_register - tell me when current is being preempted & rescheduled   * @notifier: notifier struct to register   */  void preempt_notifier_register(struct preempt_notifier *notifier) @@ -2588,6 +2668,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  {  	struct mm_struct *mm = rq->prev_mm;  	long prev_state; +#ifdef CONFIG_SMP +	int post_schedule = 0; + +	if (current->sched_class->needs_post_schedule) +		post_schedule = current->sched_class->needs_post_schedule(rq); +#endif  	rq->prev_mm = NULL; @@ -2606,7 +2692,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	finish_arch_switch(prev);  	finish_lock_switch(rq, prev);  #ifdef CONFIG_SMP -	if (current->sched_class->post_schedule) +	if (post_schedule)  		current->sched_class->post_schedule(rq);  #endif @@ -2913,6 +2999,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,  		     struct sched_domain *sd, enum cpu_idle_type idle,  		     int *all_pinned)  { +	int tsk_cache_hot = 0;  	/*  	 * We do not migrate tasks that are:  	 * 1) running (obviously), or @@ -2936,10 +3023,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,  	 * 2) too many balance attempts have failed.  	 
*/ -	if (!task_hot(p, rq->clock, sd) || -			sd->nr_balance_failed > sd->cache_nice_tries) { +	tsk_cache_hot = task_hot(p, rq->clock, sd); +	if (!tsk_cache_hot || +		sd->nr_balance_failed > sd->cache_nice_tries) {  #ifdef CONFIG_SCHEDSTATS -		if (task_hot(p, rq->clock, sd)) { +		if (tsk_cache_hot) {  			schedstat_inc(sd, lb_hot_gained[idle]);  			schedstat_inc(p, se.nr_forced_migrations);  		} @@ -2947,7 +3035,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,  		return 1;  	} -	if (task_hot(p, rq->clock, sd)) { +	if (tsk_cache_hot) {  		schedstat_inc(p, se.nr_failed_migrations_hot);  		return 0;  	} @@ -2987,6 +3075,16 @@ next:  	pulled++;  	rem_load_move -= p->se.load.weight; +#ifdef CONFIG_PREEMPT +	/* +	 * NEWIDLE balancing is a source of latency, so preemptible kernels +	 * will stop after the first task is pulled to minimize the critical +	 * section. +	 */ +	if (idle == CPU_NEWLY_IDLE) +		goto out; +#endif +  	/*  	 * We only want to steal up to the prescribed amount of weighted load.  	 */ @@ -3033,9 +3131,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,  				sd, idle, all_pinned, &this_best_prio);  		class = class->next; +#ifdef CONFIG_PREEMPT +		/* +		 * NEWIDLE balancing is a source of latency, so preemptible +		 * kernels will stop after the first task is pulled to minimize +		 * the critical section. +		 */  		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)  			break; - +#endif  	} while (class && max_load_move > total_load_moved);  	return total_load_moved > 0; @@ -3085,246 +3189,480 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,  	return 0;  } - +/********** Helpers for find_busiest_group ************************/  /* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the amount of weighted load which - * should be moved to restore balance via the imbalance parameter. + * sd_lb_stats - Structure to store the statistics of a sched_domain + * 		during load balancing.   
*/ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, -		   unsigned long *imbalance, enum cpu_idle_type idle, -		   int *sd_idle, const struct cpumask *cpus, int *balance) -{ -	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; -	unsigned long max_load, avg_load, total_load, this_load, total_pwr; -	unsigned long max_pull; -	unsigned long busiest_load_per_task, busiest_nr_running; -	unsigned long this_load_per_task, this_nr_running; -	int load_idx, group_imb = 0; +struct sd_lb_stats { +	struct sched_group *busiest; /* Busiest group in this sd */ +	struct sched_group *this;  /* Local group in this sd */ +	unsigned long total_load;  /* Total load of all groups in sd */ +	unsigned long total_pwr;   /*	Total power of all groups in sd */ +	unsigned long avg_load;	   /* Average load across all groups in sd */ + +	/** Statistics of this group */ +	unsigned long this_load; +	unsigned long this_load_per_task; +	unsigned long this_nr_running; + +	/* Statistics of the busiest group */ +	unsigned long max_load; +	unsigned long busiest_load_per_task; +	unsigned long busiest_nr_running; + +	int group_imb; /* Is there imbalance in this sd */  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -	int power_savings_balance = 1; -	unsigned long leader_nr_running = 0, min_load_per_task = 0; -	unsigned long min_nr_running = ULONG_MAX; -	struct sched_group *group_min = NULL, *group_leader = NULL; +	int power_savings_balance; /* Is powersave balance needed for this sd */ +	struct sched_group *group_min; /* Least loaded group in sd */ +	struct sched_group *group_leader; /* Group which relieves group_min */ +	unsigned long min_load_per_task; /* load_per_task in group_min */ +	unsigned long leader_nr_running; /* Nr running of group_leader */ +	unsigned long min_nr_running; /* Nr running of group_min */  #endif +}; -	max_load = this_load = total_load = total_pwr = 0; -	busiest_load_per_task = busiest_nr_running = 0; -	this_load_per_task = this_nr_running = 0; +/* + * sg_lb_stats - stats of a sched_group required for load_balancing + */ +struct sg_lb_stats { +	unsigned long avg_load; /*Avg load across the CPUs of the group */ +	unsigned long group_load; /* Total load over the CPUs of the group */ +	unsigned long sum_nr_running; /* Nr tasks running in the group */ +	unsigned long sum_weighted_load; /* Weighted load of group's tasks */ +	unsigned long group_capacity; +	int group_imb; /* Is there an imbalance in the group ? */ +}; + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ +	return cpumask_first(sched_group_cpus(group)); +} + +/** + * get_sd_load_idx - Obtain the load index for a given sched domain. + * @sd: The sched_domain whose load_idx is to be obtained. + * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 
+ */ +static inline int get_sd_load_idx(struct sched_domain *sd, +					enum cpu_idle_type idle) +{ +	int load_idx; -	if (idle == CPU_NOT_IDLE) +	switch (idle) { +	case CPU_NOT_IDLE:  		load_idx = sd->busy_idx; -	else if (idle == CPU_NEWLY_IDLE) +		break; + +	case CPU_NEWLY_IDLE:  		load_idx = sd->newidle_idx; -	else +		break; +	default:  		load_idx = sd->idle_idx; +		break; +	} -	do { -		unsigned long load, group_capacity, max_cpu_load, min_cpu_load; -		int local_group; -		int i; -		int __group_imb = 0; -		unsigned int balance_cpu = -1, first_idle_cpu = 0; -		unsigned long sum_nr_running, sum_weighted_load; -		unsigned long sum_avg_load_per_task; -		unsigned long avg_load_per_task; +	return load_idx; +} -		local_group = cpumask_test_cpu(this_cpu, -					       sched_group_cpus(group)); -		if (local_group) -			balance_cpu = cpumask_first(sched_group_cpus(group)); +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * init_sd_power_savings_stats - Initialize power savings statistics for + * the given sched_domain, during load balancing. + * + * @sd: Sched domain whose power-savings statistics are to be initialized. + * @sds: Variable containing the statistics for sd. + * @idle: Idle status of the CPU at which we're performing load-balancing. + */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, +	struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ +	/* +	 * Busy processors will not participate in power savings +	 * balance. +	 */ +	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) +		sds->power_savings_balance = 0; +	else { +		sds->power_savings_balance = 1; +		sds->min_nr_running = ULONG_MAX; +		sds->leader_nr_running = 0; +	} +} -		/* Tally up the load of all CPUs in the group */ -		sum_weighted_load = sum_nr_running = avg_load = 0; -		sum_avg_load_per_task = avg_load_per_task = 0; +/** + * update_sd_power_savings_stats - Update the power saving stats for a + * sched_domain while performing load balancing. + * + * @group: sched_group belonging to the sched_domain under consideration. + * @sds: Variable containing the statistics of the sched_domain + * @local_group: Does group contain the CPU for which we're performing + * 		load balancing ? + * @sgs: Variable containing the statistics of the group. + */ +static inline void update_sd_power_savings_stats(struct sched_group *group, +	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ -		max_cpu_load = 0; -		min_cpu_load = ~0UL; +	if (!sds->power_savings_balance) +		return; -		for_each_cpu_and(i, sched_group_cpus(group), cpus) { -			struct rq *rq = cpu_rq(i); +	/* +	 * If the local group is idle or completely loaded +	 * no need to do power savings balance at this domain +	 */ +	if (local_group && (sds->this_nr_running >= sgs->group_capacity || +				!sds->this_nr_running)) +		sds->power_savings_balance = 0; -			if (*sd_idle && rq->nr_running) -				*sd_idle = 0; +	/* +	 * If a group is already running at full capacity or idle, +	 * don't include that group in power savings calculations +	 */ +	if (!sds->power_savings_balance || +		sgs->sum_nr_running >= sgs->group_capacity || +		!sgs->sum_nr_running) +		return; -			/* Bias balancing toward cpus of our domain */ -			if (local_group) { -				if (idle_cpu(i) && !first_idle_cpu) { -					first_idle_cpu = 1; -					balance_cpu = i; -				} +	/* +	 * Calculate the group which has the least non-idle load. 
+	 * This is the group from where we need to pick up the load +	 * for saving power +	 */ +	if ((sgs->sum_nr_running < sds->min_nr_running) || +	    (sgs->sum_nr_running == sds->min_nr_running && +	     group_first_cpu(group) > group_first_cpu(sds->group_min))) { +		sds->group_min = group; +		sds->min_nr_running = sgs->sum_nr_running; +		sds->min_load_per_task = sgs->sum_weighted_load / +						sgs->sum_nr_running; +	} -				load = target_load(i, load_idx); -			} else { -				load = source_load(i, load_idx); -				if (load > max_cpu_load) -					max_cpu_load = load; -				if (min_cpu_load > load) -					min_cpu_load = load; -			} +	/* +	 * Calculate the group which is almost near its +	 * capacity but still has some space to pick up some load +	 * from other group and save more power +	 */ +	if (sgs->sum_nr_running > sgs->group_capacity - 1) +		return; -			avg_load += load; -			sum_nr_running += rq->nr_running; -			sum_weighted_load += weighted_cpuload(i); +	if (sgs->sum_nr_running > sds->leader_nr_running || +	    (sgs->sum_nr_running == sds->leader_nr_running && +	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) { +		sds->group_leader = group; +		sds->leader_nr_running = sgs->sum_nr_running; +	} +} -			sum_avg_load_per_task += cpu_avg_load_per_task(i); -		} +/** + * check_power_save_busiest_group - see if there is potential for some power-savings balance + * @sds: Variable containing the statistics of the sched_domain + *	under consideration. + * @this_cpu: Cpu at which we're currently performing load-balancing. + * @imbalance: Variable to store the imbalance. + * + * Description: + * Check if we have potential to perform some power-savings balance. + * If yes, set the busiest group to be the least loaded group in the + * sched_domain, so that it's CPUs can be put to idle. + * + * Returns 1 if there is potential to perform power-savings balance. + * Else returns 0. + */ +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, +					int this_cpu, unsigned long *imbalance) +{ +	if (!sds->power_savings_balance) +		return 0; -		/* -		 * First idle cpu or the first cpu(busiest) in this sched group -		 * is eligible for doing load balancing at this and above -		 * domains. In the newly idle case, we will allow all the cpu's -		 * to do the newly idle load balance. 
-		 */ -		if (idle != CPU_NEWLY_IDLE && local_group && -		    balance_cpu != this_cpu && balance) { -			*balance = 0; -			goto ret; -		} +	if (sds->this != sds->group_leader || +			sds->group_leader == sds->group_min) +		return 0; -		total_load += avg_load; -		total_pwr += group->__cpu_power; +	*imbalance = sds->min_load_per_task; +	sds->busiest = sds->group_min; -		/* Adjust by relative CPU power of the group */ -		avg_load = sg_div_cpu_power(group, -				avg_load * SCHED_LOAD_SCALE); +	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { +		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = +			group_first_cpu(sds->group_leader); +	} +	return 1; + +} +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, +	struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ +	return; +} + +static inline void update_sd_power_savings_stats(struct sched_group *group, +	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ +	return; +} + +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, +					int this_cpu, unsigned long *imbalance) +{ +	return 0; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -		/* -		 * Consider the group unbalanced when the imbalance is larger -		 * than the average weight of two tasks. -		 * -		 * APZ: with cgroup the avg task weight can vary wildly and -		 *      might not be a suitable number - should we keep a -		 *      normalized nr_running number somewhere that negates -		 *      the hierarchy? -		 */ -		avg_load_per_task = sg_div_cpu_power(group, -				sum_avg_load_per_task * SCHED_LOAD_SCALE); -		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) -			__group_imb = 1; +/** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @group: sched_group whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @load_idx: Load index of sched_domain of this_cpu for load calc. + * @sd_idle: Idle status of the sched_domain containing group. + * @local_group: Does group contain this_cpu. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sgs: variable to hold the statistics for this group. 
+ */ +static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, +			enum cpu_idle_type idle, int load_idx, int *sd_idle, +			int local_group, const struct cpumask *cpus, +			int *balance, struct sg_lb_stats *sgs) +{ +	unsigned long load, max_cpu_load, min_cpu_load; +	int i; +	unsigned int balance_cpu = -1, first_idle_cpu = 0; +	unsigned long sum_avg_load_per_task; +	unsigned long avg_load_per_task; -		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; +	if (local_group) +		balance_cpu = group_first_cpu(group); +	/* Tally up the load of all CPUs in the group */ +	sum_avg_load_per_task = avg_load_per_task = 0; +	max_cpu_load = 0; +	min_cpu_load = ~0UL; + +	for_each_cpu_and(i, sched_group_cpus(group), cpus) { +		struct rq *rq = cpu_rq(i); + +		if (*sd_idle && rq->nr_running) +			*sd_idle = 0; + +		/* Bias balancing toward cpus of our domain */  		if (local_group) { -			this_load = avg_load; -			this = group; -			this_nr_running = sum_nr_running; -			this_load_per_task = sum_weighted_load; -		} else if (avg_load > max_load && -			   (sum_nr_running > group_capacity || __group_imb)) { -			max_load = avg_load; -			busiest = group; -			busiest_nr_running = sum_nr_running; -			busiest_load_per_task = sum_weighted_load; -			group_imb = __group_imb; +			if (idle_cpu(i) && !first_idle_cpu) { +				first_idle_cpu = 1; +				balance_cpu = i; +			} + +			load = target_load(i, load_idx); +		} else { +			load = source_load(i, load_idx); +			if (load > max_cpu_load) +				max_cpu_load = load; +			if (min_cpu_load > load) +				min_cpu_load = load;  		} -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -		/* -		 * Busy processors will not participate in power savings -		 * balance. -		 */ -		if (idle == CPU_NOT_IDLE || -				!(sd->flags & SD_POWERSAVINGS_BALANCE)) -			goto group_next; +		sgs->group_load += load; +		sgs->sum_nr_running += rq->nr_running; +		sgs->sum_weighted_load += weighted_cpuload(i); -		/* -		 * If the local group is idle or completely loaded -		 * no need to do power savings balance at this domain -		 */ -		if (local_group && (this_nr_running >= group_capacity || -				    !this_nr_running)) -			power_savings_balance = 0; +		sum_avg_load_per_task += cpu_avg_load_per_task(i); +	} -		/* -		 * If a group is already running at full capacity or idle, -		 * don't include that group in power savings calculations -		 */ -		if (!power_savings_balance || sum_nr_running >= group_capacity -		    || !sum_nr_running) -			goto group_next; +	/* +	 * First idle cpu or the first cpu(busiest) in this sched group +	 * is eligible for doing load balancing at this and above +	 * domains. In the newly idle case, we will allow all the cpu's +	 * to do the newly idle load balance. +	 */ +	if (idle != CPU_NEWLY_IDLE && local_group && +	    balance_cpu != this_cpu && balance) { +		*balance = 0; +		return; +	} -		/* -		 * Calculate the group which has the least non-idle load. 
-		 * This is the group from where we need to pick up the load -		 * for saving power -		 */ -		if ((sum_nr_running < min_nr_running) || -		    (sum_nr_running == min_nr_running && -		     cpumask_first(sched_group_cpus(group)) > -		     cpumask_first(sched_group_cpus(group_min)))) { -			group_min = group; -			min_nr_running = sum_nr_running; -			min_load_per_task = sum_weighted_load / -						sum_nr_running; -		} +	/* Adjust by relative CPU power of the group */ +	sgs->avg_load = sg_div_cpu_power(group, +			sgs->group_load * SCHED_LOAD_SCALE); -		/* -		 * Calculate the group which is almost near its -		 * capacity but still has some space to pick up some load -		 * from other group and save more power -		 */ -		if (sum_nr_running <= group_capacity - 1) { -			if (sum_nr_running > leader_nr_running || -			    (sum_nr_running == leader_nr_running && -			     cpumask_first(sched_group_cpus(group)) < -			     cpumask_first(sched_group_cpus(group_leader)))) { -				group_leader = group; -				leader_nr_running = sum_nr_running; -			} + +	/* +	 * Consider the group unbalanced when the imbalance is larger +	 * than the average weight of two tasks. +	 * +	 * APZ: with cgroup the avg task weight can vary wildly and +	 *      might not be a suitable number - should we keep a +	 *      normalized nr_running number somewhere that negates +	 *      the hierarchy? +	 */ +	avg_load_per_task = sg_div_cpu_power(group, +			sum_avg_load_per_task * SCHED_LOAD_SCALE); + +	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) +		sgs->group_imb = 1; + +	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; + +} + +/** + * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * @sd: sched_domain whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @sd_idle: Idle status of the sched_domain containing group. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sds: variable to hold the statistics for this sched_domain. 
+ */ +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, +			enum cpu_idle_type idle, int *sd_idle, +			const struct cpumask *cpus, int *balance, +			struct sd_lb_stats *sds) +{ +	struct sched_group *group = sd->groups; +	struct sg_lb_stats sgs; +	int load_idx; + +	init_sd_power_savings_stats(sd, sds, idle); +	load_idx = get_sd_load_idx(sd, idle); + +	do { +		int local_group; + +		local_group = cpumask_test_cpu(this_cpu, +					       sched_group_cpus(group)); +		memset(&sgs, 0, sizeof(sgs)); +		update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, +				local_group, cpus, balance, &sgs); + +		if (local_group && balance && !(*balance)) +			return; + +		sds->total_load += sgs.group_load; +		sds->total_pwr += group->__cpu_power; + +		if (local_group) { +			sds->this_load = sgs.avg_load; +			sds->this = group; +			sds->this_nr_running = sgs.sum_nr_running; +			sds->this_load_per_task = sgs.sum_weighted_load; +		} else if (sgs.avg_load > sds->max_load && +			   (sgs.sum_nr_running > sgs.group_capacity || +				sgs.group_imb)) { +			sds->max_load = sgs.avg_load; +			sds->busiest = group; +			sds->busiest_nr_running = sgs.sum_nr_running; +			sds->busiest_load_per_task = sgs.sum_weighted_load; +			sds->group_imb = sgs.group_imb;  		} -group_next: -#endif + +		update_sd_power_savings_stats(group, sds, local_group, &sgs);  		group = group->next;  	} while (group != sd->groups); -	if (!busiest || this_load >= max_load || busiest_nr_running == 0) -		goto out_balanced; +} -	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; +/** + * fix_small_imbalance - Calculate the minor imbalance that exists + *			amongst the groups of a sched_domain, during + *			load balancing. + * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: Variable to store the imbalance. + */ +static inline void fix_small_imbalance(struct sd_lb_stats *sds, +				int this_cpu, unsigned long *imbalance) +{ +	unsigned long tmp, pwr_now = 0, pwr_move = 0; +	unsigned int imbn = 2; -	if (this_load >= avg_load || -			100*max_load <= sd->imbalance_pct*this_load) -		goto out_balanced; +	if (sds->this_nr_running) { +		sds->this_load_per_task /= sds->this_nr_running; +		if (sds->busiest_load_per_task > +				sds->this_load_per_task) +			imbn = 1; +	} else +		sds->this_load_per_task = +			cpu_avg_load_per_task(this_cpu); -	busiest_load_per_task /= busiest_nr_running; -	if (group_imb) -		busiest_load_per_task = min(busiest_load_per_task, avg_load); +	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= +			sds->busiest_load_per_task * imbn) { +		*imbalance = sds->busiest_load_per_task; +		return; +	}  	/* -	 * We're trying to get all the cpus to the average_load, so we don't -	 * want to push ourselves above the average load, nor do we wish to -	 * reduce the max loaded cpu below the average load, as either of these -	 * actions would just result in more rebalancing later, and ping-pong -	 * tasks around. Thus we look for the minimum possible imbalance. -	 * Negative imbalances (*we* are more loaded than anyone else) will -	 * be counted as no imbalance for these purposes -- we can't fix that -	 * by pulling tasks to us. Be careful of negative numbers as they'll -	 * appear as very large values with unsigned longs. +	 * OK, we don't have enough imbalance to justify moving tasks, +	 * however we may be able to increase total CPU power used by +	 * moving them.  	 
*/ -	if (max_load <= busiest_load_per_task) -		goto out_balanced; +	pwr_now += sds->busiest->__cpu_power * +			min(sds->busiest_load_per_task, sds->max_load); +	pwr_now += sds->this->__cpu_power * +			min(sds->this_load_per_task, sds->this_load); +	pwr_now /= SCHED_LOAD_SCALE; + +	/* Amount of load we'd subtract */ +	tmp = sg_div_cpu_power(sds->busiest, +			sds->busiest_load_per_task * SCHED_LOAD_SCALE); +	if (sds->max_load > tmp) +		pwr_move += sds->busiest->__cpu_power * +			min(sds->busiest_load_per_task, sds->max_load - tmp); + +	/* Amount of load we'd add */ +	if (sds->max_load * sds->busiest->__cpu_power < +		sds->busiest_load_per_task * SCHED_LOAD_SCALE) +		tmp = sg_div_cpu_power(sds->this, +			sds->max_load * sds->busiest->__cpu_power); +	else +		tmp = sg_div_cpu_power(sds->this, +			sds->busiest_load_per_task * SCHED_LOAD_SCALE); +	pwr_move += sds->this->__cpu_power * +			min(sds->this_load_per_task, sds->this_load + tmp); +	pwr_move /= SCHED_LOAD_SCALE; + +	/* Move if we gain throughput */ +	if (pwr_move > pwr_now) +		*imbalance = sds->busiest_load_per_task; +} + +/** + * calculate_imbalance - Calculate the amount of imbalance present within the + *			 groups of a given sched_domain during load balance. + * @sds: statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: Cpu for which currently load balance is being performed. + * @imbalance: The variable to store the imbalance. + */ +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, +		unsigned long *imbalance) +{ +	unsigned long max_pull;  	/*  	 * In the presence of smp nice balancing, certain scenarios can have  	 * max load less than avg load(as we skip the groups at or below  	 * its cpu_power, while calculating max_load..)  	 */ -	if (max_load < avg_load) { +	if (sds->max_load < sds->avg_load) {  		*imbalance = 0; -		goto small_imbalance; +		return fix_small_imbalance(sds, this_cpu, imbalance);  	}  	/* Don't want to pull so many tasks that a group would go idle */ -	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); +	max_pull = min(sds->max_load - sds->avg_load, +			sds->max_load - sds->busiest_load_per_task);  	/* How much load to actually move to equalise the imbalance */ -	*imbalance = min(max_pull * busiest->__cpu_power, -				(avg_load - this_load) * this->__cpu_power) +	*imbalance = min(max_pull * sds->busiest->__cpu_power, +		(sds->avg_load - sds->this_load) * sds->this->__cpu_power)  			/ SCHED_LOAD_SCALE;  	/* @@ -3333,78 +3671,110 @@ group_next:  	 * a think about bumping its value to force at least one task to be  	 * moved  	 */ -	if (*imbalance < busiest_load_per_task) { -		unsigned long tmp, pwr_now, pwr_move; -		unsigned int imbn; +	if (*imbalance < sds->busiest_load_per_task) +		return fix_small_imbalance(sds, this_cpu, imbalance); -small_imbalance: -		pwr_move = pwr_now = 0; -		imbn = 2; -		if (this_nr_running) { -			this_load_per_task /= this_nr_running; -			if (busiest_load_per_task > this_load_per_task) -				imbn = 1; -		} else -			this_load_per_task = cpu_avg_load_per_task(this_cpu); +} +/******* find_busiest_group() helpers end here *********************/ -		if (max_load - this_load + busiest_load_per_task >= -					busiest_load_per_task * imbn) { -			*imbalance = busiest_load_per_task; -			return busiest; -		} +/** + * find_busiest_group - Returns the busiest group within the sched_domain + * if there is an imbalance. 
If there isn't an imbalance, and + * the user has opted for power-savings, it returns a group whose + * CPUs can be put to idle by rebalancing those tasks elsewhere, if + * such a group exists. + * + * Also calculates the amount of weighted load which should be moved + * to restore balance. + * + * @sd: The sched_domain whose busiest group is to be returned. + * @this_cpu: The cpu for which load balancing is currently being performed. + * @imbalance: Variable which stores amount of weighted load which should + *		be moved to restore balance/put a group to idle. + * @idle: The idle status of this_cpu. + * @sd_idle: The idleness of sd + * @cpus: The set of CPUs under consideration for load-balancing. + * @balance: Pointer to a variable indicating if this_cpu + *	is the appropriate cpu to perform load balancing at this_level. + * + * Returns:	- the busiest group if imbalance exists. + *		- If no imbalance and user has opted for power-savings balance, + *		   return the least loaded group whose CPUs can be + *		   put to idle by rebalancing its tasks onto our group. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, +		   unsigned long *imbalance, enum cpu_idle_type idle, +		   int *sd_idle, const struct cpumask *cpus, int *balance) +{ +	struct sd_lb_stats sds; -		/* -		 * OK, we don't have enough imbalance to justify moving tasks, -		 * however we may be able to increase total CPU power used by -		 * moving them. -		 */ +	memset(&sds, 0, sizeof(sds)); -		pwr_now += busiest->__cpu_power * -				min(busiest_load_per_task, max_load); -		pwr_now += this->__cpu_power * -				min(this_load_per_task, this_load); -		pwr_now /= SCHED_LOAD_SCALE; +	/* +	 * Compute the various statistics relavent for load balancing at +	 * this level. +	 */ +	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, +					balance, &sds); -		/* Amount of load we'd subtract */ -		tmp = sg_div_cpu_power(busiest, -				busiest_load_per_task * SCHED_LOAD_SCALE); -		if (max_load > tmp) -			pwr_move += busiest->__cpu_power * -				min(busiest_load_per_task, max_load - tmp); +	/* Cases where imbalance does not exist from POV of this_cpu */ +	/* 1) this_cpu is not the appropriate cpu to perform load balancing +	 *    at this level. +	 * 2) There is no busy sibling group to pull from. +	 * 3) This group is the busiest group. +	 * 4) This group is more busy than the avg busieness at this +	 *    sched_domain. +	 * 5) The imbalance is within the specified limit. 
+	 * 6) Any rebalance would lead to ping-pong +	 */ +	if (balance && !(*balance)) +		goto ret; -		/* Amount of load we'd add */ -		if (max_load * busiest->__cpu_power < -				busiest_load_per_task * SCHED_LOAD_SCALE) -			tmp = sg_div_cpu_power(this, -					max_load * busiest->__cpu_power); -		else -			tmp = sg_div_cpu_power(this, -				busiest_load_per_task * SCHED_LOAD_SCALE); -		pwr_move += this->__cpu_power * -				min(this_load_per_task, this_load + tmp); -		pwr_move /= SCHED_LOAD_SCALE; +	if (!sds.busiest || sds.busiest_nr_running == 0) +		goto out_balanced; -		/* Move if we gain throughput */ -		if (pwr_move > pwr_now) -			*imbalance = busiest_load_per_task; -	} +	if (sds.this_load >= sds.max_load) +		goto out_balanced; -	return busiest; +	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; -out_balanced: -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) -		goto ret; +	if (sds.this_load >= sds.avg_load) +		goto out_balanced; -	if (this == group_leader && group_leader != group_min) { -		*imbalance = min_load_per_task; -		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { -			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = -				cpumask_first(sched_group_cpus(group_leader)); -		} -		return group_min; -	} -#endif +	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) +		goto out_balanced; + +	sds.busiest_load_per_task /= sds.busiest_nr_running; +	if (sds.group_imb) +		sds.busiest_load_per_task = +			min(sds.busiest_load_per_task, sds.avg_load); + +	/* +	 * We're trying to get all the cpus to the average_load, so we don't +	 * want to push ourselves above the average load, nor do we wish to +	 * reduce the max loaded cpu below the average load, as either of these +	 * actions would just result in more rebalancing later, and ping-pong +	 * tasks around. Thus we look for the minimum possible imbalance. +	 * Negative imbalances (*we* are more loaded than anyone else) will +	 * be counted as no imbalance for these purposes -- we can't fix that +	 * by pulling tasks to us. Be careful of negative numbers as they'll +	 * appear as very large values with unsigned longs. +	 */ +	if (sds.max_load <= sds.busiest_load_per_task) +		goto out_balanced; + +	/* Looks like there is an imbalance. Compute it */ +	calculate_imbalance(&sds, this_cpu, imbalance); +	return sds.busiest; + +out_balanced: +	/* +	 * There is no obvious imbalance. But check if we can do some balancing +	 * to save power. +	 */ +	if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) +		return sds.busiest;  ret:  	*imbalance = 0;  	return NULL; @@ -4057,6 +4427,11 @@ static void run_rebalance_domains(struct softirq_action *h)  #endif  } +static inline int on_null_domain(int cpu) +{ +	return !rcu_dereference(cpu_rq(cpu)->sd); +} +  /*   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.   
* @@ -4114,7 +4489,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)  	    cpumask_test_cpu(cpu, nohz.cpu_mask))  		return;  #endif -	if (time_after_eq(jiffies, rq->next_balance)) +	/* Don't need to rebalance while attached to NULL domain */ +	if (time_after_eq(jiffies, rq->next_balance) && +	    likely(!on_null_domain(cpu)))  		raise_softirq(SCHED_SOFTIRQ);  } @@ -4508,11 +4885,33 @@ static inline void schedule_debug(struct task_struct *prev)  #endif  } +static void put_prev_task(struct rq *rq, struct task_struct *prev) +{ +	if (prev->state == TASK_RUNNING) { +		u64 runtime = prev->se.sum_exec_runtime; + +		runtime -= prev->se.prev_sum_exec_runtime; +		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + +		/* +		 * In order to avoid avg_overlap growing stale when we are +		 * indeed overlapping and hence not getting put to sleep, grow +		 * the avg_overlap on preemption. +		 * +		 * We use the average preemption runtime because that +		 * correlates to the amount of cache footprint a task can +		 * build up. +		 */ +		update_avg(&prev->se.avg_overlap, runtime); +	} +	prev->sched_class->put_prev_task(rq, prev); +} +  /*   * Pick up the highest-prio task:   */  static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq)  {  	const struct sched_class *class;  	struct task_struct *p; @@ -4543,15 +4942,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)  /*   * schedule() is the main scheduler function.   */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void)  {  	struct task_struct *prev, *next;  	unsigned long *switch_count;  	struct rq *rq;  	int cpu; -need_resched: -	preempt_disable();  	cpu = smp_processor_id();  	rq = cpu_rq(cpu);  	rcu_qsctr_inc(cpu); @@ -4586,8 +4983,8 @@ need_resched_nonpreemptible:  	if (unlikely(!rq->nr_running))  		idle_balance(cpu, rq); -	prev->sched_class->put_prev_task(rq, prev); -	next = pick_next_task(rq, prev); +	put_prev_task(rq, prev); +	next = pick_next_task(rq);  	if (likely(prev != next)) {  		sched_info_switch(prev, next); @@ -4608,13 +5005,80 @@ need_resched_nonpreemptible:  	if (unlikely(reacquire_kernel_lock(current) < 0))  		goto need_resched_nonpreemptible; +} +asmlinkage void __sched schedule(void) +{ +need_resched: +	preempt_disable(); +	__schedule();  	preempt_enable_no_resched();  	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))  		goto need_resched;  }  EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +{ +	unsigned int cpu; +	struct rq *rq; + +	if (!sched_feat(OWNER_SPIN)) +		return 0; + +#ifdef CONFIG_DEBUG_PAGEALLOC +	/* +	 * Need to access the cpu field knowing that +	 * DEBUG_PAGEALLOC could have unmapped it if +	 * the mutex owner just released it and exited. +	 */ +	if (probe_kernel_address(&owner->cpu, cpu)) +		goto out; +#else +	cpu = owner->cpu; +#endif + +	/* +	 * Even if the access succeeded (likely case), +	 * the cpu field may no longer be valid. +	 */ +	if (cpu >= nr_cpumask_bits) +		goto out; + +	/* +	 * We need to validate that we can do a +	 * get_cpu() and that we have the percpu area. +	 */ +	if (!cpu_online(cpu)) +		goto out; + +	rq = cpu_rq(cpu); + +	for (;;) { +		/* +		 * Owner changed, break to re-assess state. +		 */ +		if (lock->owner != owner) +			break; + +		/* +		 * Is that owner really running on that cpu? 
+		 */ +		if (task_thread_info(rq->curr) != owner || need_resched()) +			return 0; + +		cpu_relax(); +	} +out: +	return 1; +} +#endif +  #ifdef CONFIG_PREEMPT  /*   * this is the entry point to schedule() from in-kernel preemption @@ -4642,7 +5106,7 @@ asmlinkage void __sched preempt_schedule(void)  		 * between schedule and now.  		 */  		barrier(); -	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); +	} while (need_resched());  }  EXPORT_SYMBOL(preempt_schedule); @@ -4671,7 +5135,7 @@ asmlinkage void __sched preempt_schedule_irq(void)  		 * between schedule and now.  		 */  		barrier(); -	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); +	} while (need_resched());  }  #endif /* CONFIG_PREEMPT */ @@ -4732,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)  	__wake_up_common(q, mode, 1, 0, NULL);  } +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ +	__wake_up_common(q, mode, 1, 0, key); +} +  /** - * __wake_up_sync - wake up threads blocked on a waitqueue. + * __wake_up_sync_key - wake up threads blocked on a waitqueue.   * @q: the waitqueue   * @mode: which threads   * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets   *   * The sync wakeup differs that the waker knows that it will schedule   * away soon, so while the target thread will be woken up, it will not @@ -4745,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)   *   * On UP it can prevent extra preemption.   */ -void -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, void *key)  {  	unsigned long flags;  	int sync = 1; @@ -4758,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)  		sync = 0;  	spin_lock_irqsave(&q->lock, flags); -	__wake_up_common(q, mode, nr_exclusive, sync, NULL); +	__wake_up_common(q, mode, nr_exclusive, sync, key);  	spin_unlock_irqrestore(&q->lock, flags);  } +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ +	__wake_up_sync_key(q, mode, nr_exclusive, NULL); +}  EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */  /** @@ -5145,7 +5624,7 @@ SYSCALL_DEFINE1(nice, int, increment)  	if (increment > 40)  		increment = 40; -	nice = PRIO_TO_NICE(current->static_prio) + increment; +	nice = TASK_NICE(current) + increment;  	if (nice < -20)  		nice = -20;  	if (nice > 19) @@ -5944,12 +6423,7 @@ void sched_show_task(struct task_struct *p)  		printk(KERN_CONT " %016lx ", thread_saved_pc(p));  #endif  #ifdef CONFIG_DEBUG_STACK_USAGE -	{ -		unsigned long *n = end_of_stack(p); -		while (!*n) -			n++; -		free = (unsigned long)n - (unsigned long)end_of_stack(p); -	} +	free = stack_not_used(p);  #endif  	printk(KERN_CONT "%5lu %5d %6d\n", free,  		task_pid_nr(p), task_pid_nr(p->real_parent)); @@ -6423,7 +6897,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)  		if (!rq->nr_running)  			break;  		update_rq_clock(rq); -		next = pick_next_task(rq, rq->curr); +		next = pick_next_task(rq);  		if (!next)  			break;  		next->sched_class->put_prev_task(rq, next); @@ -8218,11 +8692,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)  	__set_bit(MAX_RT_PRIO, array->bitmap);  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED -	rt_rq->highest_prio = 
MAX_RT_PRIO; +	rt_rq->highest_prio.curr = MAX_RT_PRIO; +#ifdef CONFIG_SMP +	rt_rq->highest_prio.next = MAX_RT_PRIO; +#endif  #endif  #ifdef CONFIG_SMP  	rt_rq->rt_nr_migratory = 0;  	rt_rq->overloaded = 0; +	plist_head_init(&rq->rt.pushable_tasks, &rq->lock);  #endif  	rt_rq->rt_time = 0; @@ -9224,6 +9702,16 @@ static int sched_rt_global_constraints(void)  	return ret;  } + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ +	/* Don't accept realtime tasks when there is no way for them to run */ +	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) +		return 0; + +	return 1; +} +  #else /* !CONFIG_RT_GROUP_SCHED */  static int sched_rt_global_constraints(void)  { @@ -9317,8 +9805,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,  		      struct task_struct *tsk)  {  #ifdef CONFIG_RT_GROUP_SCHED -	/* Don't accept realtime tasks when there is no way for them to run */ -	if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) +	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))  		return -EINVAL;  #else  	/* We don't support RT-tasks being in separate groups */ @@ -9481,7 +9968,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)  static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)  { -	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); +	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);  	u64 data;  #ifndef CONFIG_64BIT @@ -9500,7 +9987,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)  static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)  { -	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); +	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);  #ifndef CONFIG_64BIT  	/* @@ -9589,14 +10076,14 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)  	struct cpuacct *ca;  	int cpu; -	if (!cpuacct_subsys.active) +	if (unlikely(!cpuacct_subsys.active))  		return;  	cpu = task_cpu(tsk);  	ca = task_ca(tsk);  	for (; ca; ca = ca->parent) { -		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); +		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);  		*cpuusage += cputime;  	}  }  |
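The CONFIG_PREEMPT split above introduces two policies for taking a second runqueue lock while one is already held: a "fair" variant that simply drops and retakes both, and an "unfair" variant that tries the second lock opportunistically and only falls back to a fixed ordering on contention. The fragment below is a userspace sketch of that unfair deadlock-avoidance rule; pthread mutexes stand in for the rq spinlocks, and fake_rq / double_lock_balance_sketch are invented names for illustration, not kernel API.

#include <pthread.h>
#include <stdio.h>

struct fake_rq {
	pthread_mutex_t lock;
	unsigned long nr_running;
};

/* Returns 1 if this_rq->lock had to be dropped and re-acquired. */
static int double_lock_balance_sketch(struct fake_rq *this_rq,
				      struct fake_rq *busiest)
{
	int ret = 0;

	/* Fast path: the busiest lock is free, no ordering games needed. */
	if (pthread_mutex_trylock(&busiest->lock) != 0) {
		if (busiest < this_rq) {
			/*
			 * Wrong order: release our lock so both locks are
			 * always taken in ascending address order, which
			 * rules out AB-BA deadlock between two balancers.
			 */
			pthread_mutex_unlock(&this_rq->lock);
			pthread_mutex_lock(&busiest->lock);
			pthread_mutex_lock(&this_rq->lock);
			ret = 1;
		} else {
			pthread_mutex_lock(&busiest->lock);
		}
	}
	return ret;
}

int main(void)
{
	struct fake_rq a = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct fake_rq b = { PTHREAD_MUTEX_INITIALIZER, 1 };
	int dropped;

	pthread_mutex_lock(&a.lock);
	dropped = double_lock_balance_sketch(&a, &b);	/* holds both now */
	printf("state must be re-checked: %s\n", dropped ? "yes" : "no");
	pthread_mutex_unlock(&b.lock);
	pthread_mutex_unlock(&a.lock);
	return 0;
}

A return value of 1 signals that the first lock was released along the way, so anything the caller computed while holding it must be revalidated, which is the same contract the kernel callers rely on.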
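Several hunks add a plist_head of pushable tasks to each rt_rq, alongside the new highest_prio.{curr,next} pair, so the RT push/pull logic can pick its next candidate cheaply. The sketch below shows the underlying idea of a priority-sorted list in plain userspace C, where a lower prio value means a more urgent task and the head is always the best candidate; the node layout and function names are invented for illustration and do not match the kernel's <linux/plist.h>.

#include <stdio.h>

struct ptask {
	int prio;		/* 0..99, lower value = higher RT priority */
	const char *name;
	struct ptask *next;
};

/* Insert in ascending prio order; equal priorities keep FIFO order. */
static void pushable_insert(struct ptask **head, struct ptask *p)
{
	while (*head && (*head)->prio <= p->prio)
		head = &(*head)->next;
	p->next = *head;
	*head = p;
}

/* The head is the highest-priority pushable task, found in O(1). */
static struct ptask *pushable_first(struct ptask *head)
{
	return head;
}

int main(void)
{
	struct ptask a = { 10, "rt-a", NULL };
	struct ptask b = {  5, "rt-b", NULL };
	struct ptask c = { 50, "rt-c", NULL };
	struct ptask *head = NULL;

	pushable_insert(&head, &a);
	pushable_insert(&head, &b);
	pushable_insert(&head, &c);

	printf("next pushable: %s (prio %d)\n",
	       pushable_first(head)->name, pushable_first(head)->prio);
	return 0;
}

Keeping the list sorted at enqueue time is what lets the push side avoid scanning every queued RT task when it looks for work to hand to another CPU.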
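The wakeup hunks feed new samples into update_avg() for se.avg_wakeup and se.avg_overlap. The body of update_avg() lies outside this diff, so the sketch below assumes a typical exponentially-weighted running average (each sample pulls the average one-eighth of the way toward it) purely to show how the bookkeeping smooths noisy runtimes; the 1/8 shift factor, the seed value, and the sample numbers are assumptions, not quotes from this patch.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;
typedef int64_t s64;

/* Assumed weighting; relies on arithmetic right shift of the signed diff. */
static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = (s64)(sample - *avg);

	*avg += diff >> 3;	/* move 1/8 of the way toward the sample */
}

int main(void)
{
	/* Pretend per-wakeup runtimes, in nanoseconds (made-up values). */
	u64 samples[] = { 1000000, 800000, 1200000, 50000, 900000 };
	u64 avg_wakeup = 1000000;	/* arbitrary seed for the sketch */
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg_wakeup, samples[i]);
		printf("sample %llu -> avg %llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)avg_wakeup);
	}
	return 0;
}

The smoothed value reacts to a single outlier (the 50000 sample above) only gradually, which is why the scheduler can use such averages for heuristics like overlap detection without being whipsawed by one unusual wakeup.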