Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c          |   3
-rw-r--r--  kernel/fork.c            |   2
-rw-r--r--  kernel/sched/Makefile    |   1
-rw-r--r--  kernel/sched/core.c      | 254
-rw-r--r--  kernel/sched/cpuacct.c   | 296
-rw-r--r--  kernel/sched/cpuacct.h   |  17
-rw-r--r--  kernel/sched/cputime.c   | 214
-rw-r--r--  kernel/sched/fair.c      | 148
-rw-r--r--  kernel/sched/idle_task.c |  16
-rw-r--r--  kernel/sched/sched.h     | 219

10 files changed, 719 insertions(+), 451 deletions(-)
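
The largest piece of this series moves the cpuacct controller out of kernel/sched/core.c into its own file, kernel/sched/cpuacct.c, where cpuacct_charge() walks from a task's accounting group up through every ancestor and adds the elapsed runtime at each level. The following is a minimal userspace sketch of that hierarchical charging idea; the struct and helper names (cpuacct_sketch, cpuacct_charge_sketch) are illustrative stand-ins, not kernel APIs, and per-cpu storage, RCU and locking are deliberately omitted.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct cpuacct_sketch {
	const char *name;
	uint64_t cpuusage;              /* accumulated CPU time, nanoseconds */
	struct cpuacct_sketch *parent;  /* NULL for the root group */
};

/* Walk from the task's group up to the root, charging cputime at each level. */
static void cpuacct_charge_sketch(struct cpuacct_sketch *ca, uint64_t cputime)
{
	for (; ca; ca = ca->parent)
		ca->cpuusage += cputime;
}

int main(void)
{
	struct cpuacct_sketch root  = { "root",  0, NULL  };
	struct cpuacct_sketch grp   = { "grp",   0, &root };
	struct cpuacct_sketch child = { "child", 0, &grp  };

	cpuacct_charge_sketch(&child, 1000000);  /* 1 ms charged to child and ancestors */

	printf("%s=%llu %s=%llu %s=%llu\n",
	       child.name, (unsigned long long)child.cpuusage,
	       grp.name,   (unsigned long long)grp.cpuusage,
	       root.name,  (unsigned long long)root.cpuusage);
	return 0;
}

Note that the real cpuacct_account_field() in the diff below stops one level short of the root, because the caller (task_group_account_field() in cputime.c) has already updated the root cgroup's kernel_cpustat.
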
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index eeb7e49946b..d3abce2d645 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4380,7 +4380,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)  	 * need to invoke fork callbacks here. */  	BUG_ON(!list_empty(&init_task.tasks)); -	ss->active = 1;  	BUG_ON(online_css(ss, dummytop));  	mutex_unlock(&cgroup_mutex); @@ -4485,7 +4484,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  	}  	write_unlock(&css_set_lock); -	ss->active = 1;  	ret = online_css(ss, dummytop);  	if (ret)  		goto err_unload; @@ -4526,7 +4524,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)  	mutex_lock(&cgroup_mutex);  	offline_css(ss, dummytop); -	ss->active = 0;  	if (ss->use_id)  		idr_destroy(&ss->idr); diff --git a/kernel/fork.c b/kernel/fork.c index 1766d324d5e..339f60dfd62 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1233,7 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	p->utime = p->stime = p->gtime = 0;  	p->utimescaled = p->stimescaled = 0; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE  	p->prev_cputime.utime = p->prev_cputime.stime = 0;  #endif  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f06d249e103..deaf90e4a1d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o  obj-$(CONFIG_SCHEDSTATS) += stats.o  obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8285eb0cde..ebdb1954121 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1288,8 +1288,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)  static void  ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)  { -	trace_sched_wakeup(p, true);  	check_preempt_curr(rq, p, wake_flags); +	trace_sched_wakeup(p, true);  	p->state = TASK_RUNNING;  #ifdef CONFIG_SMP @@ -3039,11 +3039,13 @@ EXPORT_SYMBOL(preempt_schedule);  asmlinkage void __sched preempt_schedule_irq(void)  {  	struct thread_info *ti = current_thread_info(); +	enum ctx_state prev_state;  	/* Catch callers which need to be fixed */  	BUG_ON(ti->preempt_count || !irqs_disabled()); -	user_exit(); +	prev_state = exception_enter(); +  	do {  		add_preempt_count(PREEMPT_ACTIVE);  		local_irq_enable(); @@ -3057,6 +3059,8 @@ asmlinkage void __sched preempt_schedule_irq(void)  		 */  		barrier();  	} while (need_resched()); + +	exception_exit(prev_state);  }  #endif /* CONFIG_PREEMPT */ @@ -6204,7 +6208,7 @@ static void sched_init_numa(void)  	 * 'level' contains the number of unique distances, excluding the  	 * identity distance node_distance(i,i).  	 * -	 * The sched_domains_nume_distance[] array includes the actual distance +	 * The sched_domains_numa_distance[] array includes the actual distance  	 * numbers.  	 */ @@ -6817,11 +6821,15 @@ int in_sched_functions(unsigned long addr)  }  #ifdef CONFIG_CGROUP_SCHED +/* + * Default task group. + * Every task in system belongs to this group at bootup. 
+ */  struct task_group root_task_group;  LIST_HEAD(task_groups);  #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);  void __init sched_init(void)  { @@ -6858,7 +6866,7 @@ void __init sched_init(void)  #endif /* CONFIG_RT_GROUP_SCHED */  #ifdef CONFIG_CPUMASK_OFFSTACK  		for_each_possible_cpu(i) { -			per_cpu(load_balance_tmpmask, i) = (void *)ptr; +			per_cpu(load_balance_mask, i) = (void *)ptr;  			ptr += cpumask_size();  		}  #endif /* CONFIG_CPUMASK_OFFSTACK */ @@ -6884,12 +6892,6 @@ void __init sched_init(void)  #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT -	root_cpuacct.cpustat = &kernel_cpustat; -	root_cpuacct.cpuusage = alloc_percpu(u64); -	/* Too early, not expected to fail */ -	BUG_ON(!root_cpuacct.cpuusage); -#endif  	for_each_possible_cpu(i) {  		struct rq *rq; @@ -7411,7 +7413,7 @@ unlock:  	return err;  } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)  {  	u64 rt_runtime, rt_period; @@ -7423,7 +7425,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)  	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);  } -long sched_group_rt_runtime(struct task_group *tg) +static long sched_group_rt_runtime(struct task_group *tg)  {  	u64 rt_runtime_us; @@ -7435,7 +7437,7 @@ long sched_group_rt_runtime(struct task_group *tg)  	return rt_runtime_us;  } -int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)  {  	u64 rt_runtime, rt_period; @@ -7448,7 +7450,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)  	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);  } -long sched_group_rt_period(struct task_group *tg) +static long sched_group_rt_period(struct task_group *tg)  {  	u64 rt_period_us; @@ -7483,7 +7485,7 @@ static int sched_rt_global_constraints(void)  	return ret;  } -int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)  {  	/* Don't accept realtime tasks when there is no way for them to run */  	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) @@ -7991,226 +7993,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {  #endif	/* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). 
- */ - -struct cpuacct root_cpuacct; - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) -{ -	struct cpuacct *ca; - -	if (!cgrp->parent) -		return &root_cpuacct.css; - -	ca = kzalloc(sizeof(*ca), GFP_KERNEL); -	if (!ca) -		goto out; - -	ca->cpuusage = alloc_percpu(u64); -	if (!ca->cpuusage) -		goto out_free_ca; - -	ca->cpustat = alloc_percpu(struct kernel_cpustat); -	if (!ca->cpustat) -		goto out_free_cpuusage; - -	return &ca->css; - -out_free_cpuusage: -	free_percpu(ca->cpuusage); -out_free_ca: -	kfree(ca); -out: -	return ERR_PTR(-ENOMEM); -} - -/* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) -{ -	struct cpuacct *ca = cgroup_ca(cgrp); - -	free_percpu(ca->cpustat); -	free_percpu(ca->cpuusage); -	kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ -	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); -	u64 data; - -#ifndef CONFIG_64BIT -	/* -	 * Take rq->lock to make 64-bit read safe on 32-bit platforms. -	 */ -	raw_spin_lock_irq(&cpu_rq(cpu)->lock); -	data = *cpuusage; -	raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else -	data = *cpuusage; -#endif - -	return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ -	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT -	/* -	 * Take rq->lock to make 64-bit write safe on 32-bit platforms. -	 */ -	raw_spin_lock_irq(&cpu_rq(cpu)->lock); -	*cpuusage = val; -	raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else -	*cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ -	struct cpuacct *ca = cgroup_ca(cgrp); -	u64 totalcpuusage = 0; -	int i; - -	for_each_present_cpu(i) -		totalcpuusage += cpuacct_cpuusage_read(ca, i); - -	return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, -								u64 reset) -{ -	struct cpuacct *ca = cgroup_ca(cgrp); -	int err = 0; -	int i; - -	if (reset) { -		err = -EINVAL; -		goto out; -	} - -	for_each_present_cpu(i) -		cpuacct_cpuusage_write(ca, i, 0); - -out: -	return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, -				   struct seq_file *m) -{ -	struct cpuacct *ca = cgroup_ca(cgroup); -	u64 percpu; -	int i; - -	for_each_present_cpu(i) { -		percpu = cpuacct_cpuusage_read(ca, i); -		seq_printf(m, "%llu ", (unsigned long long) percpu); -	} -	seq_printf(m, "\n"); -	return 0; -} - -static const char *cpuacct_stat_desc[] = { -	[CPUACCT_STAT_USER] = "user", -	[CPUACCT_STAT_SYSTEM] = "system", -}; - -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, -			      struct cgroup_map_cb *cb) -{ -	struct cpuacct *ca = cgroup_ca(cgrp); -	int cpu; -	s64 val = 0; - -	for_each_online_cpu(cpu) { -		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); -		val += kcpustat->cpustat[CPUTIME_USER]; -		val += kcpustat->cpustat[CPUTIME_NICE]; -	} -	val = cputime64_to_clock_t(val); -	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - -	val = 0; -	for_each_online_cpu(cpu) { -		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); -		val += kcpustat->cpustat[CPUTIME_SYSTEM]; -		val += kcpustat->cpustat[CPUTIME_IRQ]; -		val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; -	} - -	val = cputime64_to_clock_t(val); -	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - -	return 0; -} - -static struct cftype files[] = { -	{ -		.name = 
"usage", -		.read_u64 = cpuusage_read, -		.write_u64 = cpuusage_write, -	}, -	{ -		.name = "usage_percpu", -		.read_seq_string = cpuacct_percpu_seq_read, -	}, -	{ -		.name = "stat", -		.read_map = cpuacct_stats_show, -	}, -	{ }	/* terminate */ -}; - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ -	struct cpuacct *ca; -	int cpu; - -	if (unlikely(!cpuacct_subsys.active)) -		return; - -	cpu = task_cpu(tsk); - -	rcu_read_lock(); - -	ca = task_ca(tsk); - -	for (; ca; ca = parent_ca(ca)) { -		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); -		*cpuusage += cputime; -	} - -	rcu_read_unlock(); -} - -struct cgroup_subsys cpuacct_subsys = { -	.name = "cpuacct", -	.css_alloc = cpuacct_css_alloc, -	.css_free = cpuacct_css_free, -	.subsys_id = cpuacct_subsys_id, -	.base_cftypes = files, -}; -#endif	/* CONFIG_CGROUP_CPUACCT */ -  void dump_cpu_task(int cpu)  {  	pr_info("Task dump for CPU %d:\n", cpu); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 00000000000..dbb7e2cd95e --- /dev/null +++ b/kernel/sched/cpuacct.c @@ -0,0 +1,296 @@ +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/cpumask.h> +#include <linux/seq_file.h> +#include <linux/rcupdate.h> +#include <linux/kernel_stat.h> +#include <linux/err.h> + +#include "sched.h" + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { +	CPUACCT_STAT_USER,	/* ... user mode */ +	CPUACCT_STAT_SYSTEM,	/* ... 
kernel mode */ + +	CPUACCT_STAT_NSTATS, +}; + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { +	struct cgroup_subsys_state css; +	/* cpuusage holds pointer to a u64-type object on every cpu */ +	u64 __percpu *cpuusage; +	struct kernel_cpustat __percpu *cpustat; +}; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ +	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), +			    struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ +	return container_of(task_subsys_state(tsk, cpuacct_subsys_id), +			    struct cpuacct, css); +} + +static inline struct cpuacct *__parent_ca(struct cpuacct *ca) +{ +	return cgroup_ca(ca->css.cgroup->parent); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ +	if (!ca->css.cgroup->parent) +		return NULL; +	return cgroup_ca(ca->css.cgroup->parent); +} + +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static struct cpuacct root_cpuacct = { +	.cpustat	= &kernel_cpustat, +	.cpuusage	= &root_cpuacct_cpuusage, +}; + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +{ +	struct cpuacct *ca; + +	if (!cgrp->parent) +		return &root_cpuacct.css; + +	ca = kzalloc(sizeof(*ca), GFP_KERNEL); +	if (!ca) +		goto out; + +	ca->cpuusage = alloc_percpu(u64); +	if (!ca->cpuusage) +		goto out_free_ca; + +	ca->cpustat = alloc_percpu(struct kernel_cpustat); +	if (!ca->cpustat) +		goto out_free_cpuusage; + +	return &ca->css; + +out_free_cpuusage: +	free_percpu(ca->cpuusage); +out_free_ca: +	kfree(ca); +out: +	return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void cpuacct_css_free(struct cgroup *cgrp) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); + +	free_percpu(ca->cpustat); +	free_percpu(ca->cpuusage); +	kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ +	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); +	u64 data; + +#ifndef CONFIG_64BIT +	/* +	 * Take rq->lock to make 64-bit read safe on 32-bit platforms. +	 */ +	raw_spin_lock_irq(&cpu_rq(cpu)->lock); +	data = *cpuusage; +	raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else +	data = *cpuusage; +#endif + +	return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ +	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT +	/* +	 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 
+	 */ +	raw_spin_lock_irq(&cpu_rq(cpu)->lock); +	*cpuusage = val; +	raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else +	*cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); +	u64 totalcpuusage = 0; +	int i; + +	for_each_present_cpu(i) +		totalcpuusage += cpuacct_cpuusage_read(ca, i); + +	return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, +								u64 reset) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); +	int err = 0; +	int i; + +	if (reset) { +		err = -EINVAL; +		goto out; +	} + +	for_each_present_cpu(i) +		cpuacct_cpuusage_write(ca, i, 0); + +out: +	return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, +				   struct seq_file *m) +{ +	struct cpuacct *ca = cgroup_ca(cgroup); +	u64 percpu; +	int i; + +	for_each_present_cpu(i) { +		percpu = cpuacct_cpuusage_read(ca, i); +		seq_printf(m, "%llu ", (unsigned long long) percpu); +	} +	seq_printf(m, "\n"); +	return 0; +} + +static const char * const cpuacct_stat_desc[] = { +	[CPUACCT_STAT_USER] = "user", +	[CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, +			      struct cgroup_map_cb *cb) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); +	int cpu; +	s64 val = 0; + +	for_each_online_cpu(cpu) { +		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); +		val += kcpustat->cpustat[CPUTIME_USER]; +		val += kcpustat->cpustat[CPUTIME_NICE]; +	} +	val = cputime64_to_clock_t(val); +	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + +	val = 0; +	for_each_online_cpu(cpu) { +		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); +		val += kcpustat->cpustat[CPUTIME_SYSTEM]; +		val += kcpustat->cpustat[CPUTIME_IRQ]; +		val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; +	} + +	val = cputime64_to_clock_t(val); +	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + +	return 0; +} + +static struct cftype files[] = { +	{ +		.name = "usage", +		.read_u64 = cpuusage_read, +		.write_u64 = cpuusage_write, +	}, +	{ +		.name = "usage_percpu", +		.read_seq_string = cpuacct_percpu_seq_read, +	}, +	{ +		.name = "stat", +		.read_map = cpuacct_stats_show, +	}, +	{ }	/* terminate */ +}; + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +	struct cpuacct *ca; +	int cpu; + +	cpu = task_cpu(tsk); + +	rcu_read_lock(); + +	ca = task_ca(tsk); + +	while (true) { +		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); +		*cpuusage += cputime; + +		ca = parent_ca(ca); +		if (!ca) +			break; +	} + +	rcu_read_unlock(); +} + +/* + * Add user/system time to cpuacct. + * + * Note: it's the caller that updates the account of the root cgroup. 
+ */ +void cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ +	struct kernel_cpustat *kcpustat; +	struct cpuacct *ca; + +	rcu_read_lock(); +	ca = task_ca(p); +	while (ca != &root_cpuacct) { +		kcpustat = this_cpu_ptr(ca->cpustat); +		kcpustat->cpustat[index] += val; +		ca = __parent_ca(ca); +	} +	rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { +	.name		= "cpuacct", +	.css_alloc	= cpuacct_css_alloc, +	.css_free	= cpuacct_css_free, +	.subsys_id	= cpuacct_subsys_id, +	.base_cftypes	= files, +	.early_init	= 1, +}; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h new file mode 100644 index 00000000000..ed605624a5e --- /dev/null +++ b/kernel/sched/cpuacct.h @@ -0,0 +1,17 @@ +#ifdef CONFIG_CGROUP_CPUACCT + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); + +#else + +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +} + +static inline void +cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ +} + +#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e93cca92f38..ea32f02bf2c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -115,10 +115,6 @@ static int irqtime_account_si_update(void)  static inline void task_group_account_field(struct task_struct *p, int index,  					    u64 tmp)  { -#ifdef CONFIG_CGROUP_CPUACCT -	struct kernel_cpustat *kcpustat; -	struct cpuacct *ca; -#endif  	/*  	 * Since all updates are sure to touch the root cgroup, we  	 * get ourselves ahead and touch it first. If the root cgroup @@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,  	 */  	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp; -#ifdef CONFIG_CGROUP_CPUACCT -	if (unlikely(!cpuacct_subsys.active)) -		return; - -	rcu_read_lock(); -	ca = task_ca(p); -	while (ca && (ca != &root_cpuacct)) { -		kcpustat = this_cpu_ptr(ca->cpustat); -		kcpustat->cpustat[index] += tmp; -		ca = parent_ca(ca); -	} -	rcu_read_unlock(); -#endif +	cpuacct_account_field(p, index, tmp);  }  /* @@ -388,82 +372,10 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_  						struct rq *rq) {}  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -/* - * Account a single tick of cpu time. - * @p: the process that the cpu time gets accounted to - * @user_tick: indicates if the tick is a user or a system tick - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ -	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); -	struct rq *rq = this_rq(); - -	if (vtime_accounting_enabled()) -		return; - -	if (sched_clock_irqtime) { -		irqtime_account_process_tick(p, user_tick, rq); -		return; -	} - -	if (steal_account_process_tick()) -		return; - -	if (user_tick) -		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); -	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) -		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, -				    one_jiffy_scaled); -	else -		account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ -	account_steal_time(jiffies_to_cputime(ticks)); -} - -/* - * Account multiple ticks of idle time. 
- * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - -	if (sched_clock_irqtime) { -		irqtime_account_idle_ticks(ticks); -		return; -	} - -	account_idle_time(jiffies_to_cputime(ticks)); -} -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -  /*   * Use precise platform statistics if available:   */  #ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ -	*ut = p->utime; -	*st = p->stime; -} - -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ -	struct task_cputime cputime; - -	thread_group_cputime(p, &cputime); - -	*ut = cputime.utime; -	*st = cputime.stime; -}  #ifndef __ARCH_HAS_VTIME_TASK_SWITCH  void vtime_task_switch(struct task_struct *prev) @@ -518,21 +430,111 @@ void vtime_account_irq_enter(struct task_struct *tsk)  }  EXPORT_SYMBOL_GPL(vtime_account_irq_enter);  #endif /* __ARCH_HAS_VTIME_ACCOUNT */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ +	*ut = p->utime; +	*st = p->stime; +} -#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ +	struct task_cputime cputime; -static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) +	thread_group_cputime(p, &cputime); + +	*ut = cputime.utime; +	*st = cputime.stime; +} +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick)  { -	u64 temp = (__force u64) rtime; +	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); +	struct rq *rq = this_rq(); -	temp *= (__force u64) stime; +	if (vtime_accounting_enabled()) +		return; + +	if (sched_clock_irqtime) { +		irqtime_account_process_tick(p, user_tick, rq); +		return; +	} + +	if (steal_account_process_tick()) +		return; -	if (sizeof(cputime_t) == 4) -		temp = div_u64(temp, (__force u32) total); +	if (user_tick) +		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); +	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) +		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, +				    one_jiffy_scaled);  	else -		temp = div64_u64(temp, (__force u64) total); +		account_idle_time(cputime_one_jiffy); +} -	return (__force cputime_t) temp; +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ +	account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + +	if (sched_clock_irqtime) { +		irqtime_account_idle_ticks(ticks); +		return; +	} + +	account_idle_time(jiffies_to_cputime(ticks)); +} + +/* + * Perform (stime * rtime) / total with reduced chances + * of multiplication overflows by using smaller factors + * like quotient and remainders of divisions between + * rtime and total. + */ +static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) +{ +	u64 rem, res, scaled; + +	if (rtime >= total) { +		/* +		 * Scale up to rtime / total then add +		 * the remainder scaled to stime / total. 
+		 */ +		res = div64_u64_rem(rtime, total, &rem); +		scaled = stime * res; +		scaled += div64_u64(stime * rem, total); +	} else { +		/* +		 * Same in reverse: scale down to total / rtime +		 * then substract that result scaled to +		 * to the remaining part. +		 */ +		res = div64_u64_rem(total, rtime, &rem); +		scaled = div64_u64(stime, res); +		scaled -= div64_u64(scaled * rem, total); +	} + +	return (__force cputime_t) scaled;  }  /* @@ -545,6 +547,12 @@ static void cputime_adjust(struct task_cputime *curr,  {  	cputime_t rtime, stime, total; +	if (vtime_accounting_enabled()) { +		*ut = curr->utime; +		*st = curr->stime; +		return; +	} +  	stime = curr->stime;  	total = stime + curr->utime; @@ -560,10 +568,14 @@ static void cputime_adjust(struct task_cputime *curr,  	 */  	rtime = nsecs_to_cputime(curr->sum_exec_runtime); -	if (total) -		stime = scale_stime(stime, rtime, total); -	else +	if (!rtime) { +		stime = 0; +	} else if (!total) {  		stime = rtime; +	} else { +		stime = scale_stime((__force u64)stime, +				    (__force u64)rtime, (__force u64)total); +	}  	/*  	 * If the tick based count grows faster than the scheduler one, @@ -597,7 +609,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime  	thread_group_cputime(p, &cputime);  	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);  } -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN  static unsigned long long vtime_delta(struct task_struct *tsk) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a33e5986fc..8bf7081b1ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);   * Scheduling class tree data structure manipulation methods:   */ -static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) +static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)  { -	s64 delta = (s64)(vruntime - min_vruntime); +	s64 delta = (s64)(vruntime - max_vruntime);  	if (delta > 0) -		min_vruntime = vruntime; +		max_vruntime = vruntime; -	return min_vruntime; +	return max_vruntime;  }  static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) @@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)  			vruntime = min_vruntime(vruntime, se->vruntime);  	} +	/* ensure we never gain time by being placed backwards. */  	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);  #ifndef CONFIG_64BIT  	smp_wmb(); @@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)  }  /* - * We calculate the vruntime slice of a to be inserted task + * We calculate the vruntime slice of a to-be-inserted task.   *   * vs = s/w   */ @@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,  		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);  	} /* migrations, e.g. sleep=0 leave decay_count == 0 */  } + +/* + * Update the rq's load with the elapsed running time before entering + * idle. if the last scheduled task is not a CFS task, idle_enter will + * be the only way to update the runnable statistic. + */ +void idle_enter_fair(struct rq *this_rq) +{ +	update_rq_runnable_avg(this_rq, 1); +} + +/* + * Update the rq's load with the elapsed idle time before a task is + * scheduled. if the newly scheduled task is not a CFS task, idle_exit will + * be the only way to update the runnable statistic. 
+ */ +void idle_exit_fair(struct rq *this_rq) +{ +	update_rq_runnable_avg(this_rq, 0); +} +  #else  static inline void update_entity_load_avg(struct sched_entity *se,  					  int update_cfs_rq) {} @@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  	int tsk_cache_hot = 0;  	/*  	 * We do not migrate tasks that are: -	 * 1) running (obviously), or +	 * 1) throttled_lb_pair, or  	 * 2) cannot be migrated to this CPU due to cpus_allowed, or -	 * 3) are cache-hot on their current CPU. +	 * 3) running (obviously), or +	 * 4) are cache-hot on their current CPU.  	 */ +	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) +		return 0; +  	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { -		int new_dst_cpu; +		int cpu;  		schedstat_inc(p, se.statistics.nr_failed_migrations_affine); @@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))  			return 0; -		new_dst_cpu = cpumask_first_and(env->dst_grpmask, -						tsk_cpus_allowed(p)); -		if (new_dst_cpu < nr_cpu_ids) { -			env->flags |= LBF_SOME_PINNED; -			env->new_dst_cpu = new_dst_cpu; +		/* Prevent to re-select dst_cpu via env's cpus */ +		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { +			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { +				env->flags |= LBF_SOME_PINNED; +				env->new_dst_cpu = cpu; +				break; +			}  		} +  		return 0;  	} @@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);  	if (!tsk_cache_hot ||  		env->sd->nr_balance_failed > env->sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS +  		if (tsk_cache_hot) {  			schedstat_inc(env->sd, lb_hot_gained[env->idle]);  			schedstat_inc(p, se.statistics.nr_forced_migrations);  		} -#endif +  		return 1;  	} -	if (tsk_cache_hot) { -		schedstat_inc(p, se.statistics.nr_failed_migrations_hot); -		return 0; -	} -	return 1; +	schedstat_inc(p, se.statistics.nr_failed_migrations_hot); +	return 0;  }  /* @@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env)  	struct task_struct *p, *n;  	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { -		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) -			continue; -  		if (!can_migrate_task(p, env))  			continue; @@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env)  			break;  		} -		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) +		if (!can_migrate_task(p, env))  			goto next;  		load = task_h_load(p); @@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env)  		if ((load / 2) > env->imbalance)  			goto next; -		if (!can_migrate_task(p, env)) -			goto next; -  		move_task(p, env);  		pulled++;  		env->imbalance -= load; @@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd,  	return load_idx;  } -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)  {  	return SCHED_POWER_SCALE;  } @@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)  	return default_scale_freq_power(sd, cpu);  } -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)  {  	unsigned long weight = sd->span_weight;  	unsigned long smt_gain = sd->smt_gain; @@ -4270,7 +4290,7 @@ 
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)  	return default_scale_smt_power(sd, cpu);  } -unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_power(int cpu)  {  	struct rq *rq = cpu_rq(cpu);  	u64 total, available, age_stamp, avg; @@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,  #define MAX_PINNED_INTERVAL	512  /* Working cpumask for load_balance and load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);  static int need_active_balance(struct lb_env *env)  { @@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,  			int *balance)  {  	int ld_moved, cur_ld_moved, active_balance = 0; -	int lb_iterations, max_lb_iterations;  	struct sched_group *group;  	struct rq *busiest;  	unsigned long flags; -	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); +	struct cpumask *cpus = __get_cpu_var(load_balance_mask);  	struct lb_env env = {  		.sd		= sd, @@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,  		.cpus		= cpus,  	}; +	/* +	 * For NEWLY_IDLE load_balancing, we don't need to consider +	 * other cpus in our group +	 */ +	if (idle == CPU_NEWLY_IDLE) +		env.dst_grpmask = NULL; +  	cpumask_copy(cpus, cpu_active_mask); -	max_lb_iterations = cpumask_weight(env.dst_grpmask);  	schedstat_inc(sd, lb_count[idle]); @@ -5034,7 +5059,6 @@ redo:  	schedstat_add(sd, lb_imbalance[idle], env.imbalance);  	ld_moved = 0; -	lb_iterations = 1;  	if (busiest->nr_running > 1) {  		/*  		 * Attempt to move tasks. If find_busiest_group has found @@ -5061,17 +5085,17 @@ more_balance:  		double_rq_unlock(env.dst_rq, busiest);  		local_irq_restore(flags); -		if (env.flags & LBF_NEED_BREAK) { -			env.flags &= ~LBF_NEED_BREAK; -			goto more_balance; -		} -  		/*  		 * some other cpu did the load balance for us.  		 */  		if (cur_ld_moved && env.dst_cpu != smp_processor_id())  			resched_cpu(env.dst_cpu); +		if (env.flags & LBF_NEED_BREAK) { +			env.flags &= ~LBF_NEED_BREAK; +			goto more_balance; +		} +  		/*  		 * Revisit (affine) tasks on src_cpu that couldn't be moved to  		 * us and move them to an alternate dst_cpu in our sched_group @@ -5091,14 +5115,17 @@ more_balance:  		 * moreover subsequent load balance cycles should correct the  		 * excess load moved.  		 */ -		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && -				lb_iterations++ < max_lb_iterations) { +		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {  			env.dst_rq	 = cpu_rq(env.new_dst_cpu);  			env.dst_cpu	 = env.new_dst_cpu;  			env.flags	&= ~LBF_SOME_PINNED;  			env.loop	 = 0;  			env.loop_break	 = sched_nr_migrate_break; + +			/* Prevent to re-select dst_cpu via env's cpus */ +			cpumask_clear_cpu(env.dst_cpu, env.cpus); +  			/*  			 * Go back to "more_balance" rather than "redo" since we  			 * need to continue with same src_cpu. @@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)  	if (this_rq->avg_idle < sysctl_sched_migration_cost)  		return; -	update_rq_runnable_avg(this_rq, 1); -  	/*  	 * Drop the rq->lock, but keep IRQ/preempt disabled.  	 
*/ @@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)  	struct sched_domain *sd;  	int cpu = smp_processor_id(); -	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) -		return; -	clear_bit(NOHZ_IDLE, nohz_flags(cpu)); -  	rcu_read_lock(); -	for_each_domain(cpu, sd) +	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + +	if (!sd || !sd->nohz_idle) +		goto unlock; +	sd->nohz_idle = 0; + +	for (; sd; sd = sd->parent)  		atomic_inc(&sd->groups->sgp->nr_busy_cpus); +unlock:  	rcu_read_unlock();  } @@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void)  	struct sched_domain *sd;  	int cpu = smp_processor_id(); -	if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) -		return; -	set_bit(NOHZ_IDLE, nohz_flags(cpu)); -  	rcu_read_lock(); -	for_each_domain(cpu, sd) +	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + +	if (!sd || sd->nohz_idle) +		goto unlock; +	sd->nohz_idle = 1; + +	for (; sd; sd = sd->parent)  		atomic_dec(&sd->groups->sgp->nr_busy_cpus); +unlock:  	rcu_read_unlock();  } @@ -5468,7 +5499,7 @@ void update_max_interval(void)   * It checks each scheduling domain to see if it is due to be balanced,   * and initiates a balancing operation if so.   * - * Balancing parameters are set up in arch_init_sched_domains. + * Balancing parameters are set up in init_sched_domains.   */  static void rebalance_domains(int cpu, enum cpu_idle_type idle)  { @@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)  		if (time_after_eq(jiffies, sd->last_balance + interval)) {  			if (load_balance(cpu, rq, sd, idle, &balance)) {  				/* -				 * We've pulled tasks over so either we're no -				 * longer idle. +				 * The LBF_SOME_PINNED logic could have changed +				 * env->dst_cpu, so we can't know our idle +				 * state even if we migrated tasks. Update it.  				 */ -				idle = CPU_NOT_IDLE; +				idle = idle_cpu(cpu) ? 
CPU_IDLE : CPU_NOT_IDLE;  			}  			sd->last_balance = jiffies;  		} diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b6baf370cae..b8ce7732834 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)  {  	return task_cpu(p); /* IDLE tasks as never migrated */  } + +static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) +{ +	idle_exit_fair(rq); +} + +static void post_schedule_idle(struct rq *rq) +{ +	idle_enter_fair(rq); +}  #endif /* CONFIG_SMP */  /*   * Idle tasks are unconditionally rescheduled: @@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl  static struct task_struct *pick_next_task_idle(struct rq *rq)  {  	schedstat_inc(rq, sched_goidle); +#ifdef CONFIG_SMP +	/* Trigger the post schedule to do an idle_enter for CFS */ +	rq->post_schedule = 1; +#endif  	return rq->idle;  } @@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = {  #ifdef CONFIG_SMP  	.select_task_rq		= select_task_rq_idle, +	.pre_schedule		= pre_schedule_idle, +	.post_schedule		= post_schedule_idle,  #endif  	.set_curr_task          = set_curr_task_idle, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cc03cfdf469..4c225c4c711 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -7,6 +7,7 @@  #include <linux/stop_machine.h>  #include "cpupri.h" +#include "cpuacct.h"  extern __read_mostly int scheduler_running; @@ -33,6 +34,31 @@ extern __read_mostly int scheduler_running;   */  #define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the + * increased costs. + */ +#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */ +# define SCHED_LOAD_RESOLUTION	10 +# define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION) +# define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION) +#else +# define SCHED_LOAD_RESOLUTION	0 +# define scale_load(w)		(w) +# define scale_load_down(w)	(w) +#endif + +#define SCHED_LOAD_SHIFT	(10 + SCHED_LOAD_RESOLUTION) +#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT) +  #define NICE_0_LOAD		SCHED_LOAD_SCALE  #define NICE_0_SHIFT		SCHED_LOAD_SHIFT @@ -154,11 +180,6 @@ struct task_group {  #define MAX_SHARES	(1UL << 18)  #endif -/* Default task group. - *	Every task in system belong to this group at bootup. 
- */ -extern struct task_group root_task_group; -  typedef int (*tg_visitor)(struct task_group *, void *);  extern int walk_tg_tree_from(struct task_group *from, @@ -196,6 +217,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,  		struct sched_rt_entity *rt_se, int cpu,  		struct sched_rt_entity *parent); +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, +			       struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +#endif +  #else /* CONFIG_CGROUP_SCHED */  struct cfs_bandwidth { }; @@ -547,6 +580,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)  DECLARE_PER_CPU(struct sched_domain *, sd_llc);  DECLARE_PER_CPU(int, sd_llc_id); +struct sched_group_power { +	atomic_t ref; +	/* +	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a +	 * single CPU. +	 */ +	unsigned int power, power_orig; +	unsigned long next_update; +	/* +	 * Number of busy cpus in this group. +	 */ +	atomic_t nr_busy_cpus; + +	unsigned long cpumask[0]; /* iteration mask */ +}; + +struct sched_group { +	struct sched_group *next;	/* Must be a circular list */ +	atomic_t ref; + +	unsigned int group_weight; +	struct sched_group_power *sgp; + +	/* +	 * The CPUs this group covers. +	 * +	 * NOTE: this field is variable length. (Allocated dynamically +	 * by attaching extra space to the end of the structure, +	 * depending on how many CPUs the kernel has booted up with) +	 */ +	unsigned long cpumask[0]; +}; + +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ +	return to_cpumask(sg->cpumask); +} + +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ +	return to_cpumask(sg->sgp->cpumask); +} + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ +	return cpumask_first(sched_group_cpus(group)); +} +  extern int group_balance_cpu(struct sched_group *sg);  #endif /* CONFIG_SMP */ @@ -784,6 +873,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)  }  #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ +/* + * wake flags + */ +#define WF_SYNC		0x01		/* waker goes to sleep after wakeup */ +#define WF_FORK		0x02		/* child wakeup after fork */ +#define WF_MIGRATED	0x4		/* internal use, task got migrated */  static inline void update_load_add(struct load_weight *lw, unsigned long inc)  { @@ -856,14 +951,61 @@ static const u32 prio_to_wmult[40] = {   /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,  }; -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { -	CPUACCT_STAT_USER,	/* ... user mode */ -	CPUACCT_STAT_SYSTEM,	/* ... 
kernel mode */ +#define ENQUEUE_WAKEUP		1 +#define ENQUEUE_HEAD		2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING		0 +#endif -	CPUACCT_STAT_NSTATS, -}; +#define DEQUEUE_SLEEP		1 + +struct sched_class { +	const struct sched_class *next; + +	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); +	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); +	void (*yield_task) (struct rq *rq); +	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + +	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + +	struct task_struct * (*pick_next_task) (struct rq *rq); +	void (*put_prev_task) (struct rq *rq, struct task_struct *p); + +#ifdef CONFIG_SMP +	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); +	void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + +	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); +	void (*post_schedule) (struct rq *this_rq); +	void (*task_waking) (struct task_struct *task); +	void (*task_woken) (struct rq *this_rq, struct task_struct *task); + +	void (*set_cpus_allowed)(struct task_struct *p, +				 const struct cpumask *newmask); +	void (*rq_online)(struct rq *rq); +	void (*rq_offline)(struct rq *rq); +#endif + +	void (*set_curr_task) (struct rq *rq); +	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); +	void (*task_fork) (struct task_struct *p); + +	void (*switched_from) (struct rq *this_rq, struct task_struct *task); +	void (*switched_to) (struct rq *this_rq, struct task_struct *task); +	void (*prio_changed) (struct rq *this_rq, struct task_struct *task, +			     int oldprio); + +	unsigned int (*get_rr_interval) (struct rq *rq, +					 struct task_struct *task); + +#ifdef CONFIG_FAIR_GROUP_SCHED +	void (*task_move_group) (struct task_struct *p, int on_rq); +#endif +};  #define sched_class_highest (&stop_sched_class)  #define for_each_class(class) \ @@ -877,9 +1019,23 @@ extern const struct sched_class idle_sched_class;  #ifdef CONFIG_SMP +extern void update_group_power(struct sched_domain *sd, int cpu); +  extern void trigger_load_balance(struct rq *rq, int cpu);  extern void idle_balance(int this_cpu, struct rq *this_rq); +/* + * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg + * becomes useful in lb + */ +#if defined(CONFIG_FAIR_GROUP_SCHED) +extern void idle_enter_fair(struct rq *this_rq); +extern void idle_exit_fair(struct rq *this_rq); +#else +static inline void idle_enter_fair(struct rq *this_rq) {} +static inline void idle_exit_fair(struct rq *this_rq) {} +#endif +  #else	/* CONFIG_SMP */  static inline void idle_balance(int cpu, struct rq *rq) @@ -891,7 +1047,6 @@ static inline void idle_balance(int cpu, struct rq *rq)  extern void sysrq_sched_debug_show(void);  extern void sched_init_granularity(void);  extern void update_max_interval(void); -extern void update_group_power(struct sched_domain *sd, int cpu);  extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);  extern void init_sched_rt_class(void);  extern void init_sched_fair_class(void); @@ -904,45 +1059,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime  extern void update_idle_cpu_load(struct rq *this_rq); -#ifdef CONFIG_CGROUP_CPUACCT -#include <linux/cgroup.h> -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { -	struct cgroup_subsys_state css; -	/* cpuusage holds 
pointer to a u64-type object on every cpu */ -	u64 __percpu *cpuusage; -	struct kernel_cpustat __percpu *cpustat; -}; - -extern struct cgroup_subsys cpuacct_subsys; -extern struct cpuacct root_cpuacct; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ -	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), -			    struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ -	return container_of(task_subsys_state(tsk, cpuacct_subsys_id), -			    struct cpuacct, css); -} - -static inline struct cpuacct *parent_ca(struct cpuacct *ca) -{ -	if (!ca || !ca->css.cgroup->parent) -		return NULL; -	return cgroup_ca(ca->css.cgroup->parent); -} - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -#endif -  #ifdef CONFIG_PARAVIRT  static inline u64 steal_ticks(u64 steal)  { @@ -1187,7 +1303,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled);  enum rq_nohz_flag_bits {  	NOHZ_TICK_STOPPED,  	NOHZ_BALANCE_KICK, -	NOHZ_IDLE,  };  #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)  |
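
The kernel/sched/cputime.c hunk above also rewrites scale_stime() so that stime * rtime / total is computed from the quotient and remainder of rtime/total (or total/rtime) rather than as one large product, reducing the chance of 64-bit multiplication overflow. Below is a standalone userspace sketch of that scaling approach, assuming ordinary 64-bit division in place of div64_u64_rem(); the sample numbers are purely hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Approximate stime * rtime / total using smaller intermediate products. */
static uint64_t scale_stime_sketch(uint64_t stime, uint64_t rtime, uint64_t total)
{
	uint64_t res, rem, scaled;

	if (rtime >= total) {
		/* rtime = res * total + rem, so
		 * stime * rtime / total = stime * res + stime * rem / total */
		res = rtime / total;
		rem = rtime % total;
		scaled  = stime * res;
		scaled += stime * rem / total;
	} else {
		/* total = res * rtime + rem: scale stime down by the quotient,
		 * then subtract the share corresponding to the remainder. */
		res = total / rtime;
		rem = total % rtime;
		scaled  = stime / res;
		scaled -= scaled * rem / total;
	}
	return scaled;
}

int main(void)
{
	/* Hypothetical figures: 3 s of tick-based stime out of 10 s of ticks,
	 * with 7 s of precise runtime, scales to roughly 2.1 s of stime. */
	uint64_t stime = 3000000000ULL;
	uint64_t total = 10000000000ULL;
	uint64_t rtime = 7000000000ULL;

	printf("%llu\n",
	       (unsigned long long)scale_stime_sketch(stime, rtime, total));
	return 0;
}

As in the kernel version, the result is an approximation whenever the divisions truncate; the point of the split is only to keep each intermediate product comfortably inside 64 bits for realistic cputime values.
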