Diffstat (limited to 'kernel/sched')
 kernel/sched/Makefile     |   20
 kernel/sched/auto_group.c |  258
 kernel/sched/auto_group.h |   64
 kernel/sched/clock.c      |  350
 kernel/sched/core.c       | 8150
 kernel/sched/cpupri.c     |  241
 kernel/sched/cpupri.h     |   34
 kernel/sched/debug.c      |  510
 kernel/sched/fair.c       | 5596
 kernel/sched/features.h   |   70
 kernel/sched/idle_task.c  |   99
 kernel/sched/rt.c         | 2048
 kernel/sched/sched.h      | 1166
 kernel/sched/stats.c      |  111
 kernel/sched/stats.h      |  231
 kernel/sched/stop_task.c  |  108
 16 files changed, 19056 insertions(+), 0 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile new file mode 100644 index 00000000000..9a7dd35102a --- /dev/null +++ b/kernel/sched/Makefile @@ -0,0 +1,20 @@ +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_clock.o = -pg +endif + +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) +# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is +# needed for x86 only.  Why this used to be enabled for all architectures is beyond +# me.  I suspect most platforms don't need this, but until we know that for sure +# I turn this off for IA-64 only.  Andreas Schwab says it's also needed on m68k +# to get a correct value for the wait-channel (WCHAN in ps). --davidm +CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer +endif + +obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o +obj-$(CONFIG_SMP) += cpupri.o +obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o +obj-$(CONFIG_SCHEDSTATS) += stats.o +obj-$(CONFIG_SCHED_DEBUG) += debug.o + + diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c new file mode 100644 index 00000000000..e8a1f83ee0e --- /dev/null +++ b/kernel/sched/auto_group.c @@ -0,0 +1,258 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include "sched.h" + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/utsname.h> +#include <linux/security.h> +#include <linux/export.h> + +unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; +static struct autogroup autogroup_default; +static atomic_t autogroup_seq_nr; + +void __init autogroup_init(struct task_struct *init_task) +{ +	autogroup_default.tg = &root_task_group; +	kref_init(&autogroup_default.kref); +	init_rwsem(&autogroup_default.lock); +	init_task->signal->autogroup = &autogroup_default; +} + +void autogroup_free(struct task_group *tg) +{ +	kfree(tg->autogroup); +} + +static inline void autogroup_destroy(struct kref *kref) +{ +	struct autogroup *ag = container_of(kref, struct autogroup, kref); + +#ifdef CONFIG_RT_GROUP_SCHED +	/* We've redirected RT tasks to the root task group... */ +	ag->tg->rt_se = NULL; +	ag->tg->rt_rq = NULL; +#endif +	sched_destroy_group(ag->tg); +} + +static inline void autogroup_kref_put(struct autogroup *ag) +{ +	kref_put(&ag->kref, autogroup_destroy); +} + +static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) +{ +	kref_get(&ag->kref); +	return ag; +} + +static inline struct autogroup *autogroup_task_get(struct task_struct *p) +{ +	struct autogroup *ag; +	unsigned long flags; + +	if (!lock_task_sighand(p, &flags)) +		return autogroup_kref_get(&autogroup_default); + +	ag = autogroup_kref_get(p->signal->autogroup); +	unlock_task_sighand(p, &flags); + +	return ag; +} + +static inline struct autogroup *autogroup_create(void) +{ +	struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); +	struct task_group *tg; + +	if (!ag) +		goto out_fail; + +	tg = sched_create_group(&root_task_group); + +	if (IS_ERR(tg)) +		goto out_free; + +	kref_init(&ag->kref); +	init_rwsem(&ag->lock); +	ag->id = atomic_inc_return(&autogroup_seq_nr); +	ag->tg = tg; +#ifdef CONFIG_RT_GROUP_SCHED +	/* +	 * Autogroup RT tasks are redirected to the root task group +	 * so we don't have to move tasks around upon policy change, +	 * or flail around trying to allocate bandwidth on the fly. +	 * A bandwidth exception in __sched_setscheduler() allows +	 * the policy change to proceed.  Thereafter, task_group() +	 * returns &root_task_group, so zero bandwidth is required. 
+	 */ +	free_rt_sched_group(tg); +	tg->rt_se = root_task_group.rt_se; +	tg->rt_rq = root_task_group.rt_rq; +#endif +	tg->autogroup = ag; + +	return ag; + +out_free: +	kfree(ag); +out_fail: +	if (printk_ratelimit()) { +		printk(KERN_WARNING "autogroup_create: %s failure.\n", +			ag ? "sched_create_group()" : "kmalloc()"); +	} + +	return autogroup_kref_get(&autogroup_default); +} + +bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) +{ +	if (tg != &root_task_group) +		return false; + +	if (p->sched_class != &fair_sched_class) +		return false; + +	/* +	 * We can only assume the task group can't go away on us if +	 * autogroup_move_group() can see us on ->thread_group list. +	 */ +	if (p->flags & PF_EXITING) +		return false; + +	return true; +} + +static void +autogroup_move_group(struct task_struct *p, struct autogroup *ag) +{ +	struct autogroup *prev; +	struct task_struct *t; +	unsigned long flags; + +	BUG_ON(!lock_task_sighand(p, &flags)); + +	prev = p->signal->autogroup; +	if (prev == ag) { +		unlock_task_sighand(p, &flags); +		return; +	} + +	p->signal->autogroup = autogroup_kref_get(ag); + +	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) +		goto out; + +	t = p; +	do { +		sched_move_task(t); +	} while_each_thread(p, t); + +out: +	unlock_task_sighand(p, &flags); +	autogroup_kref_put(prev); +} + +/* Allocates GFP_KERNEL, cannot be called under any spinlock */ +void sched_autogroup_create_attach(struct task_struct *p) +{ +	struct autogroup *ag = autogroup_create(); + +	autogroup_move_group(p, ag); +	/* drop extra reference added by autogroup_create() */ +	autogroup_kref_put(ag); +} +EXPORT_SYMBOL(sched_autogroup_create_attach); + +/* Cannot be called under siglock.  Currently has no users */ +void sched_autogroup_detach(struct task_struct *p) +{ +	autogroup_move_group(p, &autogroup_default); +} +EXPORT_SYMBOL(sched_autogroup_detach); + +void sched_autogroup_fork(struct signal_struct *sig) +{ +	sig->autogroup = autogroup_task_get(current); +} + +void sched_autogroup_exit(struct signal_struct *sig) +{ +	autogroup_kref_put(sig->autogroup); +} + +static int __init setup_autogroup(char *str) +{ +	sysctl_sched_autogroup_enabled = 0; + +	return 1; +} + +__setup("noautogroup", setup_autogroup); + +#ifdef CONFIG_PROC_FS + +int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +{ +	static unsigned long next = INITIAL_JIFFIES; +	struct autogroup *ag; +	int err; + +	if (*nice < -20 || *nice > 19) +		return -EINVAL; + +	err = security_task_setnice(current, *nice); +	if (err) +		return err; + +	if (*nice < 0 && !can_nice(current, *nice)) +		return -EPERM; + +	/* this is a heavy operation taking global locks.. 
*/ +	if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) +		return -EAGAIN; + +	next = HZ / 10 + jiffies; +	ag = autogroup_task_get(p); + +	down_write(&ag->lock); +	err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); +	if (!err) +		ag->nice = *nice; +	up_write(&ag->lock); + +	autogroup_kref_put(ag); + +	return err; +} + +void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) +{ +	struct autogroup *ag = autogroup_task_get(p); + +	if (!task_group_is_autogroup(ag->tg)) +		goto out; + +	down_read(&ag->lock); +	seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); +	up_read(&ag->lock); + +out: +	autogroup_kref_put(ag); +} +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SCHED_DEBUG +int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ +	if (!task_group_is_autogroup(tg)) +		return 0; + +	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); +} +#endif /* CONFIG_SCHED_DEBUG */ + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h new file mode 100644 index 00000000000..8bd04714281 --- /dev/null +++ b/kernel/sched/auto_group.h @@ -0,0 +1,64 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include <linux/kref.h> +#include <linux/rwsem.h> + +struct autogroup { +	/* +	 * reference doesn't mean how many thread attach to this +	 * autogroup now. It just stands for the number of task +	 * could use this autogroup. +	 */ +	struct kref		kref; +	struct task_group	*tg; +	struct rw_semaphore	lock; +	unsigned long		id; +	int			nice; +}; + +extern void autogroup_init(struct task_struct *init_task); +extern void autogroup_free(struct task_group *tg); + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ +	return !!tg->autogroup; +} + +extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ +	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + +	if (enabled && task_wants_autogroup(p, tg)) +		return p->signal->autogroup->tg; + +	return tg; +} + +extern int autogroup_path(struct task_group *tg, char *buf, int buflen); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) {  } +static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ +	return 0; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ +	return tg; +} + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ +	return 0; +} +#endif + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c new file mode 100644 index 00000000000..c685e31492d --- /dev/null +++ b/kernel/sched/clock.c @@ -0,0 +1,350 @@ +/* + * sched_clock for unstable cpu clocks + * + *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + *  Updates and enhancements: + *    Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com> + * + * Based on code by: + *   Ingo Molnar <mingo@redhat.com> + *   Guillaume Chazarain <guichaz@gmail.com> + * + * + * What: + * + * cpu_clock(i) provides a fast (execution time) high resolution + * clock with bounded drift between CPUs. The value of cpu_clock(i) + * is monotonic for constant i. The timestamp returned is in nanoseconds. 
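[Editorial illustration, not part of the patch: auto_group.c ties the lifetime of struct autogroup to a kref (autogroup_kref_get/autogroup_kref_put) with autogroup_destroy() as the release callback, recovering the outer object via container_of. A minimal userspace sketch of that pattern, using C11 atomics and a hand-rolled container_of; my_kref, my_group and friends are made-up names, not kernel API.]

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct my_kref { atomic_int refcount; };

static void my_kref_init(struct my_kref *k) { atomic_init(&k->refcount, 1); }
static void my_kref_get(struct my_kref *k)  { atomic_fetch_add(&k->refcount, 1); }

/* Drop one reference; run release() when the last reference goes away. */
static void my_kref_put(struct my_kref *k, void (*release)(struct my_kref *))
{
        if (atomic_fetch_sub(&k->refcount, 1) == 1)
                release(k);
}

#define my_container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct my_group {
        struct my_kref kref;
        int id;
};

/* Analogue of autogroup_destroy(): recover the outer object and free it. */
static void my_group_release(struct my_kref *kref)
{
        struct my_group *g = my_container_of(kref, struct my_group, kref);
        printf("releasing group %d\n", g->id);
        free(g);
}

int main(void)
{
        struct my_group *g = malloc(sizeof(*g));
        if (!g)
                return 1;
        my_kref_init(&g->kref);
        g->id = 1;

        my_kref_get(&g->kref);                          /* second user */
        my_kref_put(&g->kref, my_group_release);        /* one reference left */
        my_kref_put(&g->kref, my_group_release);        /* last reference: released */
        return 0;
}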
+ * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !!                                                  # + * #################################################################### + * + * There is no strict promise about the base, although it tends to start + * at 0 on boot (but people really shouldn't rely on that). + * + * cpu_clock(i)       -- can be used from any context, including NMI. + * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) + * local_clock()      -- is cpu_clock() on the current cpu. + * + * How: + * + * The implementation either uses sched_clock() when + * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the + * sched_clock() is assumed to provide these properties (mostly it means + * the architecture provides a globally synchronized highres time source). + * + * Otherwise it tries to create a semi stable clock from a mixture of other + * clocks, including: + * + *  - GTOD (clock monotomic) + *  - sched_clock() + *  - explicit idle events + * + * We use GTOD as base and use sched_clock() deltas to improve resolution. The + * deltas are filtered to provide monotonicity and keeping it within an + * expected window. + * + * Furthermore, explicit sleep and wakeup hooks allow us to account for time + * that is otherwise invisible (TSC gets stopped). + * + * + * Notes: + * + * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things + * like cpufreq interrupts that can change the base clock (TSC) multiplier + * and cause funny jumps in time -- although the filtering provided by + * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it + * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on + * sched_clock(). + */ +#include <linux/spinlock.h> +#include <linux/hardirq.h> +#include <linux/export.h> +#include <linux/percpu.h> +#include <linux/ktime.h> +#include <linux/sched.h> + +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ +	return (unsigned long long)(jiffies - INITIAL_JIFFIES) +					* (NSEC_PER_SEC / HZ); +} +EXPORT_SYMBOL_GPL(sched_clock); + +__read_mostly int sched_clock_running; + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__read_mostly int sched_clock_stable; + +struct sched_clock_data { +	u64			tick_raw; +	u64			tick_gtod; +	u64			clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) +{ +	return &__get_cpu_var(sched_clock_data); +} + +static inline struct sched_clock_data *cpu_sdc(int cpu) +{ +	return &per_cpu(sched_clock_data, cpu); +} + +void sched_clock_init(void) +{ +	u64 ktime_now = ktime_to_ns(ktime_get()); +	int cpu; + +	for_each_possible_cpu(cpu) { +		struct sched_clock_data *scd = cpu_sdc(cpu); + +		scd->tick_raw = 0; +		scd->tick_gtod = ktime_now; +		scd->clock = ktime_now; +	} + +	sched_clock_running = 1; +} + +/* + * min, max except they take wrapping into account + */ + +static inline u64 wrap_min(u64 x, u64 y) +{ +	return (s64)(x - y) < 0 ? x : y; +} + +static inline u64 wrap_max(u64 x, u64 y) +{ +	return (s64)(x - y) > 0 ? 
x : y; +} + +/* + * update the percpu scd from the raw @now value + * + *  - filter out backward motion + *  - use the GTOD tick value to create a window to filter crazy TSC values + */ +static u64 sched_clock_local(struct sched_clock_data *scd) +{ +	u64 now, clock, old_clock, min_clock, max_clock; +	s64 delta; + +again: +	now = sched_clock(); +	delta = now - scd->tick_raw; +	if (unlikely(delta < 0)) +		delta = 0; + +	old_clock = scd->clock; + +	/* +	 * scd->clock = clamp(scd->tick_gtod + delta, +	 *		      max(scd->tick_gtod, scd->clock), +	 *		      scd->tick_gtod + TICK_NSEC); +	 */ + +	clock = scd->tick_gtod + delta; +	min_clock = wrap_max(scd->tick_gtod, old_clock); +	max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); + +	clock = wrap_max(clock, min_clock); +	clock = wrap_min(clock, max_clock); + +	if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) +		goto again; + +	return clock; +} + +static u64 sched_clock_remote(struct sched_clock_data *scd) +{ +	struct sched_clock_data *my_scd = this_scd(); +	u64 this_clock, remote_clock; +	u64 *ptr, old_val, val; + +	sched_clock_local(my_scd); +again: +	this_clock = my_scd->clock; +	remote_clock = scd->clock; + +	/* +	 * Use the opportunity that we have both locks +	 * taken to couple the two clocks: we take the +	 * larger time as the latest time for both +	 * runqueues. (this creates monotonic movement) +	 */ +	if (likely((s64)(remote_clock - this_clock) < 0)) { +		ptr = &scd->clock; +		old_val = remote_clock; +		val = this_clock; +	} else { +		/* +		 * Should be rare, but possible: +		 */ +		ptr = &my_scd->clock; +		old_val = this_clock; +		val = remote_clock; +	} + +	if (cmpxchg64(ptr, old_val, val) != old_val) +		goto again; + +	return val; +} + +/* + * Similar to cpu_clock(), but requires local IRQs to be disabled. + * + * See cpu_clock(). + */ +u64 sched_clock_cpu(int cpu) +{ +	struct sched_clock_data *scd; +	u64 clock; + +	WARN_ON_ONCE(!irqs_disabled()); + +	if (sched_clock_stable) +		return sched_clock(); + +	if (unlikely(!sched_clock_running)) +		return 0ull; + +	scd = cpu_sdc(cpu); + +	if (cpu != smp_processor_id()) +		clock = sched_clock_remote(scd); +	else +		clock = sched_clock_local(scd); + +	return clock; +} + +void sched_clock_tick(void) +{ +	struct sched_clock_data *scd; +	u64 now, now_gtod; + +	if (sched_clock_stable) +		return; + +	if (unlikely(!sched_clock_running)) +		return; + +	WARN_ON_ONCE(!irqs_disabled()); + +	scd = this_scd(); +	now_gtod = ktime_to_ns(ktime_get()); +	now = sched_clock(); + +	scd->tick_raw = now; +	scd->tick_gtod = now_gtod; +	sched_clock_local(scd); +} + +/* + * We are going deep-idle (irqs are disabled): + */ +void sched_clock_idle_sleep_event(void) +{ +	sched_clock_cpu(smp_processor_id()); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ +	if (timekeeping_suspended) +		return; + +	sched_clock_tick(); +	touch_softlockup_watchdog(); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); + +/* + * As outlined at the top, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !!                                                  
# + * #################################################################### + */ +u64 cpu_clock(int cpu) +{ +	u64 clock; +	unsigned long flags; + +	local_irq_save(flags); +	clock = sched_clock_cpu(cpu); +	local_irq_restore(flags); + +	return clock; +} + +/* + * Similar to cpu_clock() for the current cpu. Time will only be observed + * to be monotonic if care is taken to only compare timestampt taken on the + * same CPU. + * + * See cpu_clock(). + */ +u64 local_clock(void) +{ +	u64 clock; +	unsigned long flags; + +	local_irq_save(flags); +	clock = sched_clock_cpu(smp_processor_id()); +	local_irq_restore(flags); + +	return clock; +} + +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ + +void sched_clock_init(void) +{ +	sched_clock_running = 1; +} + +u64 sched_clock_cpu(int cpu) +{ +	if (unlikely(!sched_clock_running)) +		return 0; + +	return sched_clock(); +} + +u64 cpu_clock(int cpu) +{ +	return sched_clock_cpu(cpu); +} + +u64 local_clock(void) +{ +	return sched_clock_cpu(0); +} + +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ + +EXPORT_SYMBOL_GPL(cpu_clock); +EXPORT_SYMBOL_GPL(local_clock); diff --git a/kernel/sched/core.c b/kernel/sched/core.c new file mode 100644 index 00000000000..df00cb09263 --- /dev/null +++ b/kernel/sched/core.c @@ -0,0 +1,8150 @@ +/* + *  kernel/sched/core.c + * + *  Kernel scheduler and related syscalls + * + *  Copyright (C) 1991-2002  Linus Torvalds + * + *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and + *		make semaphores SMP safe + *  1998-11-19	Implemented schedule_timeout() and related stuff + *		by Andrea Arcangeli + *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar: + *		hybrid priority-list and round-robin design with + *		an array-switch method of distributing timeslices + *		and per-CPU runqueues.  Cleanups and useful suggestions + *		by Davide Libenzi, preemptible kernel bits by Robert Love. + *  2003-09-03	Interactivity tuning by Con Kolivas. + *  2004-04-02	Scheduler domains code by Nick Piggin + *  2007-04-15  Work begun on replacing all interactivity tuning with a + *              fair scheduling design by Con Kolivas. 
+ *  2007-05-05  Load balancing (smp-nice) and other improvements + *              by Peter Williams + *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith + *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri + *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins, + *              Thomas Gleixner, Mike Kravetz + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <linux/uaccess.h> +#include <linux/highmem.h> +#include <asm/mmu_context.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/completion.h> +#include <linux/kernel_stat.h> +#include <linux/debug_locks.h> +#include <linux/perf_event.h> +#include <linux/security.h> +#include <linux/notifier.h> +#include <linux/profile.h> +#include <linux/freezer.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/pid_namespace.h> +#include <linux/smp.h> +#include <linux/threads.h> +#include <linux/timer.h> +#include <linux/rcupdate.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/percpu.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/sysctl.h> +#include <linux/syscalls.h> +#include <linux/times.h> +#include <linux/tsacct_kern.h> +#include <linux/kprobes.h> +#include <linux/delayacct.h> +#include <linux/unistd.h> +#include <linux/pagemap.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/debugfs.h> +#include <linux/ctype.h> +#include <linux/ftrace.h> +#include <linux/slab.h> +#include <linux/init_task.h> + +#include <asm/tlb.h> +#include <asm/irq_regs.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif + +#include "sched.h" +#include "../workqueue_sched.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/sched.h> + +void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) +{ +	unsigned long delta; +	ktime_t soft, hard, now; + +	for (;;) { +		if (hrtimer_active(period_timer)) +			break; + +		now = hrtimer_cb_get_time(period_timer); +		hrtimer_forward(period_timer, now, period); + +		soft = hrtimer_get_softexpires(period_timer); +		hard = hrtimer_get_expires(period_timer); +		delta = ktime_to_ns(ktime_sub(hard, soft)); +		__hrtimer_start_range_ns(period_timer, soft, delta, +					 HRTIMER_MODE_ABS_PINNED, 0); +	} +} + +DEFINE_MUTEX(sched_domains_mutex); +DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +static void update_rq_clock_task(struct rq *rq, s64 delta); + +void update_rq_clock(struct rq *rq) +{ +	s64 delta; + +	if (rq->skip_clock_update > 0) +		return; + +	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; +	rq->clock += delta; +	update_rq_clock_task(rq, delta); +} + +/* + * Debugging: various feature bits + */ + +#define SCHED_FEAT(name, enabled)	\ +	(1UL << __SCHED_FEAT_##name) * enabled | + +const_debug unsigned int sysctl_sched_features = +#include "features.h" +	0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled)	\ +	#name , + +static __read_mostly char *sched_feat_names[] = { +#include "features.h" +	NULL +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ +	int i; + +	for (i = 0; i < __SCHED_FEAT_NR; i++) { +		if (!(sysctl_sched_features & (1UL << i))) +			seq_puts(m, "NO_"); +		seq_printf(m, "%s ", sched_feat_names[i]); +	} +	seq_puts(m, "\n"); + +	return 0; +} + +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true  jump_label_key_enabled +#define jump_label_key__false 
jump_label_key_disabled + +#define SCHED_FEAT(name, enabled)	\ +	jump_label_key__##enabled , + +struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ +	if (jump_label_enabled(&sched_feat_keys[i])) +		jump_label_dec(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ +	if (!jump_label_enabled(&sched_feat_keys[i])) +		jump_label_inc(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, +		size_t cnt, loff_t *ppos) +{ +	char buf[64]; +	char *cmp; +	int neg = 0; +	int i; + +	if (cnt > 63) +		cnt = 63; + +	if (copy_from_user(&buf, ubuf, cnt)) +		return -EFAULT; + +	buf[cnt] = 0; +	cmp = strstrip(buf); + +	if (strncmp(cmp, "NO_", 3) == 0) { +		neg = 1; +		cmp += 3; +	} + +	for (i = 0; i < __SCHED_FEAT_NR; i++) { +		if (strcmp(cmp, sched_feat_names[i]) == 0) { +			if (neg) { +				sysctl_sched_features &= ~(1UL << i); +				sched_feat_disable(i); +			} else { +				sysctl_sched_features |= (1UL << i); +				sched_feat_enable(i); +			} +			break; +		} +	} + +	if (i == __SCHED_FEAT_NR) +		return -EINVAL; + +	*ppos += cnt; + +	return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ +	return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { +	.open		= sched_feat_open, +	.write		= sched_feat_write, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; + +static __init int sched_init_debug(void) +{ +	debugfs_create_file("sched_features", 0644, NULL, NULL, +			&sched_feat_fops); + +	return 0; +} +late_initcall(sched_init_debug); +#endif /* CONFIG_SCHED_DEBUG */ + +/* + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +const_debug unsigned int sysctl_sched_nr_migrate = 32; + +/* + * period over which we average the RT time consumption, measured + * in ms. + * + * default: 1s + */ +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; + +/* + * period over which we measure -rt task cpu usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +__read_mostly int scheduler_running; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + + + +/* + * __task_rq_lock - lock the rq @p resides on. + */ +static inline struct rq *__task_rq_lock(struct task_struct *p) +	__acquires(rq->lock) +{ +	struct rq *rq; + +	lockdep_assert_held(&p->pi_lock); + +	for (;;) { +		rq = task_rq(p); +		raw_spin_lock(&rq->lock); +		if (likely(rq == task_rq(p))) +			return rq; +		raw_spin_unlock(&rq->lock); +	} +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 
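[Editorial illustration, not part of the patch: sched_feat_write() above toggles a feature bit by name, treating a leading "NO_" as "clear this bit". A standalone sketch of that parse-and-toggle step; the feature table and initial mask here are made up for the example.]

#include <stdio.h>
#include <string.h>

static const char *feat_names[] = { "GENTLE_FAIR_SLEEPERS", "START_DEBIT", "HRTICK", NULL };
static unsigned int feat_mask = 0x3;            /* first two features enabled */

/* Returns 0 on success, -1 if the name is unknown. */
static int feat_set(const char *cmp)
{
        int neg = 0, i;

        if (strncmp(cmp, "NO_", 3) == 0) {
                neg = 1;
                cmp += 3;
        }

        for (i = 0; feat_names[i]; i++) {
                if (strcmp(cmp, feat_names[i]) == 0) {
                        if (neg)
                                feat_mask &= ~(1U << i);
                        else
                                feat_mask |= 1U << i;
                        return 0;
                }
        }
        return -1;
}

int main(void)
{
        feat_set("NO_GENTLE_FAIR_SLEEPERS");
        feat_set("HRTICK");
        printf("mask = %#x\n", feat_mask);      /* 0x6: bit 0 cleared, bit 2 set */
        return 0;
}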
+ */ +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) +	__acquires(p->pi_lock) +	__acquires(rq->lock) +{ +	struct rq *rq; + +	for (;;) { +		raw_spin_lock_irqsave(&p->pi_lock, *flags); +		rq = task_rq(p); +		raw_spin_lock(&rq->lock); +		if (likely(rq == task_rq(p))) +			return rq; +		raw_spin_unlock(&rq->lock); +		raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +	} +} + +static void __task_rq_unlock(struct rq *rq) +	__releases(rq->lock) +{ +	raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) +	__releases(rq->lock) +	__releases(p->pi_lock) +{ +	raw_spin_unlock(&rq->lock); +	raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +} + +/* + * this_rq_lock - lock this runqueue and disable interrupts. + */ +static struct rq *this_rq_lock(void) +	__acquires(rq->lock) +{ +	struct rq *rq; + +	local_irq_disable(); +	rq = this_rq(); +	raw_spin_lock(&rq->lock); + +	return rq; +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * Its all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a + * reschedule event. + * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. + */ + +static void hrtick_clear(struct rq *rq) +{ +	if (hrtimer_active(&rq->hrtick_timer)) +		hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled. + */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ +	struct rq *rq = container_of(timer, struct rq, hrtick_timer); + +	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + +	raw_spin_lock(&rq->lock); +	update_rq_clock(rq); +	rq->curr->sched_class->task_tick(rq, rq->curr, 1); +	raw_spin_unlock(&rq->lock); + +	return HRTIMER_NORESTART; +} + +#ifdef CONFIG_SMP +/* + * called from hardirq (IPI) context + */ +static void __hrtick_start(void *arg) +{ +	struct rq *rq = arg; + +	raw_spin_lock(&rq->lock); +	hrtimer_restart(&rq->hrtick_timer); +	rq->hrtick_csd_pending = 0; +	raw_spin_unlock(&rq->lock); +} + +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +void hrtick_start(struct rq *rq, u64 delay) +{ +	struct hrtimer *timer = &rq->hrtick_timer; +	ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + +	hrtimer_set_expires(timer, time); + +	if (rq == this_rq()) { +		hrtimer_restart(timer); +	} else if (!rq->hrtick_csd_pending) { +		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); +		rq->hrtick_csd_pending = 1; +	} +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ +	int cpu = (int)(long)hcpu; + +	switch (action) { +	case CPU_UP_CANCELED: +	case CPU_UP_CANCELED_FROZEN: +	case CPU_DOWN_PREPARE: +	case CPU_DOWN_PREPARE_FROZEN: +	case CPU_DEAD: +	case CPU_DEAD_FROZEN: +		hrtick_clear(cpu_rq(cpu)); +		return NOTIFY_OK; +	} + +	return NOTIFY_DONE; +} + +static __init void init_hrtick(void) +{ +	hotcpu_notifier(hotplug_hrtick, 0); +} +#else +/* + * Called to set the hrtick timer state. 
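[Editorial illustration, not part of the patch: __task_rq_lock()/task_rq_lock() above have to lock a runqueue whose identity can change under them, so they lock, re-check task_rq(p), and retry on a mismatch. A userspace sketch of that lock-and-revalidate loop; struct queue and struct item are invented for the example.]

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct queue { pthread_mutex_t lock; };

struct item {
        struct queue *_Atomic home;     /* may be changed by whoever holds home->lock */
};

/* Lock the queue the item currently belongs to; retry if it moved meanwhile. */
static struct queue *item_lock_home(struct item *it)
{
        for (;;) {
                struct queue *q = it->home;     /* racy snapshot, validated below */

                pthread_mutex_lock(&q->lock);
                if (q == it->home)
                        return q;               /* still the right queue, keep the lock */
                pthread_mutex_unlock(&q->lock); /* moved: drop and try again */
        }
}

int main(void)
{
        struct queue q = { PTHREAD_MUTEX_INITIALIZER };
        struct item it = { &q };
        struct queue *locked = item_lock_home(&it);

        printf("locked %p\n", (void *)locked);
        pthread_mutex_unlock(&locked->lock);
        return 0;
}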
+ * + * called with rq->lock held and irqs disabled + */ +void hrtick_start(struct rq *rq, u64 delay) +{ +	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, +			HRTIMER_MODE_REL_PINNED, 0); +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SMP */ + +static void init_rq_hrtick(struct rq *rq) +{ +#ifdef CONFIG_SMP +	rq->hrtick_csd_pending = 0; + +	rq->hrtick_csd.flags = 0; +	rq->hrtick_csd.func = __hrtick_start; +	rq->hrtick_csd.info = rq; +#endif + +	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	rq->hrtick_timer.function = hrtick; +} +#else	/* CONFIG_SCHED_HRTICK */ +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +static inline void init_hrtick(void) +{ +} +#endif	/* CONFIG_SCHED_HRTICK */ + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + +void resched_task(struct task_struct *p) +{ +	int cpu; + +	assert_raw_spin_locked(&task_rq(p)->lock); + +	if (test_tsk_need_resched(p)) +		return; + +	set_tsk_need_resched(p); + +	cpu = task_cpu(p); +	if (cpu == smp_processor_id()) +		return; + +	/* NEED_RESCHED must be visible before we test polling */ +	smp_mb(); +	if (!tsk_is_polling(p)) +		smp_send_reschedule(cpu); +} + +void resched_cpu(int cpu) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long flags; + +	if (!raw_spin_trylock_irqsave(&rq->lock, flags)) +		return; +	resched_task(cpu_curr(cpu)); +	raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +#ifdef CONFIG_NO_HZ +/* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu.  This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ +	int cpu = smp_processor_id(); +	int i; +	struct sched_domain *sd; + +	rcu_read_lock(); +	for_each_domain(cpu, sd) { +		for_each_cpu(i, sched_domain_span(sd)) { +			if (!idle_cpu(i)) { +				cpu = i; +				goto unlock; +			} +		} +	} +unlock: +	rcu_read_unlock(); +	return cpu; +} +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ +	struct rq *rq = cpu_rq(cpu); + +	if (cpu == smp_processor_id()) +		return; + +	/* +	 * This is safe, as this function is called with the timer +	 * wheel base lock of (cpu) held. When the CPU is on the way +	 * to idle and has not yet set rq->curr to idle then it will +	 * be serialized on the timer wheel base lock and take the new +	 * timer into account automatically. +	 */ +	if (rq->curr != rq->idle) +		return; + +	/* +	 * We can set TIF_RESCHED on the idle task of the other CPU +	 * lockless. 
The worst case is that the other CPU runs the +	 * idle task through an additional NOOP schedule() +	 */ +	set_tsk_need_resched(rq->idle); + +	/* NEED_RESCHED must be visible before we test polling */ +	smp_mb(); +	if (!tsk_is_polling(rq->idle)) +		smp_send_reschedule(cpu); +} + +static inline bool got_nohz_idle_kick(void) +{ +	int cpu = smp_processor_id(); +	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); +} + +#else /* CONFIG_NO_HZ */ + +static inline bool got_nohz_idle_kick(void) +{ +	return false; +} + +#endif /* CONFIG_NO_HZ */ + +void sched_avg_update(struct rq *rq) +{ +	s64 period = sched_avg_period(); + +	while ((s64)(rq->clock - rq->age_stamp) > period) { +		/* +		 * Inline assembly required to prevent the compiler +		 * optimising this loop into a divmod call. +		 * See __iter_div_u64_rem() for another example of this. +		 */ +		asm("" : "+rm" (rq->age_stamp)); +		rq->age_stamp += period; +		rq->rt_avg /= 2; +	} +} + +#else /* !CONFIG_SMP */ +void resched_task(struct task_struct *p) +{ +	assert_raw_spin_locked(&task_rq(p)->lock); +	set_tsk_need_resched(p); +} +#endif /* CONFIG_SMP */ + +#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ +			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) +/* + * Iterate task_group tree rooted at *from, calling @down when first entering a + * node and @up when leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ +int walk_tg_tree_from(struct task_group *from, +			     tg_visitor down, tg_visitor up, void *data) +{ +	struct task_group *parent, *child; +	int ret; + +	parent = from; + +down: +	ret = (*down)(parent, data); +	if (ret) +		goto out; +	list_for_each_entry_rcu(child, &parent->children, siblings) { +		parent = child; +		goto down; + +up: +		continue; +	} +	ret = (*up)(parent, data); +	if (ret || parent == from) +		goto out; + +	child = parent; +	parent = parent->parent; +	if (parent) +		goto up; +out: +	return ret; +} + +int tg_nop(struct task_group *tg, void *data) +{ +	return 0; +} +#endif + +void update_cpu_load(struct rq *this_rq); + +static void set_load_weight(struct task_struct *p) +{ +	int prio = p->static_prio - MAX_RT_PRIO; +	struct load_weight *load = &p->se.load; + +	/* +	 * SCHED_IDLE tasks get minimal weight: +	 */ +	if (p->policy == SCHED_IDLE) { +		load->weight = scale_load(WEIGHT_IDLEPRIO); +		load->inv_weight = WMULT_IDLEPRIO; +		return; +	} + +	load->weight = scale_load(prio_to_weight[prio]); +	load->inv_weight = prio_to_wmult[prio]; +} + +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +{ +	update_rq_clock(rq); +	sched_info_queued(p); +	p->sched_class->enqueue_task(rq, p, flags); +} + +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +{ +	update_rq_clock(rq); +	sched_info_dequeued(p); +	p->sched_class->dequeue_task(rq, p, flags); +} + +/* + * activate_task - move a task to the runqueue. + */ +void activate_task(struct rq *rq, struct task_struct *p, int flags) +{ +	if (task_contributes_to_load(p)) +		rq->nr_uninterruptible--; + +	enqueue_task(rq, p, flags); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +void deactivate_task(struct rq *rq, struct task_struct *p, int flags) +{ +	if (task_contributes_to_load(p)) +		rq->nr_uninterruptible++; + +	dequeue_task(rq, p, flags); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. 
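[Editorial illustration, not part of the patch: sched_avg_update() above ages rq->rt_avg by halving it once per elapsed period, advancing age_stamp as it goes, with a signed-difference comparison that tolerates clock wraparound. The same decay loop in plain C, with made-up units; the kernel works in nanoseconds of rq->clock.]

#include <stdint.h>
#include <stdio.h>

#define PERIOD 1000                     /* illustrative period length */

static uint64_t age_stamp;              /* start of the current averaging period */
static uint64_t rt_avg;                 /* decays by half every full period */

static void avg_update(uint64_t now)
{
        while ((int64_t)(now - age_stamp) > PERIOD) {
                age_stamp += PERIOD;
                rt_avg /= 2;
        }
}

int main(void)
{
        rt_avg = 800;
        avg_update(3500);               /* three full periods elapsed: 800 -> 100 */
        printf("rt_avg = %llu, age_stamp = %llu\n",
               (unsigned long long)rt_avg, (unsigned long long)age_stamp);
        return 0;
}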
+ * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ +	sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ +	sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +static DEFINE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ +	__this_cpu_inc(irq_time_seq.sequence); +	smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ +	smp_wmb(); +	__this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ +	u64 irq_time; +	unsigned seq; + +	do { +		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); +		irq_time = per_cpu(cpu_softirq_time, cpu) + +			   per_cpu(cpu_hardirq_time, cpu); +	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + +	return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ +	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void account_system_vtime(struct task_struct *curr) +{ +	unsigned long flags; +	s64 delta; +	int cpu; + +	if (!sched_clock_irqtime) +		return; + +	local_irq_save(flags); + +	cpu = smp_processor_id(); +	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); +	__this_cpu_add(irq_start_time, delta); + +	irq_time_write_begin(); +	/* +	 * We do not account for softirq time from ksoftirqd here. +	 * We want to continue accounting softirq time to ksoftirqd thread +	 * in that case, so as not to confuse scheduler with a special task +	 * that do not consume any time, but still wants to run. +	 */ +	if (hardirq_count()) +		__this_cpu_add(cpu_hardirq_time, delta); +	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) +		__this_cpu_add(cpu_softirq_time, delta); + +	irq_time_write_end(); +	local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ +	if (unlikely(steal > NSEC_PER_SEC)) +		return div_u64(steal, TICK_NSEC); + +	return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +/* + * In theory, the compile should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... 
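[Editorial illustration, not part of the patch: on !CONFIG_64BIT the code above guards the two per-cpu u64 irq-time counters with a sequence counter (irq_time_write_begin/end and the retry loop in irq_time_read). A minimal single-writer sketch of that protocol in C11; it only demonstrates the retry logic, while the kernel version additionally relies on the smp_wmb()/read_seqcount barrier pairing shown in the patch, and a truly concurrent C11 port would need the data fields to be atomics as well.]

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;                         /* odd while a write is in progress */
static uint64_t hardirq_time, softirq_time;     /* the protected data */

static void write_begin(void) { atomic_fetch_add(&seq, 1); /* kernel: smp_wmb() here */ }
static void write_end(void)   { atomic_fetch_add(&seq, 1); /* kernel: smp_wmb() before this */ }

/* Reader: retry until an even, unchanged sequence brackets the reads. */
static uint64_t read_total(void)
{
        unsigned int start;
        uint64_t total;

        do {
                start = atomic_load(&seq);
                total = hardirq_time + softirq_time;
        } while ((start & 1) || start != atomic_load(&seq));

        return total;
}

int main(void)
{
        write_begin();
        hardirq_time += 5;
        softirq_time += 7;
        write_end();
        printf("total = %llu\n", (unsigned long long)read_total());
        return 0;
}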
+ */ +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +	s64 steal = 0, irq_delta = 0; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + +	/* +	 * Since irq_time is only updated on {soft,}irq_exit, we might run into +	 * this case when a previous update_rq_clock() happened inside a +	 * {soft,}irq region. +	 * +	 * When this happens, we stop ->clock_task and only update the +	 * prev_irq_time stamp to account for the part that fit, so that a next +	 * update will consume the rest. This ensures ->clock_task is +	 * monotonic. +	 * +	 * It does however cause some slight miss-attribution of {soft,}irq +	 * time, a more accurate solution would be to update the irq_time using +	 * the current rq->clock timestamp, except that would require using +	 * atomic ops. +	 */ +	if (irq_delta > delta) +		irq_delta = delta; + +	rq->prev_irq_time += irq_delta; +	delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING +	if (static_branch((¶virt_steal_rq_enabled))) { +		u64 st; + +		steal = paravirt_steal_clock(cpu_of(rq)); +		steal -= rq->prev_steal_time_rq; + +		if (unlikely(steal > delta)) +			steal = delta; + +		st = steal_ticks(steal); +		steal = st * TICK_NSEC; + +		rq->prev_steal_time_rq += steal; + +		delta -= steal; +	} +#endif + +	rq->clock_task += delta; + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +	if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) +		sched_rt_avg_update(rq, irq_delta + steal); +#endif +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int irqtime_account_hi_update(void) +{ +	u64 *cpustat = kcpustat_this_cpu->cpustat; +	unsigned long flags; +	u64 latest_ns; +	int ret = 0; + +	local_irq_save(flags); +	latest_ns = this_cpu_read(cpu_hardirq_time); +	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) +		ret = 1; +	local_irq_restore(flags); +	return ret; +} + +static int irqtime_account_si_update(void) +{ +	u64 *cpustat = kcpustat_this_cpu->cpustat; +	unsigned long flags; +	u64 latest_ns; +	int ret = 0; + +	local_irq_save(flags); +	latest_ns = this_cpu_read(cpu_softirq_time); +	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) +		ret = 1; +	local_irq_restore(flags); +	return ret; +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime	(0) + +#endif + +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ +	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; +	struct task_struct *old_stop = cpu_rq(cpu)->stop; + +	if (stop) { +		/* +		 * Make it appear like a SCHED_FIFO task, its something +		 * userspace knows about and won't get confused about. +		 * +		 * Also, it will make PI more or less work without too +		 * much confusion -- but then, stop work should not +		 * rely on PI working anyway. +		 */ +		sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + +		stop->sched_class = &stop_sched_class; +	} + +	cpu_rq(cpu)->stop = stop; + +	if (old_stop) { +		/* +		 * Reset it back to a normal scheduling class so that +		 * it can die in pieces. +		 */ +		old_stop->sched_class = &rt_sched_class; +	} +} + +/* + * __normal_prio - return the priority that is based on the static prio + */ +static inline int __normal_prio(struct task_struct *p) +{ +	return p->static_prio; +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. 
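[Editorial illustration, not part of the patch: __normal_prio()/normal_prio() here map an RT task to MAX_RT_PRIO-1 - rt_priority and leave a fair task at its static_prio. A small printer for the resulting unified 0..139 priority scale; the constants are written out for the example and live in the kernel headers, not in this patch.]

#include <stdio.h>

#define MAX_RT_PRIO     100
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)       /* nice 0 -> 120 */

static int normal_prio_rt(int rt_priority)      /* SCHED_FIFO / SCHED_RR */
{
        return MAX_RT_PRIO - 1 - rt_priority;   /* rt_priority 1..99 -> prio 98..0 */
}

static int normal_prio_fair(int nice)           /* SCHED_NORMAL */
{
        return NICE_TO_PRIO(nice);              /* nice -20..19 -> prio 100..139 */
}

int main(void)
{
        printf("FIFO rt_priority 99 -> prio %d\n", normal_prio_rt(99));   /* 0, highest */
        printf("FIFO rt_priority  1 -> prio %d\n", normal_prio_rt(1));    /* 98 */
        printf("nice -20           -> prio %d\n", normal_prio_fair(-20)); /* 100 */
        printf("nice   0           -> prio %d\n", normal_prio_fair(0));   /* 120 */
        printf("nice  19           -> prio %d\n", normal_prio_fair(19));  /* 139 */
        return 0;
}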
Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(struct task_struct *p) +{ +	int prio; + +	if (task_has_rt_policy(p)) +		prio = MAX_RT_PRIO-1 - p->rt_priority; +	else +		prio = __normal_prio(p); +	return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(struct task_struct *p) +{ +	p->normal_prio = normal_prio(p); +	/* +	 * If we are RT tasks or we were boosted to RT priority, +	 * keep the priority unchanged. Otherwise, update priority +	 * to the normal priority: +	 */ +	if (!rt_prio(p->prio)) +		return p->normal_prio; +	return p->prio; +} + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + */ +inline int task_curr(const struct task_struct *p) +{ +	return cpu_curr(task_cpu(p)) == p; +} + +static inline void check_class_changed(struct rq *rq, struct task_struct *p, +				       const struct sched_class *prev_class, +				       int oldprio) +{ +	if (prev_class != p->sched_class) { +		if (prev_class->switched_from) +			prev_class->switched_from(rq, p); +		p->sched_class->switched_to(rq, p); +	} else if (oldprio != p->prio) +		p->sched_class->prio_changed(rq, p, oldprio); +} + +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +{ +	const struct sched_class *class; + +	if (p->sched_class == rq->curr->sched_class) { +		rq->curr->sched_class->check_preempt_curr(rq, p, flags); +	} else { +		for_each_class(class) { +			if (class == rq->curr->sched_class) +				break; +			if (class == p->sched_class) { +				resched_task(rq->curr); +				break; +			} +		} +	} + +	/* +	 * A queue event has occurred, and we're going to schedule.  In +	 * this case, we can save a useless back to back clock update. +	 */ +	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) +		rq->skip_clock_update = 1; +} + +#ifdef CONFIG_SMP +void set_task_cpu(struct task_struct *p, unsigned int new_cpu) +{ +#ifdef CONFIG_SCHED_DEBUG +	/* +	 * We should never call set_task_cpu() on a blocked task, +	 * ttwu() will sort out the placement. +	 */ +	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && +			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + +#ifdef CONFIG_LOCKDEP +	/* +	 * The caller should hold either p->pi_lock or rq->lock, when changing +	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. +	 * +	 * sched_move_task() holds both and thus holding either pins the cgroup, +	 * see set_task_rq(). +	 * +	 * Furthermore, all task_rq users should acquire both locks, see +	 * task_rq_lock(). +	 */ +	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || +				      lockdep_is_held(&task_rq(p)->lock))); +#endif +#endif + +	trace_sched_migrate_task(p, new_cpu); + +	if (task_cpu(p) != new_cpu) { +		p->se.nr_migrations++; +		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); +	} + +	__set_task_cpu(p, new_cpu); +} + +struct migration_arg { +	struct task_struct *task; +	int dest_cpu; +}; + +static int migration_cpu_stop(void *data); + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change.  If it changes, i.e. 
@p might have woken up, + * then return zero.  When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count).  If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, long match_state) +{ +	unsigned long flags; +	int running, on_rq; +	unsigned long ncsw; +	struct rq *rq; + +	for (;;) { +		/* +		 * We do the initial early heuristics without holding +		 * any task-queue locks at all. We'll only try to get +		 * the runqueue lock when things look like they will +		 * work out! +		 */ +		rq = task_rq(p); + +		/* +		 * If the task is actively running on another CPU +		 * still, just relax and busy-wait without holding +		 * any locks. +		 * +		 * NOTE! Since we don't hold any locks, it's not +		 * even sure that "rq" stays as the right runqueue! +		 * But we don't care, since "task_running()" will +		 * return false if the runqueue has changed and p +		 * is actually now running somewhere else! +		 */ +		while (task_running(rq, p)) { +			if (match_state && unlikely(p->state != match_state)) +				return 0; +			cpu_relax(); +		} + +		/* +		 * Ok, time to look more closely! We need the rq +		 * lock now, to be *sure*. If we're wrong, we'll +		 * just go back and repeat. +		 */ +		rq = task_rq_lock(p, &flags); +		trace_sched_wait_task(p); +		running = task_running(rq, p); +		on_rq = p->on_rq; +		ncsw = 0; +		if (!match_state || p->state == match_state) +			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ +		task_rq_unlock(rq, p, &flags); + +		/* +		 * If it changed from the expected state, bail out now. +		 */ +		if (unlikely(!ncsw)) +			break; + +		/* +		 * Was it really running after all now that we +		 * checked with the proper locks actually held? +		 * +		 * Oops. Go back and try again.. +		 */ +		if (unlikely(running)) { +			cpu_relax(); +			continue; +		} + +		/* +		 * It's not enough that it's not actively running, +		 * it must be off the runqueue _entirely_, and not +		 * preempted! +		 * +		 * So if it was still runnable (but just not actively +		 * running right now), it's preempted, and we should +		 * yield - it could be a while. +		 */ +		if (unlikely(on_rq)) { +			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + +			set_current_state(TASK_UNINTERRUPTIBLE); +			schedule_hrtimeout(&to, HRTIMER_MODE_REL); +			continue; +		} + +		/* +		 * Ahh, all good. It wasn't running, and it wasn't +		 * runnable, which means that it will never become +		 * running in the future either. We're all done! +		 */ +		break; +	} + +	return ncsw; +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + * + * NOTE: this function doesn't have to take the runqueue lock, + * because all it wants to ensure is that the remote task enters + * the kernel. If the IPI races and the task has been migrated + * to another CPU then no harm is done and the purpose has been + * achieved as well. 
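[Editorial illustration, not part of the patch: wait_task_inactive() above returns p->nvcsw | LONG_MIN so a matched state always yields a nonzero cookie, while two cookies taken from a task that never ran in between still compare equal. A two-line demonstration of why the OR with LONG_MIN guarantees that.]

#include <limits.h>
#include <stdio.h>

int main(void)
{
        unsigned long nvcsw = 0;                        /* a fresh task may have 0 switches */
        unsigned long ncsw = nvcsw | LONG_MIN;          /* sets the MSB: cookie is never 0 */

        printf("ncsw = %#lx (nonzero even though nvcsw == 0)\n", ncsw);
        /* caller logic: 0 means "state changed"; equal nonzero cookies mean "never ran" */
        printf("same task, unchanged: %d\n", ncsw == (nvcsw | LONG_MIN));
        return 0;
}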
+ */ +void kick_process(struct task_struct *p) +{ +	int cpu; + +	preempt_disable(); +	cpu = task_cpu(p); +	if ((cpu != smp_processor_id()) && task_curr(p)) +		smp_send_reschedule(cpu); +	preempt_enable(); +} +EXPORT_SYMBOL_GPL(kick_process); +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_SMP +/* + * ->cpus_allowed is protected by both rq->lock and p->pi_lock + */ +static int select_fallback_rq(int cpu, struct task_struct *p) +{ +	int dest_cpu; +	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + +	/* Look for allowed, online CPU in same node. */ +	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) +		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) +			return dest_cpu; + +	/* Any allowed, online CPU? */ +	dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); +	if (dest_cpu < nr_cpu_ids) +		return dest_cpu; + +	/* No more Mr. Nice Guy. */ +	dest_cpu = cpuset_cpus_allowed_fallback(p); +	/* +	 * Don't tell them about moving exiting tasks or +	 * kernel threads (both mm NULL), since they never +	 * leave kernel. +	 */ +	if (p->mm && printk_ratelimit()) { +		printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", +				task_pid_nr(p), p->comm, cpu); +	} + +	return dest_cpu; +} + +/* + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. + */ +static inline +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +{ +	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + +	/* +	 * In order not to call set_task_cpu() on a blocking task we need +	 * to rely on ttwu() to place the task on a valid ->cpus_allowed +	 * cpu. +	 * +	 * Since this is common to all placement strategies, this lives here. +	 * +	 * [ this allows ->select_task() to simply return task_cpu(p) and +	 *   not worry about this generic constraint ] +	 */ +	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || +		     !cpu_online(cpu))) +		cpu = select_fallback_rq(task_cpu(p), p); + +	return cpu; +} + +static void update_avg(u64 *avg, u64 sample) +{ +	s64 diff = sample - *avg; +	*avg += diff >> 3; +} +#endif + +static void +ttwu_stat(struct task_struct *p, int cpu, int wake_flags) +{ +#ifdef CONFIG_SCHEDSTATS +	struct rq *rq = this_rq(); + +#ifdef CONFIG_SMP +	int this_cpu = smp_processor_id(); + +	if (cpu == this_cpu) { +		schedstat_inc(rq, ttwu_local); +		schedstat_inc(p, se.statistics.nr_wakeups_local); +	} else { +		struct sched_domain *sd; + +		schedstat_inc(p, se.statistics.nr_wakeups_remote); +		rcu_read_lock(); +		for_each_domain(this_cpu, sd) { +			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { +				schedstat_inc(sd, ttwu_wake_remote); +				break; +			} +		} +		rcu_read_unlock(); +	} + +	if (wake_flags & WF_MIGRATED) +		schedstat_inc(p, se.statistics.nr_wakeups_migrate); + +#endif /* CONFIG_SMP */ + +	schedstat_inc(rq, ttwu_count); +	schedstat_inc(p, se.statistics.nr_wakeups); + +	if (wake_flags & WF_SYNC) +		schedstat_inc(p, se.statistics.nr_wakeups_sync); + +#endif /* CONFIG_SCHEDSTATS */ +} + +static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +{ +	activate_task(rq, p, en_flags); +	p->on_rq = 1; + +	/* if a worker is waking up, notify workqueue */ +	if (p->flags & PF_WQ_WORKER) +		wq_worker_waking_up(p, cpu_of(rq)); +} + +/* + * Mark the task runnable and perform wakeup-preemption. 
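[Editorial illustration, not part of the patch: update_avg() above (used a little further on for rq->avg_idle) keeps a cheap exponential moving average, avg += (sample - avg) >> 3, so each new sample gets 1/8 weight. The same arithmetic standalone.]

#include <stdint.h>
#include <stdio.h>

/* Move 1/8 of the distance to each new sample, as in update_avg(). */
static void update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = sample - *avg;
        *avg += diff >> 3;
}

int main(void)
{
        uint64_t avg_idle = 0;
        uint64_t samples[] = { 800, 800, 800, 800 };

        for (unsigned i = 0; i < 4; i++) {
                update_avg(&avg_idle, samples[i]);
                printf("after sample %u: avg = %llu\n",
                       i, (unsigned long long)avg_idle);      /* 100, 187, 263, 330 ... */
        }
        return 0;
}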
+ */ +static void +ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +{ +	trace_sched_wakeup(p, true); +	check_preempt_curr(rq, p, wake_flags); + +	p->state = TASK_RUNNING; +#ifdef CONFIG_SMP +	if (p->sched_class->task_woken) +		p->sched_class->task_woken(rq, p); + +	if (rq->idle_stamp) { +		u64 delta = rq->clock - rq->idle_stamp; +		u64 max = 2*sysctl_sched_migration_cost; + +		if (delta > max) +			rq->avg_idle = max; +		else +			update_avg(&rq->avg_idle, delta); +		rq->idle_stamp = 0; +	} +#endif +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ +#ifdef CONFIG_SMP +	if (p->sched_contributes_to_load) +		rq->nr_uninterruptible--; +#endif + +	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); +	ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ +	struct rq *rq; +	int ret = 0; + +	rq = __task_rq_lock(p); +	if (p->on_rq) { +		ttwu_do_wakeup(rq, p, wake_flags); +		ret = 1; +	} +	__task_rq_unlock(rq); + +	return ret; +} + +#ifdef CONFIG_SMP +static void sched_ttwu_pending(void) +{ +	struct rq *rq = this_rq(); +	struct llist_node *llist = llist_del_all(&rq->wake_list); +	struct task_struct *p; + +	raw_spin_lock(&rq->lock); + +	while (llist) { +		p = llist_entry(llist, struct task_struct, wake_entry); +		llist = llist_next(llist); +		ttwu_do_activate(rq, p, 0); +	} + +	raw_spin_unlock(&rq->lock); +} + +void scheduler_ipi(void) +{ +	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) +		return; + +	/* +	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since +	 * traditionally all their work was done from the interrupt return +	 * path. Now that we actually do some work, we need to make sure +	 * we do call them. +	 * +	 * Some archs already do call them, luckily irq_enter/exit nest +	 * properly. +	 * +	 * Arguably we should visit all archs and update all handlers, +	 * however a fair share of IPIs are still resched only so this would +	 * somewhat pessimize the simple resched case. +	 */ +	irq_enter(); +	sched_ttwu_pending(); + +	/* +	 * Check if someone kicked us for doing the nohz idle load balance. 
+	 */ +	if (unlikely(got_nohz_idle_kick() && !need_resched())) { +		this_rq()->idle_balance = 1; +		raise_softirq_irqoff(SCHED_SOFTIRQ); +	} +	irq_exit(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu) +{ +	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) +		smp_send_reschedule(cpu); +} + +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +static int ttwu_activate_remote(struct task_struct *p, int wake_flags) +{ +	struct rq *rq; +	int ret = 0; + +	rq = __task_rq_lock(p); +	if (p->on_cpu) { +		ttwu_activate(rq, p, ENQUEUE_WAKEUP); +		ttwu_do_wakeup(rq, p, wake_flags); +		ret = 1; +	} +	__task_rq_unlock(rq); + +	return ret; + +} +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ + +static inline int ttwu_share_cache(int this_cpu, int that_cpu) +{ +	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); +} +#endif /* CONFIG_SMP */ + +static void ttwu_queue(struct task_struct *p, int cpu) +{ +	struct rq *rq = cpu_rq(cpu); + +#if defined(CONFIG_SMP) +	if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { +		sched_clock_cpu(cpu); /* sync clocks x-cpu */ +		ttwu_queue_remote(p, cpu); +		return; +	} +#endif + +	raw_spin_lock(&rq->lock); +	ttwu_do_activate(rq, p, 0); +	raw_spin_unlock(&rq->lock); +} + +/** + * try_to_wake_up - wake up a thread + * @p: the thread to be awakened + * @state: the mask of task states that can be woken + * @wake_flags: wake modifier flags (WF_*) + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. + */ +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +{ +	unsigned long flags; +	int cpu, success = 0; + +	smp_wmb(); +	raw_spin_lock_irqsave(&p->pi_lock, flags); +	if (!(p->state & state)) +		goto out; + +	success = 1; /* we're going to change ->state */ +	cpu = task_cpu(p); + +	if (p->on_rq && ttwu_remote(p, wake_flags)) +		goto stat; + +#ifdef CONFIG_SMP +	/* +	 * If the owning (remote) cpu is still in the middle of schedule() with +	 * this task as prev, wait until its done referencing the task. +	 */ +	while (p->on_cpu) { +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +		/* +		 * In case the architecture enables interrupts in +		 * context_switch(), we cannot busy wait, since that +		 * would lead to deadlocks when an interrupt hits and +		 * tries to wake up @prev. So bail and do a complete +		 * remote wakeup. +		 */ +		if (ttwu_activate_remote(p, wake_flags)) +			goto stat; +#else +		cpu_relax(); +#endif +	} +	/* +	 * Pairs with the smp_wmb() in finish_lock_switch(). +	 */ +	smp_rmb(); + +	p->sched_contributes_to_load = !!task_contributes_to_load(p); +	p->state = TASK_WAKING; + +	if (p->sched_class->task_waking) +		p->sched_class->task_waking(p); + +	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); +	if (task_cpu(p) != cpu) { +		wake_flags |= WF_MIGRATED; +		set_task_cpu(p, cpu); +	} +#endif /* CONFIG_SMP */ + +	ttwu_queue(p, cpu); +stat: +	ttwu_stat(p, cpu, wake_flags); +out: +	raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +	return success; +} + +/** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not already there. 
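[Editorial illustration, not part of the patch: ttwu_queue_remote() above pushes the task onto cpu_rq(cpu)->wake_list with llist_add() and sends the IPI only when the list was empty, while sched_ttwu_pending() drains the whole list with llist_del_all(); the push is one compare-and-swap and the drain one exchange, so no lock is taken. A userspace sketch of that pair with C11 atomics; struct wake_node is invented for the example.]

#include <stdatomic.h>
#include <stdio.h>

struct wake_node {
        struct wake_node *next;
        int task_id;
};

static _Atomic(struct wake_node *) wake_list;

/* Returns 1 if the list was empty before (only then does the kernel send the IPI). */
static int wake_list_add(struct wake_node *n)
{
        struct wake_node *head = atomic_load(&wake_list);

        do {
                n->next = head;
        } while (!atomic_compare_exchange_weak(&wake_list, &head, n));

        return head == NULL;
}

/* Grab the whole list at once; entries come back in LIFO order. */
static struct wake_node *wake_list_del_all(void)
{
        return atomic_exchange(&wake_list, NULL);
}

int main(void)
{
        struct wake_node a = { .task_id = 1 }, b = { .task_id = 2 };

        printf("first add -> kick? %d\n", wake_list_add(&a));   /* 1 */
        printf("second add -> kick? %d\n", wake_list_add(&b));  /* 0 */

        for (struct wake_node *n = wake_list_del_all(); n; n = n->next)
                printf("draining task %d\n", n->task_id);
        return 0;
}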
The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ +	struct rq *rq = task_rq(p); + +	BUG_ON(rq != this_rq()); +	BUG_ON(p == current); +	lockdep_assert_held(&rq->lock); + +	if (!raw_spin_trylock(&p->pi_lock)) { +		raw_spin_unlock(&rq->lock); +		raw_spin_lock(&p->pi_lock); +		raw_spin_lock(&rq->lock); +	} + +	if (!(p->state & TASK_NORMAL)) +		goto out; + +	if (!p->on_rq) +		ttwu_activate(rq, p, ENQUEUE_WAKEUP); + +	ttwu_do_wakeup(rq, p, 0); +	ttwu_stat(p, smp_processor_id(), 0); +out: +	raw_spin_unlock(&p->pi_lock); +} + +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes.  Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +int wake_up_process(struct task_struct *p) +{ +	return try_to_wake_up(p, TASK_ALL, 0); +} +EXPORT_SYMBOL(wake_up_process); + +int wake_up_state(struct task_struct *p, unsigned int state) +{ +	return try_to_wake_up(p, state, 0); +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + * + * __sched_fork() is basic setup used by init_idle() too: + */ +static void __sched_fork(struct task_struct *p) +{ +	p->on_rq			= 0; + +	p->se.on_rq			= 0; +	p->se.exec_start		= 0; +	p->se.sum_exec_runtime		= 0; +	p->se.prev_sum_exec_runtime	= 0; +	p->se.nr_migrations		= 0; +	p->se.vruntime			= 0; +	INIT_LIST_HEAD(&p->se.group_node); + +#ifdef CONFIG_SCHEDSTATS +	memset(&p->se.statistics, 0, sizeof(p->se.statistics)); +#endif + +	INIT_LIST_HEAD(&p->rt.run_list); + +#ifdef CONFIG_PREEMPT_NOTIFIERS +	INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif +} + +/* + * fork()/clone()-time setup: + */ +void sched_fork(struct task_struct *p) +{ +	unsigned long flags; +	int cpu = get_cpu(); + +	__sched_fork(p); +	/* +	 * We mark the process as running here. This guarantees that +	 * nobody will actually run it, and a signal or other external +	 * event cannot wake it up and insert it on the runqueue either. +	 */ +	p->state = TASK_RUNNING; + +	/* +	 * Make sure we do not leak PI boosting priority to the child. +	 */ +	p->prio = current->normal_prio; + +	/* +	 * Revert to default priority/policy on fork if requested. +	 */ +	if (unlikely(p->sched_reset_on_fork)) { +		if (task_has_rt_policy(p)) { +			p->policy = SCHED_NORMAL; +			p->static_prio = NICE_TO_PRIO(0); +			p->rt_priority = 0; +		} else if (PRIO_TO_NICE(p->static_prio) < 0) +			p->static_prio = NICE_TO_PRIO(0); + +		p->prio = p->normal_prio = __normal_prio(p); +		set_load_weight(p); + +		/* +		 * We don't need the reset flag anymore after the fork. It has +		 * fulfilled its duty: +		 */ +		p->sched_reset_on_fork = 0; +	} + +	if (!rt_prio(p->prio)) +		p->sched_class = &fair_sched_class; + +	if (p->sched_class->task_fork) +		p->sched_class->task_fork(p); + +	/* +	 * The child is not yet in the pid-hash so no cgroup attach races, +	 * and the cgroup is pinned to this child due to cgroup_fork() +	 * is ran before sched_fork(). +	 * +	 * Silence PROVE_RCU. 
+	 */ +	raw_spin_lock_irqsave(&p->pi_lock, flags); +	set_task_cpu(p, cpu); +	raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +	if (likely(sched_info_on())) +		memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#if defined(CONFIG_SMP) +	p->on_cpu = 0; +#endif +#ifdef CONFIG_PREEMPT_COUNT +	/* Want to start with kernel preemption disabled. */ +	task_thread_info(p)->preempt_count = 1; +#endif +#ifdef CONFIG_SMP +	plist_node_init(&p->pushable_tasks, MAX_PRIO); +#endif + +	put_cpu(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +void wake_up_new_task(struct task_struct *p) +{ +	unsigned long flags; +	struct rq *rq; + +	raw_spin_lock_irqsave(&p->pi_lock, flags); +#ifdef CONFIG_SMP +	/* +	 * Fork balancing, do it here and not earlier because: +	 *  - cpus_allowed can change in the fork path +	 *  - any previously selected cpu might disappear through hotplug +	 */ +	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); +#endif + +	rq = __task_rq_lock(p); +	activate_task(rq, p, 0); +	p->on_rq = 1; +	trace_sched_wakeup_new(p, true); +	check_preempt_curr(rq, p, WF_FORK); +#ifdef CONFIG_SMP +	if (p->sched_class->task_woken) +		p->sched_class->task_woken(rq, p); +#endif +	task_rq_unlock(rq, p, &flags); +} + +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ +	hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister + * + * This is safe to call from within a preemption notifier. + */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ +	hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +	struct preempt_notifier *notifier; +	struct hlist_node *node; + +	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) +		notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, +				 struct task_struct *next) +{ +	struct preempt_notifier *notifier; +	struct hlist_node *node; + +	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) +		notifier->ops->sched_out(notifier, next); +} + +#else /* !CONFIG_PREEMPT_NOTIFIERS */ + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, +				 struct task_struct *next) +{ +} + +#endif /* CONFIG_PREEMPT_NOTIFIERS */ + +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. 
+ */ +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, +		    struct task_struct *next) +{ +	sched_info_switch(prev, next); +	perf_event_task_sched_out(prev, next); +	fire_sched_out_preempt_notifiers(prev, next); +	prepare_lock_switch(rq, next); +	prepare_arch_switch(next); +	trace_sched_switch(prev, next); +} + +/** + * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch + * @prev: the thread we just switched away from. + * + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static void finish_task_switch(struct rq *rq, struct task_struct *prev) +	__releases(rq->lock) +{ +	struct mm_struct *mm = rq->prev_mm; +	long prev_state; + +	rq->prev_mm = NULL; + +	/* +	 * A task struct has one reference for the use as "current". +	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls +	 * schedule one last time. The schedule call will never return, and +	 * the scheduled task must drop that reference. +	 * The test for TASK_DEAD must occur while the runqueue locks are +	 * still held, otherwise prev could be scheduled on another cpu, die +	 * there before we look at prev->state, and then the reference would +	 * be dropped twice. +	 *		Manfred Spraul <manfred@colorfullife.com> +	 */ +	prev_state = prev->state; +	finish_arch_switch(prev); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +	local_irq_disable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ +	perf_event_task_sched_in(prev, current); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +	local_irq_enable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ +	finish_lock_switch(rq, prev); +	trace_sched_stat_sleeptime(current, rq->clock); + +	fire_sched_in_preempt_notifiers(current); +	if (mm) +		mmdrop(mm); +	if (unlikely(prev_state == TASK_DEAD)) { +		/* +		 * Remove function-return probe instances associated with this +		 * task and put them back on the free list. +		 */ +		kprobe_flush_task(prev); +		put_task_struct(prev); +	} +} + +#ifdef CONFIG_SMP + +/* assumes rq->lock is held */ +static inline void pre_schedule(struct rq *rq, struct task_struct *prev) +{ +	if (prev->sched_class->pre_schedule) +		prev->sched_class->pre_schedule(rq, prev); +} + +/* rq->lock is NOT held, but preemption is disabled */ +static inline void post_schedule(struct rq *rq) +{ +	if (rq->post_schedule) { +		unsigned long flags; + +		raw_spin_lock_irqsave(&rq->lock, flags); +		if (rq->curr->sched_class->post_schedule) +			rq->curr->sched_class->post_schedule(rq); +		raw_spin_unlock_irqrestore(&rq->lock, flags); + +		rq->post_schedule = 0; +	} +} + +#else + +static inline void pre_schedule(struct rq *rq, struct task_struct *p) +{ +} + +static inline void post_schedule(struct rq *rq) +{ +} + +#endif + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage void schedule_tail(struct task_struct *prev) +	__releases(rq->lock) +{ +	struct rq *rq = this_rq(); + +	finish_task_switch(rq, prev); + +	/* +	 * FIXME: do we need to worry about rq being invalidated by the +	 * task_switch? 
+	 */ +	post_schedule(rq); + +#ifdef __ARCH_WANT_UNLOCKED_CTXSW +	/* In this case, finish_task_switch does not reenable preemption */ +	preempt_enable(); +#endif +	if (current->set_child_tid) +		put_user(task_pid_vnr(current), current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline void +context_switch(struct rq *rq, struct task_struct *prev, +	       struct task_struct *next) +{ +	struct mm_struct *mm, *oldmm; + +	prepare_task_switch(rq, prev, next); + +	mm = next->mm; +	oldmm = prev->active_mm; +	/* +	 * For paravirt, this is coupled with an exit in switch_to to +	 * combine the page table reload and the switch backend into +	 * one hypercall. +	 */ +	arch_start_context_switch(prev); + +	if (!mm) { +		next->active_mm = oldmm; +		atomic_inc(&oldmm->mm_count); +		enter_lazy_tlb(oldmm, next); +	} else +		switch_mm(oldmm, mm, next); + +	if (!prev->mm) { +		prev->active_mm = NULL; +		rq->prev_mm = oldmm; +	} +	/* +	 * Since the runqueue lock will be released by the next +	 * task (which is an invalid locking op but in the case +	 * of the scheduler it's an obvious special-case), so we +	 * do an early lockdep release here: +	 */ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +	spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#endif + +	/* Here we just switch the register state and the stack. */ +	switch_to(prev, next, prev); + +	barrier(); +	/* +	 * this_rq must be evaluated again because prev may have moved +	 * CPUs since it called schedule(), thus the 'rq' on its stack +	 * frame will be invalid. +	 */ +	finish_task_switch(this_rq(), prev); +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ +	unsigned long i, sum = 0; + +	for_each_online_cpu(i) +		sum += cpu_rq(i)->nr_running; + +	return sum; +} + +unsigned long nr_uninterruptible(void) +{ +	unsigned long i, sum = 0; + +	for_each_possible_cpu(i) +		sum += cpu_rq(i)->nr_uninterruptible; + +	/* +	 * Since we read the counters lockless, it might be slightly +	 * inaccurate. 
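+	 * For example, a task increments the nr_uninterruptible count of the
+	 * rq it blocks on, but a remote wakeup decrements the counter of the
+	 * rq the task is woken onto (see ttwu_do_activate()), so individual
+	 * per-rq counters can go negative and only the global sum is
+	 * meaningful.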
Do not allow it to go below zero though: +	 */ +	if (unlikely((long)sum < 0)) +		sum = 0; + +	return sum; +} + +unsigned long long nr_context_switches(void) +{ +	int i; +	unsigned long long sum = 0; + +	for_each_possible_cpu(i) +		sum += cpu_rq(i)->nr_switches; + +	return sum; +} + +unsigned long nr_iowait(void) +{ +	unsigned long i, sum = 0; + +	for_each_possible_cpu(i) +		sum += atomic_read(&cpu_rq(i)->nr_iowait); + +	return sum; +} + +unsigned long nr_iowait_cpu(int cpu) +{ +	struct rq *this = cpu_rq(cpu); +	return atomic_read(&this->nr_iowait); +} + +unsigned long this_cpu_load(void) +{ +	struct rq *this = this_rq(); +	return this->cpu_load[0]; +} + + +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +static long calc_load_fold_active(struct rq *this_rq) +{ +	long nr_active, delta = 0; + +	nr_active = this_rq->nr_running; +	nr_active += (long) this_rq->nr_uninterruptible; + +	if (nr_active != this_rq->calc_load_active) { +		delta = nr_active - this_rq->calc_load_active; +		this_rq->calc_load_active = nr_active; +	} + +	return delta; +} + +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ +	load *= exp; +	load += active * (FIXED_1 - exp); +	load += 1UL << (FSHIFT - 1); +	return load >> FSHIFT; +} + +#ifdef CONFIG_NO_HZ +/* + * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_tasks_idle; + +void calc_load_account_idle(struct rq *this_rq) +{ +	long delta; + +	delta = calc_load_fold_active(this_rq); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks_idle); +} + +static long calc_load_fold_idle(void) +{ +	long delta = 0; + +	/* +	 * Its got a race, we don't care... +	 */ +	if (atomic_long_read(&calc_load_tasks_idle)) +		delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + +	return delta; +} + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x:         base of the power + * @frac_bits: fractional bits of @x + * @n:         power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ +	unsigned long result = 1UL << frac_bits; + +	if (n) for (;;) { +		if (n & 1) { +			result *= x; +			result += 1UL << (frac_bits - 1); +			result >>= frac_bits; +		} +		n >>= 1; +		if (!n) +			break; +		x *= x; +		x += 1UL << (frac_bits - 1); +		x >>= frac_bits; +	} + +	return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + *    = (a0 * e + a * (1 - e)) * e + a * (1 - e) + *    = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + *  ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... 
+ e^n-1) [1] + *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + *    = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + *              n         1 - x^(n+1) + *     S_n := \Sum x^i = ------------- + *             i=0          1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, +	    unsigned long active, unsigned int n) +{ + +	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(unsigned long ticks) +{ +	long delta, active, n; + +	if (time_before(jiffies, calc_load_update)) +		return; + +	/* +	 * If we crossed a calc_load_update boundary, make sure to fold +	 * any pending idle changes, the respective CPUs might have +	 * missed the tick driven calc_load_account_active() update +	 * due to NO_HZ. +	 */ +	delta = calc_load_fold_idle(); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks); + +	/* +	 * If we were idle for multiple load cycles, apply them. +	 */ +	if (ticks >= LOAD_FREQ) { +		n = ticks / LOAD_FREQ; + +		active = atomic_long_read(&calc_load_tasks); +		active = active > 0 ? active * FIXED_1 : 0; + +		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); +		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); +		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + +		calc_load_update += n * LOAD_FREQ; +	} + +	/* +	 * Its possible the remainder of the above division also crosses +	 * a LOAD_FREQ period, the regular check in calc_global_load() +	 * which comes after this will take care of that. +	 * +	 * Consider us being 11 ticks before a cycle completion, and us +	 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will +	 * age us 4 cycles, and the test in calc_global_load() will +	 * pick up the final one. +	 */ +} +#else +void calc_load_account_idle(struct rq *this_rq) +{ +} + +static inline long calc_load_fold_idle(void) +{ +	return 0; +} + +static void calc_global_nohz(unsigned long ticks) +{ +} +#endif + +/** + * get_avenrun - get the load average array + * @loads:	pointer to dest load array + * @offset:	offset to add + * @shift:	shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ +	loads[0] = (avenrun[0] + offset) << shift; +	loads[1] = (avenrun[1] + offset) << shift; +	loads[2] = (avenrun[2] + offset) << shift; +} + +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(unsigned long ticks) +{ +	long active; + +	calc_global_nohz(ticks); + +	if (time_before(jiffies, calc_load_update + 10)) +		return; + +	active = atomic_long_read(&calc_load_tasks); +	active = active > 0 ? active * FIXED_1 : 0; + +	avenrun[0] = calc_load(avenrun[0], EXP_1, active); +	avenrun[1] = calc_load(avenrun[1], EXP_5, active); +	avenrun[2] = calc_load(avenrun[2], EXP_15, active); + +	calc_load_update += LOAD_FREQ; +} + +/* + * Called from update_cpu_load() to periodically update this CPU's + * active count. 
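+ * As an illustration, each runqueue folds its delta into calc_load_tasks
+ * once every LOAD_FREQ ticks (roughly five seconds), and calc_global_load()
+ * above turns that sum into the 1/5/15 minute avenrun[] averages ten ticks
+ * later.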
+ */ +static void calc_load_account_active(struct rq *this_rq) +{ +	long delta; + +	if (time_before(jiffies, this_rq->calc_load_update)) +		return; + +	delta  = calc_load_fold_active(this_rq); +	delta += calc_load_fold_idle(); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks); + +	this_rq->calc_load_update += LOAD_FREQ; +} + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT		7 +static const unsigned char +		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char +		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { +					{0, 0, 0, 0, 0, 0, 0, 0}, +					{64, 32, 8, 0, 0, 0, 0, 0}, +					{96, 72, 40, 12, 1, 0, 0}, +					{112, 98, 75, 43, 15, 1, 0}, +					{120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ +	int j = 0; + +	if (!missed_updates) +		return load; + +	if (missed_updates >= degrade_zero_ticks[idx]) +		return 0; + +	if (idx == 1) +		return load >> missed_updates; + +	while (missed_updates) { +		if (missed_updates % 2) +			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + +		missed_updates >>= 1; +		j++; +	} +	return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. 
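+ * Worked example of the rounding below (illustrative numbers): at idx 1
+ * (scale 2), old_load 9 and a steady this_load of 10 would give
+ * (9 + 10) >> 1 = 9 forever; bumping the new load by scale - 1 when it is
+ * rising yields (9 + 11) >> 1 = 10, so the average can actually reach 10.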
+ */ +void update_cpu_load(struct rq *this_rq) +{ +	unsigned long this_load = this_rq->load.weight; +	unsigned long curr_jiffies = jiffies; +	unsigned long pending_updates; +	int i, scale; + +	this_rq->nr_load_updates++; + +	/* Avoid repeated calls on same jiffy, when moving in and out of idle */ +	if (curr_jiffies == this_rq->last_load_update_tick) +		return; + +	pending_updates = curr_jiffies - this_rq->last_load_update_tick; +	this_rq->last_load_update_tick = curr_jiffies; + +	/* Update our load: */ +	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ +	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { +		unsigned long old_load, new_load; + +		/* scale is effectively 1 << i now, and >> i divides by scale */ + +		old_load = this_rq->cpu_load[i]; +		old_load = decay_load_missed(old_load, pending_updates - 1, i); +		new_load = this_load; +		/* +		 * Round up the averaging division if load is increasing. This +		 * prevents us from getting stuck on 9 if the load is 10, for +		 * example. +		 */ +		if (new_load > old_load) +			new_load += scale - 1; + +		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; +	} + +	sched_avg_update(this_rq); +} + +static void update_cpu_load_active(struct rq *this_rq) +{ +	update_cpu_load(this_rq); + +	calc_load_account_active(this_rq); +} + +#ifdef CONFIG_SMP + +/* + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. + */ +void sched_exec(void) +{ +	struct task_struct *p = current; +	unsigned long flags; +	int dest_cpu; + +	raw_spin_lock_irqsave(&p->pi_lock, flags); +	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); +	if (dest_cpu == smp_processor_id()) +		goto unlock; + +	if (likely(cpu_active(dest_cpu))) { +		struct migration_arg arg = { p, dest_cpu }; + +		raw_spin_unlock_irqrestore(&p->pi_lock, flags); +		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); +		return; +	} +unlock: +	raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +#endif + +DEFINE_PER_CPU(struct kernel_stat, kstat); +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); + +EXPORT_PER_CPU_SYMBOL(kstat); +EXPORT_PER_CPU_SYMBOL(kernel_cpustat); + +/* + * Return any ns on the sched_clock that have not yet been accounted in + * @p in case that task is currently running. + * + * Called with task_rq_lock() held on @rq. + */ +static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) +{ +	u64 ns = 0; + +	if (task_current(rq, p)) { +		update_rq_clock(rq); +		ns = rq->clock_task - p->se.exec_start; +		if ((s64)ns < 0) +			ns = 0; +	} + +	return ns; +} + +unsigned long long task_delta_exec(struct task_struct *p) +{ +	unsigned long flags; +	struct rq *rq; +	u64 ns = 0; + +	rq = task_rq_lock(p, &flags); +	ns = do_task_delta_exec(p, rq); +	task_rq_unlock(rq, p, &flags); + +	return ns; +} + +/* + * Return accounted runtime for the task. + * In case the task is currently running, return the runtime plus current's + * pending runtime that have not been accounted yet. 
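+ * (Used, for instance, by the posix CPU timer code when sampling a thread's
+ * scheduler clock, which is why the still-unaccounted delta matters.)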
+ */ +unsigned long long task_sched_runtime(struct task_struct *p) +{ +	unsigned long flags; +	struct rq *rq; +	u64 ns = 0; + +	rq = task_rq_lock(p, &flags); +	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); +	task_rq_unlock(rq, p, &flags); + +	return ns; +} + +#ifdef CONFIG_CGROUP_CPUACCT +struct cgroup_subsys cpuacct_subsys; +struct cpuacct root_cpuacct; +#endif + +static inline void task_group_account_field(struct task_struct *p, int index, +					    u64 tmp) +{ +#ifdef CONFIG_CGROUP_CPUACCT +	struct kernel_cpustat *kcpustat; +	struct cpuacct *ca; +#endif +	/* +	 * Since all updates are sure to touch the root cgroup, we +	 * get ourselves ahead and touch it first. If the root cgroup +	 * is the only cgroup, then nothing else should be necessary. +	 * +	 */ +	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + +#ifdef CONFIG_CGROUP_CPUACCT +	if (unlikely(!cpuacct_subsys.active)) +		return; + +	rcu_read_lock(); +	ca = task_ca(p); +	while (ca && (ca != &root_cpuacct)) { +		kcpustat = this_cpu_ptr(ca->cpustat); +		kcpustat->cpustat[index] += tmp; +		ca = parent_ca(ca); +	} +	rcu_read_unlock(); +#endif +} + + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_user_time(struct task_struct *p, cputime_t cputime, +		       cputime_t cputime_scaled) +{ +	int index; + +	/* Add user time to process. */ +	p->utime += cputime; +	p->utimescaled += cputime_scaled; +	account_group_user_time(p, cputime); + +	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + +	/* Add user time to cpustat. */ +	task_group_account_field(p, index, (__force u64) cputime); + +	/* Account for user time used */ +	acct_update_integrals(p); +} + +/* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, +			       cputime_t cputime_scaled) +{ +	u64 *cpustat = kcpustat_this_cpu->cpustat; + +	/* Add guest time to process. */ +	p->utime += cputime; +	p->utimescaled += cputime_scaled; +	account_group_user_time(p, cputime); +	p->gtime += cputime; + +	/* Add guest time to cpustat. */ +	if (TASK_NICE(p) > 0) { +		cpustat[CPUTIME_NICE] += (__force u64) cputime; +		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; +	} else { +		cpustat[CPUTIME_USER] += (__force u64) cputime; +		cpustat[CPUTIME_GUEST] += (__force u64) cputime; +	} +} + +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, +			cputime_t cputime_scaled, int index) +{ +	/* Add system time to process. */ +	p->stime += cputime; +	p->stimescaled += cputime_scaled; +	account_group_system_time(p, cputime); + +	/* Add system time to cpustat. */ +	task_group_account_field(p, index, (__force u64) cputime); + +	/* Account for system time used */ +	acct_update_integrals(p); +} + +/* + * Account system cpu time to a process. 
+ * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_system_time(struct task_struct *p, int hardirq_offset, +			 cputime_t cputime, cputime_t cputime_scaled) +{ +	int index; + +	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { +		account_guest_time(p, cputime, cputime_scaled); +		return; +	} + +	if (hardirq_count() - hardirq_offset) +		index = CPUTIME_IRQ; +	else if (in_serving_softirq()) +		index = CPUTIME_SOFTIRQ; +	else +		index = CPUTIME_SYSTEM; + +	__account_system_time(p, cputime, cputime_scaled, index); +} + +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ +	u64 *cpustat = kcpustat_this_cpu->cpustat; + +	cpustat[CPUTIME_STEAL] += (__force u64) cputime; +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ +	u64 *cpustat = kcpustat_this_cpu->cpustat; +	struct rq *rq = this_rq(); + +	if (atomic_read(&rq->nr_iowait) > 0) +		cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; +	else +		cpustat[CPUTIME_IDLE] += (__force u64) cputime; +} + +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT +	if (static_branch(¶virt_steal_enabled)) { +		u64 steal, st = 0; + +		steal = paravirt_steal_clock(smp_processor_id()); +		steal -= this_rq()->prev_steal_time; + +		st = steal_ticks(steal); +		this_rq()->prev_steal_time += st * TICK_NSEC; + +		account_steal_time(st); +		return st; +	} +#endif +	return false; +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + *   - check for guest_time + *   - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, +						struct rq *rq) +{ +	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); +	u64 *cpustat = kcpustat_this_cpu->cpustat; + +	if (steal_account_process_tick()) +		return; + +	if (irqtime_account_hi_update()) { +		cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; +	} else if (irqtime_account_si_update()) { +		cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; +	} else if (this_cpu_ksoftirqd() == p) { +		/* +		 * ksoftirqd time do not get accounted in cpu_softirq_time. +		 * So, we have to handle it separately here. +		 * Also, p->stime needs to be updated for ksoftirqd. 
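+		 * (E.g. a tick that lands while ksoftirqd is running is charged
+		 * below as CPUTIME_SOFTIRQ system time, which also advances
+		 * ksoftirqd's p->stime, unlike the irqtime branches above that
+		 * only touch the cpustat counters.)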
+		 */ +		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, +					CPUTIME_SOFTIRQ); +	} else if (user_tick) { +		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); +	} else if (p == rq->idle) { +		account_idle_time(cputime_one_jiffy); +	} else if (p->flags & PF_VCPU) { /* System time or guest time */ +		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); +	} else { +		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, +					CPUTIME_SYSTEM); +	} +} + +static void irqtime_account_idle_ticks(int ticks) +{ +	int i; +	struct rq *rq = this_rq(); + +	for (i = 0; i < ticks; i++) +		irqtime_account_process_tick(current, 0, rq); +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, +						struct rq *rq) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ +	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); +	struct rq *rq = this_rq(); + +	if (sched_clock_irqtime) { +		irqtime_account_process_tick(p, user_tick, rq); +		return; +	} + +	if (steal_account_process_tick()) +		return; + +	if (user_tick) +		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); +	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) +		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, +				    one_jiffy_scaled); +	else +		account_idle_time(cputime_one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ +	account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + +	if (sched_clock_irqtime) { +		irqtime_account_idle_ticks(ticks); +		return; +	} + +	account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ +	*ut = p->utime; +	*st = p->stime; +} + +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ +	struct task_cputime cputime; + +	thread_group_cputime(p, &cputime); + +	*ut = cputime.utime; +	*st = cputime.stime; +} +#else + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs) +#endif + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ +	cputime_t rtime, utime = p->utime, total = utime + p->stime; + +	/* +	 * Use CFS's precise accounting: +	 */ +	rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + +	if (total) { +		u64 temp = (__force u64) rtime; + +		temp *= (__force u64) utime; +		do_div(temp, (__force u32) total); +		utime = (__force cputime_t) temp; +	} else +		utime = rtime; + +	/* +	 * Compare with previous values, to keep monotonicity: +	 */ +	p->prev_utime = max(p->prev_utime, utime); +	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + +	*ut = p->prev_utime; +	*st = p->prev_stime; +} + +/* + * Must be called with siglock held. 
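+ * The rtime-based split below mirrors task_times() above. Illustrative
+ * numbers: with cputime.utime = 2, cputime.stime = 2 (total = 4) and a
+ * summed CFS runtime of 6 jiffies, utime becomes 6 * 2 / 4 = 3 and stime
+ * the remaining 3; the prev_utime/prev_stime clamps then keep both values
+ * monotonic between calls.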
+ */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ +	struct signal_struct *sig = p->signal; +	struct task_cputime cputime; +	cputime_t rtime, utime, total; + +	thread_group_cputime(p, &cputime); + +	total = cputime.utime + cputime.stime; +	rtime = nsecs_to_cputime(cputime.sum_exec_runtime); + +	if (total) { +		u64 temp = (__force u64) rtime; + +		temp *= (__force u64) cputime.utime; +		do_div(temp, (__force u32) total); +		utime = (__force cputime_t) temp; +	} else +		utime = rtime; + +	sig->prev_utime = max(sig->prev_utime, utime); +	sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); + +	*ut = sig->prev_utime; +	*st = sig->prev_stime; +} +#endif + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +void scheduler_tick(void) +{ +	int cpu = smp_processor_id(); +	struct rq *rq = cpu_rq(cpu); +	struct task_struct *curr = rq->curr; + +	sched_clock_tick(); + +	raw_spin_lock(&rq->lock); +	update_rq_clock(rq); +	update_cpu_load_active(rq); +	curr->sched_class->task_tick(rq, curr, 0); +	raw_spin_unlock(&rq->lock); + +	perf_event_task_tick(); + +#ifdef CONFIG_SMP +	rq->idle_balance = idle_cpu(cpu); +	trigger_load_balance(rq, cpu); +#endif +} + +notrace unsigned long get_parent_ip(unsigned long addr) +{ +	if (in_lock_functions(addr)) { +		addr = CALLER_ADDR2; +		if (in_lock_functions(addr)) +			addr = CALLER_ADDR3; +	} +	return addr; +} + +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ +				defined(CONFIG_PREEMPT_TRACER)) + +void __kprobes add_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT +	/* +	 * Underflow? +	 */ +	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) +		return; +#endif +	preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT +	/* +	 * Spinlock count overflowing soon? +	 */ +	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= +				PREEMPT_MASK - 10); +#endif +	if (preempt_count() == val) +		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +} +EXPORT_SYMBOL(add_preempt_count); + +void __kprobes sub_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT +	/* +	 * Underflow? +	 */ +	if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) +		return; +	/* +	 * Is the spinlock portion underflowing? +	 */ +	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && +			!(preempt_count() & PREEMPT_MASK))) +		return; +#endif + +	if (preempt_count() == val) +		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +	preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * Print scheduling while atomic bug: + */ +static noinline void __schedule_bug(struct task_struct *prev) +{ +	struct pt_regs *regs = get_irq_regs(); + +	if (oops_in_progress) +		return; + +	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", +		prev->comm, prev->pid, preempt_count()); + +	debug_show_held_locks(prev); +	print_modules(); +	if (irqs_disabled()) +		print_irqtrace_events(prev); + +	if (regs) +		show_regs(regs); +	else +		dump_stack(); +} + +/* + * Various schedule()-time debugging checks and statistics: + */ +static inline void schedule_debug(struct task_struct *prev) +{ +	/* +	 * Test if we are atomic. Since do_exit() needs to call into +	 * schedule() atomically, we ignore that path for now. +	 * Otherwise, whine if we are scheduling when we should not be. 
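+	 * A typical trigger: calling a sleeping primitive such as mutex_lock()
+	 * while holding a spinlock or with preemption otherwise disabled
+	 * reaches here with an elevated preempt count, and __schedule_bug()
+	 * prints the "scheduling while atomic" splat.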
+	 */ +	if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) +		__schedule_bug(prev); +	rcu_sleep_check(); + +	profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + +	schedstat_inc(this_rq(), sched_count); +} + +static void put_prev_task(struct rq *rq, struct task_struct *prev) +{ +	if (prev->on_rq || rq->skip_clock_update < 0) +		update_rq_clock(rq); +	prev->sched_class->put_prev_task(rq, prev); +} + +/* + * Pick up the highest-prio task: + */ +static inline struct task_struct * +pick_next_task(struct rq *rq) +{ +	const struct sched_class *class; +	struct task_struct *p; + +	/* +	 * Optimization: we know that if all tasks are in +	 * the fair class we can call that function directly: +	 */ +	if (likely(rq->nr_running == rq->cfs.h_nr_running)) { +		p = fair_sched_class.pick_next_task(rq); +		if (likely(p)) +			return p; +	} + +	for_each_class(class) { +		p = class->pick_next_task(rq); +		if (p) +			return p; +	} + +	BUG(); /* the idle class will always have a runnable task */ +} + +/* + * __schedule() is the main scheduler function. + */ +static void __sched __schedule(void) +{ +	struct task_struct *prev, *next; +	unsigned long *switch_count; +	struct rq *rq; +	int cpu; + +need_resched: +	preempt_disable(); +	cpu = smp_processor_id(); +	rq = cpu_rq(cpu); +	rcu_note_context_switch(cpu); +	prev = rq->curr; + +	schedule_debug(prev); + +	if (sched_feat(HRTICK)) +		hrtick_clear(rq); + +	raw_spin_lock_irq(&rq->lock); + +	switch_count = &prev->nivcsw; +	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { +		if (unlikely(signal_pending_state(prev->state, prev))) { +			prev->state = TASK_RUNNING; +		} else { +			deactivate_task(rq, prev, DEQUEUE_SLEEP); +			prev->on_rq = 0; + +			/* +			 * If a worker went to sleep, notify and ask workqueue +			 * whether it wants to wake up a task to maintain +			 * concurrency. +			 */ +			if (prev->flags & PF_WQ_WORKER) { +				struct task_struct *to_wakeup; + +				to_wakeup = wq_worker_sleeping(prev, cpu); +				if (to_wakeup) +					try_to_wake_up_local(to_wakeup); +			} +		} +		switch_count = &prev->nvcsw; +	} + +	pre_schedule(rq, prev); + +	if (unlikely(!rq->nr_running)) +		idle_balance(cpu, rq); + +	put_prev_task(rq, prev); +	next = pick_next_task(rq); +	clear_tsk_need_resched(prev); +	rq->skip_clock_update = 0; + +	if (likely(prev != next)) { +		rq->nr_switches++; +		rq->curr = next; +		++*switch_count; + +		context_switch(rq, prev, next); /* unlocks the rq */ +		/* +		 * The context switch have flipped the stack from under us +		 * and restored the local variables which were saved when +		 * this task called schedule() in the past. prev == current +		 * is still correct, but it can be moved to another cpu/rq. +		 */ +		cpu = smp_processor_id(); +		rq = cpu_rq(cpu); +	} else +		raw_spin_unlock_irq(&rq->lock); + +	post_schedule(rq); + +	preempt_enable_no_resched(); +	if (need_resched()) +		goto need_resched; +} + +static inline void sched_submit_work(struct task_struct *tsk) +{ +	if (!tsk->state) +		return; +	/* +	 * If we are going to sleep and we have plugged IO queued, +	 * make sure to submit it to avoid deadlocks. 
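+	 * (Example: a task that queued bios under blk_start_plug() and then
+	 * blocks waiting for one of those very bios would wait forever unless
+	 * the plug is flushed before we deschedule.)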
+	 */ +	if (blk_needs_flush_plug(tsk)) +		blk_schedule_flush_plug(tsk); +} + +asmlinkage void __sched schedule(void) +{ +	struct task_struct *tsk = current; + +	sched_submit_work(tsk); +	__schedule(); +} +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ +	if (lock->owner != owner) +		return false; + +	/* +	 * Ensure we emit the owner->on_cpu, dereference _after_ checking +	 * lock->owner still matches owner, if that fails, owner might +	 * point to free()d memory, if it still matches, the rcu_read_lock() +	 * ensures the memory stays valid. +	 */ +	barrier(); + +	return owner->on_cpu; +} + +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ +	if (!sched_feat(OWNER_SPIN)) +		return 0; + +	rcu_read_lock(); +	while (owner_running(lock, owner)) { +		if (need_resched()) +			break; + +		arch_mutex_cpu_relax(); +	} +	rcu_read_unlock(); + +	/* +	 * We break out the loop above on need_resched() and when the +	 * owner changed, which is a sign for heavy contention. Return +	 * success only when lock->owner is NULL. +	 */ +	return lock->owner == NULL; +} +#endif + +#ifdef CONFIG_PREEMPT +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched notrace preempt_schedule(void) +{ +	struct thread_info *ti = current_thread_info(); + +	/* +	 * If there is a non-zero preempt_count or interrupts are disabled, +	 * we do not want to preempt the current task. Just return.. +	 */ +	if (likely(ti->preempt_count || irqs_disabled())) +		return; + +	do { +		add_preempt_count_notrace(PREEMPT_ACTIVE); +		__schedule(); +		sub_preempt_count_notrace(PREEMPT_ACTIVE); + +		/* +		 * Check again in case we missed a preemption opportunity +		 * between schedule and now. +		 */ +		barrier(); +	} while (need_resched()); +} +EXPORT_SYMBOL(preempt_schedule); + +/* + * this is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq. + */ +asmlinkage void __sched preempt_schedule_irq(void) +{ +	struct thread_info *ti = current_thread_info(); + +	/* Catch callers which need to be fixed */ +	BUG_ON(ti->preempt_count || !irqs_disabled()); + +	do { +		add_preempt_count(PREEMPT_ACTIVE); +		local_irq_enable(); +		__schedule(); +		local_irq_disable(); +		sub_preempt_count(PREEMPT_ACTIVE); + +		/* +		 * Check again in case we missed a preemption opportunity +		 * between schedule and now. +		 */ +		barrier(); +	} while (need_resched()); +} + +#endif /* CONFIG_PREEMPT */ + +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, +			  void *key) +{ +	return try_to_wake_up(curr->private, mode, wake_flags); +} +EXPORT_SYMBOL(default_wake_function); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. 
try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, int wake_flags, void *key) +{ +	wait_queue_t *curr, *next; + +	list_for_each_entry_safe(curr, next, &q->task_list, task_list) { +		unsigned flags = curr->flags; + +		if (curr->func(curr, mode, wake_flags, key) && +				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) +			break; +	} +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, void *key) +{ +	unsigned long flags; + +	spin_lock_irqsave(&q->lock, flags); +	__wake_up_common(q, mode, nr_exclusive, 0, key); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +{ +	__wake_up_common(q, mode, 1, 0, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked); + +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ +	__wake_up_common(q, mode, 1, 0, key); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key); + +/** + * __wake_up_sync_key - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, void *key) +{ +	unsigned long flags; +	int wake_flags = WF_SYNC; + +	if (unlikely(!q)) +		return; + +	if (unlikely(!nr_exclusive)) +		wake_flags = 0; + +	spin_lock_irqsave(&q->lock, flags); +	__wake_up_common(q, mode, nr_exclusive, wake_flags, key); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ +	__wake_up_sync_key(q, mode, nr_exclusive, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */ + +/** + * complete: - signals a single thread waiting on this completion + * @x:  holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
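+ *
+ * Illustrative usage (a sketch only, the names are examples):
+ *
+ *	DECLARE_COMPLETION_ONSTACK(done);
+ *	// ... hand &done to the signalling context ...
+ *	wait_for_completion(&done);	// waiter blocks until signalled
+ *
+ *	complete(&done);		// signaller, from the other context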
+ */ +void complete(struct completion *x) +{ +	unsigned long flags; + +	spin_lock_irqsave(&x->wait.lock, flags); +	x->done++; +	__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); +	spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x:  holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete_all(struct completion *x) +{ +	unsigned long flags; + +	spin_lock_irqsave(&x->wait.lock, flags); +	x->done += UINT_MAX/2; +	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); +	spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, long timeout, int state) +{ +	if (!x->done) { +		DECLARE_WAITQUEUE(wait, current); + +		__add_wait_queue_tail_exclusive(&x->wait, &wait); +		do { +			if (signal_pending_state(state, current)) { +				timeout = -ERESTARTSYS; +				break; +			} +			__set_current_state(state); +			spin_unlock_irq(&x->wait.lock); +			timeout = schedule_timeout(timeout); +			spin_lock_irq(&x->wait.lock); +		} while (!x->done && timeout); +		__remove_wait_queue(&x->wait, &wait); +		if (!x->done) +			return timeout; +	} +	x->done--; +	return timeout ?: 1; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ +	might_sleep(); + +	spin_lock_irq(&x->wait.lock); +	timeout = do_wait_for_common(x, timeout, state); +	spin_unlock_irq(&x->wait.lock); +	return timeout; +} + +/** + * wait_for_completion: - waits for completion of a task + * @x:  holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ +void __sched wait_for_completion(struct completion *x) +{ +	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x:  holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. 
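+ *
+ * A typical caller pattern (sketch):
+ *
+ *	ret = wait_for_completion_interruptible(&done);
+ *	if (ret)	// -ERESTARTSYS: a signal arrived first
+ *		return ret;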
+ */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ +	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); +	if (t == -ERESTARTSYS) +		return t; +	return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_interruptible_timeout(struct completion *x, +					  unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x:  holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ +	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); +	if (t == -ERESTARTSYS) +		return t; +	return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, +				     unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** + *	try_wait_for_completion - try to decrement a completion without blocking + *	@x:	completion structure + * + *	Returns: 0 if a decrement cannot be done without blocking + *		 1 if a decrement succeeded. + * + *	If a completion is being used as a counting completion, + *	attempt to decrement the counter without blocking. This + *	enables us to avoid waiting if the resource the completion + *	is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ +	unsigned long flags; +	int ret = 1; + +	spin_lock_irqsave(&x->wait.lock, flags); +	if (!x->done) +		ret = 0; +	else +		x->done--; +	spin_unlock_irqrestore(&x->wait.lock, flags); +	return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + *	completion_done - Test to see if a completion has any waiters + *	@x:	completion structure + * + *	Returns: 0 if there are waiters (wait_for_completion() in progress) + *		 1 if there are no waiters. 
+ * + */ +bool completion_done(struct completion *x) +{ +	unsigned long flags; +	int ret = 1; + +	spin_lock_irqsave(&x->wait.lock, flags); +	if (!x->done) +		ret = 0; +	spin_unlock_irqrestore(&x->wait.lock, flags); +	return ret; +} +EXPORT_SYMBOL(completion_done); + +static long __sched +sleep_on_common(wait_queue_head_t *q, int state, long timeout) +{ +	unsigned long flags; +	wait_queue_t wait; + +	init_waitqueue_entry(&wait, current); + +	__set_current_state(state); + +	spin_lock_irqsave(&q->lock, flags); +	__add_wait_queue(q, &wait); +	spin_unlock(&q->lock); +	timeout = schedule_timeout(timeout); +	spin_lock_irq(&q->lock); +	__remove_wait_queue(q, &wait); +	spin_unlock_irqrestore(&q->lock, flags); + +	return timeout; +} + +void __sched interruptible_sleep_on(wait_queue_head_t *q) +{ +	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} +EXPORT_SYMBOL(interruptible_sleep_on); + +long __sched +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ +	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); +} +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void __sched sleep_on(wait_queue_head_t *q) +{ +	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} +EXPORT_SYMBOL(sleep_on); + +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ +	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); +} +EXPORT_SYMBOL(sleep_on_timeout); + +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. + */ +void rt_mutex_setprio(struct task_struct *p, int prio) +{ +	int oldprio, on_rq, running; +	struct rq *rq; +	const struct sched_class *prev_class; + +	BUG_ON(prio < 0 || prio > MAX_PRIO); + +	rq = __task_rq_lock(p); + +	trace_sched_pi_setprio(p, prio); +	oldprio = p->prio; +	prev_class = p->sched_class; +	on_rq = p->on_rq; +	running = task_current(rq, p); +	if (on_rq) +		dequeue_task(rq, p, 0); +	if (running) +		p->sched_class->put_prev_task(rq, p); + +	if (rt_prio(prio)) +		p->sched_class = &rt_sched_class; +	else +		p->sched_class = &fair_sched_class; + +	p->prio = prio; + +	if (running) +		p->sched_class->set_curr_task(rq); +	if (on_rq) +		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); + +	check_class_changed(rq, p, prev_class, oldprio); +	__task_rq_unlock(rq); +} + +#endif + +void set_user_nice(struct task_struct *p, long nice) +{ +	int old_prio, delta, on_rq; +	unsigned long flags; +	struct rq *rq; + +	if (TASK_NICE(p) == nice || nice < -20 || nice > 19) +		return; +	/* +	 * We have to be careful, if called from sys_setpriority(), +	 * the task might be in the middle of scheduling on another CPU. 
+	 */ +	rq = task_rq_lock(p, &flags); +	/* +	 * The RT priorities are set via sched_setscheduler(), but we still +	 * allow the 'normal' nice value to be set - but as expected +	 * it wont have any effect on scheduling until the task is +	 * SCHED_FIFO/SCHED_RR: +	 */ +	if (task_has_rt_policy(p)) { +		p->static_prio = NICE_TO_PRIO(nice); +		goto out_unlock; +	} +	on_rq = p->on_rq; +	if (on_rq) +		dequeue_task(rq, p, 0); + +	p->static_prio = NICE_TO_PRIO(nice); +	set_load_weight(p); +	old_prio = p->prio; +	p->prio = effective_prio(p); +	delta = p->prio - old_prio; + +	if (on_rq) { +		enqueue_task(rq, p, 0); +		/* +		 * If the task increased its priority or is running and +		 * lowered its priority, then reschedule its CPU: +		 */ +		if (delta < 0 || (delta > 0 && task_running(rq, p))) +			resched_task(rq->curr); +	} +out_unlock: +	task_rq_unlock(rq, p, &flags); +} +EXPORT_SYMBOL(set_user_nice); + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ +	/* convert nice value [19,-20] to rlimit style value [1,40] */ +	int nice_rlim = 20 - nice; + +	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || +		capable(CAP_SYS_NICE)); +} + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +SYSCALL_DEFINE1(nice, int, increment) +{ +	long nice, retval; + +	/* +	 * Setpriority might change our priority at the same moment. +	 * We don't have to worry. Conceptually one call occurs first +	 * and we have a single winner. +	 */ +	if (increment < -40) +		increment = -40; +	if (increment > 40) +		increment = 40; + +	nice = TASK_NICE(current) + increment; +	if (nice < -20) +		nice = -20; +	if (nice > 19) +		nice = 19; + +	if (increment < 0 && !can_nice(current, nice)) +		return -EPERM; + +	retval = security_task_setnice(current, nice); +	if (retval) +		return retval; + +	set_user_nice(current, nice); +	return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. + */ +int task_prio(const struct task_struct *p) +{ +	return p->prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +int task_nice(const struct task_struct *p) +{ +	return TASK_NICE(p); +} +EXPORT_SYMBOL(task_nice); + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + */ +int idle_cpu(int cpu) +{ +	struct rq *rq = cpu_rq(cpu); + +	if (rq->curr != rq->idle) +		return 0; + +	if (rq->nr_running) +		return 0; + +#ifdef CONFIG_SMP +	if (!llist_empty(&rq->wake_list)) +		return 0; +#endif + +	return 1; +} + +/** + * idle_task - return the idle task for a given cpu. + * @cpu: the processor in question. + */ +struct task_struct *idle_task(int cpu) +{ +	return cpu_rq(cpu)->idle; +} + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + */ +static struct task_struct *find_process_by_pid(pid_t pid) +{ +	return pid ? find_task_by_vpid(pid) : current; +} + +/* Actually do priority change: must hold rq lock. 
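From userspace, the nice syscall implemented above is usually reached through the glibc nice(3) wrapper; a small illustrative program (not part of this commit), using the -1/errno convention since -1 is also a valid nice value:

#include <stdio.h>
#include <unistd.h>
#include <errno.h>

int main(void)
{
	int prio;

	errno = 0;
	prio = nice(5);			/* add 5 to the caller's nice value */
	if (prio == -1 && errno != 0) {
		perror("nice");
		return 1;
	}
	printf("new nice value: %d\n", prio);
	return 0;
}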
*/ +static void +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +{ +	p->policy = policy; +	p->rt_priority = prio; +	p->normal_prio = normal_prio(p); +	/* we are holding p->pi_lock already */ +	p->prio = rt_mutex_getprio(p); +	if (rt_prio(p->prio)) +		p->sched_class = &rt_sched_class; +	else +		p->sched_class = &fair_sched_class; +	set_load_weight(p); +} + +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ +	const struct cred *cred = current_cred(), *pcred; +	bool match; + +	rcu_read_lock(); +	pcred = __task_cred(p); +	if (cred->user->user_ns == pcred->user->user_ns) +		match = (cred->euid == pcred->euid || +			 cred->euid == pcred->uid); +	else +		match = false; +	rcu_read_unlock(); +	return match; +} + +static int __sched_setscheduler(struct task_struct *p, int policy, +				const struct sched_param *param, bool user) +{ +	int retval, oldprio, oldpolicy = -1, on_rq, running; +	unsigned long flags; +	const struct sched_class *prev_class; +	struct rq *rq; +	int reset_on_fork; + +	/* may grab non-irq protected spin_locks */ +	BUG_ON(in_interrupt()); +recheck: +	/* double check policy once rq lock held */ +	if (policy < 0) { +		reset_on_fork = p->sched_reset_on_fork; +		policy = oldpolicy = p->policy; +	} else { +		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); +		policy &= ~SCHED_RESET_ON_FORK; + +		if (policy != SCHED_FIFO && policy != SCHED_RR && +				policy != SCHED_NORMAL && policy != SCHED_BATCH && +				policy != SCHED_IDLE) +			return -EINVAL; +	} + +	/* +	 * Valid priorities for SCHED_FIFO and SCHED_RR are +	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, +	 * SCHED_BATCH and SCHED_IDLE is 0. +	 */ +	if (param->sched_priority < 0 || +	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || +	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) +		return -EINVAL; +	if (rt_policy(policy) != (param->sched_priority != 0)) +		return -EINVAL; + +	/* +	 * Allow unprivileged RT tasks to decrease priority: +	 */ +	if (user && !capable(CAP_SYS_NICE)) { +		if (rt_policy(policy)) { +			unsigned long rlim_rtprio = +					task_rlimit(p, RLIMIT_RTPRIO); + +			/* can't set/change the rt policy */ +			if (policy != p->policy && !rlim_rtprio) +				return -EPERM; + +			/* can't increase priority */ +			if (param->sched_priority > p->rt_priority && +			    param->sched_priority > rlim_rtprio) +				return -EPERM; +		} + +		/* +		 * Treat SCHED_IDLE as nice 20. Only allow a switch to +		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. +		 */ +		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { +			if (!can_nice(p, TASK_NICE(p))) +				return -EPERM; +		} + +		/* can't change other user's priorities */ +		if (!check_same_owner(p)) +			return -EPERM; + +		/* Normal users shall not reset the sched_reset_on_fork flag */ +		if (p->sched_reset_on_fork && !reset_on_fork) +			return -EPERM; +	} + +	if (user) { +		retval = security_task_setscheduler(p); +		if (retval) +			return retval; +	} + +	/* +	 * make sure no PI-waiters arrive (or leave) while we are +	 * changing the priority of the task: +	 * +	 * To be able to change p->policy safely, the appropriate +	 * runqueue lock must be held. 
+	 */ +	rq = task_rq_lock(p, &flags); + +	/* +	 * Changing the policy of the stop threads its a very bad idea +	 */ +	if (p == rq->stop) { +		task_rq_unlock(rq, p, &flags); +		return -EINVAL; +	} + +	/* +	 * If not changing anything there's no need to proceed further: +	 */ +	if (unlikely(policy == p->policy && (!rt_policy(policy) || +			param->sched_priority == p->rt_priority))) { + +		__task_rq_unlock(rq); +		raw_spin_unlock_irqrestore(&p->pi_lock, flags); +		return 0; +	} + +#ifdef CONFIG_RT_GROUP_SCHED +	if (user) { +		/* +		 * Do not allow realtime tasks into groups that have no runtime +		 * assigned. +		 */ +		if (rt_bandwidth_enabled() && rt_policy(policy) && +				task_group(p)->rt_bandwidth.rt_runtime == 0 && +				!task_group_is_autogroup(task_group(p))) { +			task_rq_unlock(rq, p, &flags); +			return -EPERM; +		} +	} +#endif + +	/* recheck policy now with rq lock held */ +	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { +		policy = oldpolicy = -1; +		task_rq_unlock(rq, p, &flags); +		goto recheck; +	} +	on_rq = p->on_rq; +	running = task_current(rq, p); +	if (on_rq) +		deactivate_task(rq, p, 0); +	if (running) +		p->sched_class->put_prev_task(rq, p); + +	p->sched_reset_on_fork = reset_on_fork; + +	oldprio = p->prio; +	prev_class = p->sched_class; +	__setscheduler(rq, p, policy, param->sched_priority); + +	if (running) +		p->sched_class->set_curr_task(rq); +	if (on_rq) +		activate_task(rq, p, 0); + +	check_class_changed(rq, p, prev_class, oldprio); +	task_rq_unlock(rq, p, &flags); + +	rt_mutex_adjust_pi(p); + +	return 0; +} + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * NOTE that the task may be already dead. + */ +int sched_setscheduler(struct task_struct *p, int policy, +		       const struct sched_param *param) +{ +	return __sched_setscheduler(p, policy, param, true); +} +EXPORT_SYMBOL_GPL(sched_setscheduler); + +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission.  For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. + */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, +			       const struct sched_param *param) +{ +	return __sched_setscheduler(p, policy, param, false); +} + +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ +	struct sched_param lparam; +	struct task_struct *p; +	int retval; + +	if (!param || pid < 0) +		return -EINVAL; +	if (copy_from_user(&lparam, param, sizeof(struct sched_param))) +		return -EFAULT; + +	rcu_read_lock(); +	retval = -ESRCH; +	p = find_process_by_pid(pid); +	if (p != NULL) +		retval = sched_setscheduler(p, policy, &lparam); +	rcu_read_unlock(); + +	return retval; +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. 
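A userspace view of the syscall documented above, via the glibc sched_setscheduler(3) wrapper; an illustrative sketch only, the priority value is arbitrary and the call needs CAP_SYS_NICE or a sufficient RLIMIT_RTPRIO, exactly as enforced in __sched_setscheduler():

#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {	/* 0 == calling thread */
		perror("sched_setscheduler");
		return 1;
	}
	printf("now SCHED_FIFO, rt priority %d\n", sp.sched_priority);
	return 0;
}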
+ */ +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, +		struct sched_param __user *, param) +{ +	/* negative values for policy are not valid */ +	if (policy < 0) +		return -EINVAL; + +	return do_sched_setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + */ +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) +{ +	return do_sched_setscheduler(pid, -1, param); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + */ +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) +{ +	struct task_struct *p; +	int retval; + +	if (pid < 0) +		return -EINVAL; + +	retval = -ESRCH; +	rcu_read_lock(); +	p = find_process_by_pid(pid); +	if (p) { +		retval = security_task_getscheduler(p); +		if (!retval) +			retval = p->policy +				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); +	} +	rcu_read_unlock(); +	return retval; +} + +/** + * sys_sched_getparam - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + */ +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) +{ +	struct sched_param lp; +	struct task_struct *p; +	int retval; + +	if (!param || pid < 0) +		return -EINVAL; + +	rcu_read_lock(); +	p = find_process_by_pid(pid); +	retval = -ESRCH; +	if (!p) +		goto out_unlock; + +	retval = security_task_getscheduler(p); +	if (retval) +		goto out_unlock; + +	lp.sched_priority = p->rt_priority; +	rcu_read_unlock(); + +	/* +	 * This one might sleep, we cannot do it with a spinlock held ... +	 */ +	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; + +	return retval; + +out_unlock: +	rcu_read_unlock(); +	return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ +	cpumask_var_t cpus_allowed, new_mask; +	struct task_struct *p; +	int retval; + +	get_online_cpus(); +	rcu_read_lock(); + +	p = find_process_by_pid(pid); +	if (!p) { +		rcu_read_unlock(); +		put_online_cpus(); +		return -ESRCH; +	} + +	/* Prevent p going away */ +	get_task_struct(p); +	rcu_read_unlock(); + +	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { +		retval = -ENOMEM; +		goto out_put_task; +	} +	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { +		retval = -ENOMEM; +		goto out_free_cpus_allowed; +	} +	retval = -EPERM; +	if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) +		goto out_unlock; + +	retval = security_task_setscheduler(p); +	if (retval) +		goto out_unlock; + +	cpuset_cpus_allowed(p, cpus_allowed); +	cpumask_and(new_mask, in_mask, cpus_allowed); +again: +	retval = set_cpus_allowed_ptr(p, new_mask); + +	if (!retval) { +		cpuset_cpus_allowed(p, cpus_allowed); +		if (!cpumask_subset(new_mask, cpus_allowed)) { +			/* +			 * We must have raced with a concurrent cpuset +			 * update. 
Just reset the cpus_allowed to the +			 * cpuset's cpus_allowed +			 */ +			cpumask_copy(new_mask, cpus_allowed); +			goto again; +		} +	} +out_unlock: +	free_cpumask_var(new_mask); +out_free_cpus_allowed: +	free_cpumask_var(cpus_allowed); +out_put_task: +	put_task_struct(p); +	put_online_cpus(); +	return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, +			     struct cpumask *new_mask) +{ +	if (len < cpumask_size()) +		cpumask_clear(new_mask); +	else if (len > cpumask_size()) +		len = cpumask_size(); + +	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, +		unsigned long __user *, user_mask_ptr) +{ +	cpumask_var_t new_mask; +	int retval; + +	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) +		return -ENOMEM; + +	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); +	if (retval == 0) +		retval = sched_setaffinity(pid, new_mask); +	free_cpumask_var(new_mask); +	return retval; +} + +long sched_getaffinity(pid_t pid, struct cpumask *mask) +{ +	struct task_struct *p; +	unsigned long flags; +	int retval; + +	get_online_cpus(); +	rcu_read_lock(); + +	retval = -ESRCH; +	p = find_process_by_pid(pid); +	if (!p) +		goto out_unlock; + +	retval = security_task_getscheduler(p); +	if (retval) +		goto out_unlock; + +	raw_spin_lock_irqsave(&p->pi_lock, flags); +	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); +	raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +out_unlock: +	rcu_read_unlock(); +	put_online_cpus(); + +	return retval; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, +		unsigned long __user *, user_mask_ptr) +{ +	int ret; +	cpumask_var_t mask; + +	if ((len * BITS_PER_BYTE) < nr_cpu_ids) +		return -EINVAL; +	if (len & (sizeof(unsigned long)-1)) +		return -EINVAL; + +	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) +		return -ENOMEM; + +	ret = sched_getaffinity(pid, mask); +	if (ret == 0) { +		size_t retlen = min_t(size_t, len, cpumask_size()); + +		if (copy_to_user(user_mask_ptr, mask, retlen)) +			ret = -EFAULT; +		else +			ret = retlen; +	} +	free_cpumask_var(mask); + +	return ret; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. 
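The affinity and yield syscalls above have direct glibc wrappers; a small illustrative program (not part of this commit) that pins the caller to CPU 0 and then yields, where the CPU number is an arbitrary choice:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);			/* run only on CPU 0 */

	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return 1;
	}

	sched_yield();				/* let other runnable tasks on CPU 0 go first */
	return 0;
}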
+ */ +SYSCALL_DEFINE0(sched_yield) +{ +	struct rq *rq = this_rq_lock(); + +	schedstat_inc(rq, yld_count); +	current->sched_class->yield_task(rq); + +	/* +	 * Since we are going to call schedule() anyway, there's +	 * no need to preempt or enable interrupts: +	 */ +	__release(rq->lock); +	spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +	do_raw_spin_unlock(&rq->lock); +	preempt_enable_no_resched(); + +	schedule(); + +	return 0; +} + +static inline int should_resched(void) +{ +	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); +} + +static void __cond_resched(void) +{ +	add_preempt_count(PREEMPT_ACTIVE); +	__schedule(); +	sub_preempt_count(PREEMPT_ACTIVE); +} + +int __sched _cond_resched(void) +{ +	if (should_resched()) { +		__cond_resched(); +		return 1; +	} +	return 0; +} +EXPORT_SYMBOL(_cond_resched); + +/* + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). + */ +int __cond_resched_lock(spinlock_t *lock) +{ +	int resched = should_resched(); +	int ret = 0; + +	lockdep_assert_held(lock); + +	if (spin_needbreak(lock) || resched) { +		spin_unlock(lock); +		if (resched) +			__cond_resched(); +		else +			cpu_relax(); +		ret = 1; +		spin_lock(lock); +	} +	return ret; +} +EXPORT_SYMBOL(__cond_resched_lock); + +int __sched __cond_resched_softirq(void) +{ +	BUG_ON(!in_softirq()); + +	if (should_resched()) { +		local_bh_enable(); +		__cond_resched(); +		local_bh_disable(); +		return 1; +	} +	return 0; +} +EXPORT_SYMBOL(__cond_resched_softirq); + +/** + * yield - yield the current processor to other threads. + * + * This is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void __sched yield(void) +{ +	set_current_state(TASK_RUNNING); +	sys_sched_yield(); +} +EXPORT_SYMBOL(yield); + +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Returns true if we indeed boosted the target task. + */ +bool __sched yield_to(struct task_struct *p, bool preempt) +{ +	struct task_struct *curr = current; +	struct rq *rq, *p_rq; +	unsigned long flags; +	bool yielded = 0; + +	local_irq_save(flags); +	rq = this_rq(); + +again: +	p_rq = task_rq(p); +	double_rq_lock(rq, p_rq); +	while (task_rq(p) != p_rq) { +		double_rq_unlock(rq, p_rq); +		goto again; +	} + +	if (!curr->sched_class->yield_to_task) +		goto out; + +	if (curr->sched_class != p->sched_class) +		goto out; + +	if (task_running(p_rq, p) || p->state) +		goto out; + +	yielded = curr->sched_class->yield_to_task(rq, p, preempt); +	if (yielded) { +		schedstat_inc(rq, yld_count); +		/* +		 * Make p's CPU reschedule; pick_next_entity takes care of +		 * fairness. +		 */ +		if (preempt && rq != p_rq) +			resched_task(p_rq->curr); +	} else { +		/* +		 * We might have set it in task_yield_fair(), but are +		 * not going to schedule(), so don't want to skip +		 * the next update. 
+		 */ +		rq->skip_clock_update = 0; +	} + +out: +	double_rq_unlock(rq, p_rq); +	local_irq_restore(flags); + +	if (yielded) +		schedule(); + +	return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + */ +void __sched io_schedule(void) +{ +	struct rq *rq = raw_rq(); + +	delayacct_blkio_start(); +	atomic_inc(&rq->nr_iowait); +	blk_flush_plug(current); +	current->in_iowait = 1; +	schedule(); +	current->in_iowait = 0; +	atomic_dec(&rq->nr_iowait); +	delayacct_blkio_end(); +} +EXPORT_SYMBOL(io_schedule); + +long __sched io_schedule_timeout(long timeout) +{ +	struct rq *rq = raw_rq(); +	long ret; + +	delayacct_blkio_start(); +	atomic_inc(&rq->nr_iowait); +	blk_flush_plug(current); +	current->in_iowait = 1; +	ret = schedule_timeout(timeout); +	current->in_iowait = 0; +	atomic_dec(&rq->nr_iowait); +	delayacct_blkio_end(); +	return ret; +} + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class. + */ +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) +{ +	int ret = -EINVAL; + +	switch (policy) { +	case SCHED_FIFO: +	case SCHED_RR: +		ret = MAX_USER_RT_PRIO-1; +		break; +	case SCHED_NORMAL: +	case SCHED_BATCH: +	case SCHED_IDLE: +		ret = 0; +		break; +	} +	return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. + */ +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) +{ +	int ret = -EINVAL; + +	switch (policy) { +	case SCHED_FIFO: +	case SCHED_RR: +		ret = 1; +		break; +	case SCHED_NORMAL: +	case SCHED_BATCH: +	case SCHED_IDLE: +		ret = 0; +	} +	return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, +		struct timespec __user *, interval) +{ +	struct task_struct *p; +	unsigned int time_slice; +	unsigned long flags; +	struct rq *rq; +	int retval; +	struct timespec t; + +	if (pid < 0) +		return -EINVAL; + +	retval = -ESRCH; +	rcu_read_lock(); +	p = find_process_by_pid(pid); +	if (!p) +		goto out_unlock; + +	retval = security_task_getscheduler(p); +	if (retval) +		goto out_unlock; + +	rq = task_rq_lock(p, &flags); +	time_slice = p->sched_class->get_rr_interval(rq, p); +	task_rq_unlock(rq, p, &flags); + +	rcu_read_unlock(); +	jiffies_to_timespec(time_slice, &t); +	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; +	return retval; + +out_unlock: +	rcu_read_unlock(); +	return retval; +} + +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + +void sched_show_task(struct task_struct *p) +{ +	unsigned long free = 0; +	unsigned state; + +	state = p->state ? __ffs(p->state) + 1 : 0; +	printk(KERN_INFO "%-15.15s %c", p->comm, +		state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); +#if BITS_PER_LONG == 32 +	if (state == TASK_RUNNING) +		printk(KERN_CONT " running  "); +	else +		printk(KERN_CONT " %08lx ", thread_saved_pc(p)); +#else +	if (state == TASK_RUNNING) +		printk(KERN_CONT "  running task    "); +	else +		printk(KERN_CONT " %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE +	free = stack_not_used(p); +#endif +	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, +		task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), +		(unsigned long)task_thread_info(p)->flags); + +	show_stack(p, NULL); +} + +void show_state_filter(unsigned long state_filter) +{ +	struct task_struct *g, *p; + +#if BITS_PER_LONG == 32 +	printk(KERN_INFO +		"  task                PC stack   pid father\n"); +#else +	printk(KERN_INFO +		"  task                        PC stack   pid father\n"); +#endif +	rcu_read_lock(); +	do_each_thread(g, p) { +		/* +		 * reset the NMI-timeout, listing all files on a slow +		 * console might take a lot of time: +		 */ +		touch_nmi_watchdog(); +		if (!state_filter || (p->state & state_filter)) +			sched_show_task(p); +	} while_each_thread(g, p); + +	touch_all_softlockup_watchdogs(); + +#ifdef CONFIG_SCHED_DEBUG +	sysrq_sched_debug_show(); +#endif +	rcu_read_unlock(); +	/* +	 * Only show locks if all tasks are dumped: +	 */ +	if (!state_filter) +		debug_show_all_locks(); +} + +void __cpuinit init_idle_bootup_task(struct task_struct *idle) +{ +	idle->sched_class = &idle_sched_class; +} + +/** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question + * @cpu: cpu the idle task belongs to + * + * NOTE: this function does not set the idle thread's NEED_RESCHED + * flag, to make booting more robust. + */ +void __cpuinit init_idle(struct task_struct *idle, int cpu) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long flags; + +	raw_spin_lock_irqsave(&rq->lock, flags); + +	__sched_fork(idle); +	idle->state = TASK_RUNNING; +	idle->se.exec_start = sched_clock(); + +	do_set_cpus_allowed(idle, cpumask_of(cpu)); +	/* +	 * We're having a chicken and egg problem, even though we are +	 * holding rq->lock, the cpu isn't yet set to this cpu so the +	 * lockdep check in task_group() will fail. +	 * +	 * Similar case to sched_fork(). / Alternatively we could +	 * use task_rq_lock() here and obtain the other rq->lock. +	 * +	 * Silence PROVE_RCU +	 */ +	rcu_read_lock(); +	__set_task_cpu(idle, cpu); +	rcu_read_unlock(); + +	rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) +	idle->on_cpu = 1; +#endif +	raw_spin_unlock_irqrestore(&rq->lock, flags); + +	/* Set the preempt count _outside_ the spinlocks! */ +	task_thread_info(idle)->preempt_count = 0; + +	/* +	 * The idle tasks have their own, simple scheduling class: +	 */ +	idle->sched_class = &idle_sched_class; +	ftrace_graph_init_idle_task(idle, cpu); +#if defined(CONFIG_SMP) +	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); +#endif +} + +#ifdef CONFIG_SMP +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ +	if (p->sched_class && p->sched_class->set_cpus_allowed) +		p->sched_class->set_cpus_allowed(p, new_mask); + +	cpumask_copy(&p->cpus_allowed, new_mask); +	p->rt.nr_cpus_allowed = cpumask_weight(new_mask); +} + +/* + * This is how migration works: + * + * 1) we invoke migration_cpu_stop() on the target CPU using + *    stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + *    off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. 
+ * 4) if it's in the wrong runqueue then the migration thread removes + *    it and puts it into the right queue. + * 5) stopper completes and stop_one_cpu() returns and the migration + *    is done. + */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ +	unsigned long flags; +	struct rq *rq; +	unsigned int dest_cpu; +	int ret = 0; + +	rq = task_rq_lock(p, &flags); + +	if (cpumask_equal(&p->cpus_allowed, new_mask)) +		goto out; + +	if (!cpumask_intersects(new_mask, cpu_active_mask)) { +		ret = -EINVAL; +		goto out; +	} + +	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { +		ret = -EINVAL; +		goto out; +	} + +	do_set_cpus_allowed(p, new_mask); + +	/* Can the task run on the task's current CPU? If so, we're done */ +	if (cpumask_test_cpu(task_cpu(p), new_mask)) +		goto out; + +	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); +	if (p->on_rq) { +		struct migration_arg arg = { p, dest_cpu }; +		/* Need help from migration thread: drop lock and wait. */ +		task_rq_unlock(rq, p, &flags); +		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); +		tlb_migrate_finish(p->mm); +		return 0; +	} +out: +	task_rq_unlock(rq, p, &flags); + +	return ret; +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + * + * Returns non-zero if task was successfully migrated. + */ +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +{ +	struct rq *rq_dest, *rq_src; +	int ret = 0; + +	if (unlikely(!cpu_active(dest_cpu))) +		return ret; + +	rq_src = cpu_rq(src_cpu); +	rq_dest = cpu_rq(dest_cpu); + +	raw_spin_lock(&p->pi_lock); +	double_rq_lock(rq_src, rq_dest); +	/* Already moved. */ +	if (task_cpu(p) != src_cpu) +		goto done; +	/* Affinity changed (again). */ +	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) +		goto fail; + +	/* +	 * If we're not on a rq, the next wake-up will ensure we're +	 * placed properly. +	 */ +	if (p->on_rq) { +		deactivate_task(rq_src, p, 0); +		set_task_cpu(p, dest_cpu); +		activate_task(rq_dest, p, 0); +		check_preempt_curr(rq_dest, p, 0); +	} +done: +	ret = 1; +fail: +	double_rq_unlock(rq_src, rq_dest); +	raw_spin_unlock(&p->pi_lock); +	return ret; +} + +/* + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. + */ +static int migration_cpu_stop(void *data) +{ +	struct migration_arg *arg = data; + +	/* +	 * The original target cpu might have gone down and we might +	 * be on another cpu but it doesn't matter. +	 */ +	local_irq_disable(); +	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); +	local_irq_enable(); +	return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. 
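The hotplug path above (migration_cpu_stop(), idle_task_exit(), migrate_tasks()) is normally exercised by offlining a CPU through sysfs; a rough illustration, assuming a hot-pluggable cpu1, CONFIG_HOTPLUG_CPU and root privileges:

#include <stdio.h>

int main(void)
{
	/* Writing "0" here initiates CPU_DOWN_PREPARE/CPU_DYING and, with it,
	 * migrate_tasks() on the dying CPU.  "cpu1" is an arbitrary example. */
	FILE *f = fopen("/sys/devices/system/cpu/cpu1/online", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fputs("0\n", f) == EOF)
		perror("fputs");
	fclose(f);
	return 0;
}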
+ */ +void idle_task_exit(void) +{ +	struct mm_struct *mm = current->active_mm; + +	BUG_ON(cpu_online(smp_processor_id())); + +	if (mm != &init_mm) +		switch_mm(mm, &init_mm, current); +	mmdrop(mm); +} + +/* + * While a dead CPU has no uninterruptible tasks queued at this point, + * it might still have a nonzero ->nr_uninterruptible counter, because + * for performance reasons the counter is not stricly tracking tasks to + * their home CPUs. So we just add the counter to another CPU's counter, + * to keep the global sum constant after CPU-down: + */ +static void migrate_nr_uninterruptible(struct rq *rq_src) +{ +	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); + +	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; +	rq_src->nr_uninterruptible = 0; +} + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ +	atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +	rq->calc_load_active = 0; +} + +/* + * Migrate all tasks from the rq, sleeping tasks will be migrated by + * try_to_wake_up()->select_task_rq(). + * + * Called with rq->lock held even though we'er in stop_machine() and + * there's no concurrency possible, we hold the required locks anyway + * because of lock validation efforts. + */ +static void migrate_tasks(unsigned int dead_cpu) +{ +	struct rq *rq = cpu_rq(dead_cpu); +	struct task_struct *next, *stop = rq->stop; +	int dest_cpu; + +	/* +	 * Fudge the rq selection such that the below task selection loop +	 * doesn't get stuck on the currently eligible stop task. +	 * +	 * We're currently inside stop_machine() and the rq is either stuck +	 * in the stop_machine_cpu_stop() loop, or we're executing this code, +	 * either way we should never end up calling schedule() until we're +	 * done here. +	 */ +	rq->stop = NULL; + +	/* Ensure any throttled groups are reachable by pick_next_task */ +	unthrottle_offline_cfs_rqs(rq); + +	for ( ; ; ) { +		/* +		 * There's this thread running, bail when that's the only +		 * remaining thread. +		 */ +		if (rq->nr_running == 1) +			break; + +		next = pick_next_task(rq); +		BUG_ON(!next); +		next->sched_class->put_prev_task(rq, next); + +		/* Find suitable destination for @next, with force if needed. */ +		dest_cpu = select_fallback_rq(dead_cpu, next); +		raw_spin_unlock(&rq->lock); + +		__migrate_task(next, dead_cpu, dest_cpu); + +		raw_spin_lock(&rq->lock); +	} + +	rq->stop = stop; +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { +	{ +		.procname	= "sched_domain", +		.mode		= 0555, +	}, +	{} +}; + +static struct ctl_table sd_ctl_root[] = { +	{ +		.procname	= "kernel", +		.mode		= 0555, +		.child		= sd_ctl_dir, +	}, +	{} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ +	struct ctl_table *entry = +		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + +	return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ +	struct ctl_table *entry; + +	/* +	 * In the intermediate directories, both the child directory and +	 * procname are dynamically allocated and could fail but the mode +	 * will always be set. In the lowest directory the names are +	 * static strings and all have proc handlers. 
+	 */ +	for (entry = *tablep; entry->mode; entry++) { +		if (entry->child) +			sd_free_ctl_entry(&entry->child); +		if (entry->proc_handler == NULL) +			kfree(entry->procname); +	} + +	kfree(*tablep); +	*tablep = NULL; +} + +static void +set_table_entry(struct ctl_table *entry, +		const char *procname, void *data, int maxlen, +		umode_t mode, proc_handler *proc_handler) +{ +	entry->procname = procname; +	entry->data = data; +	entry->maxlen = maxlen; +	entry->mode = mode; +	entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ +	struct ctl_table *table = sd_alloc_ctl_entry(13); + +	if (table == NULL) +		return NULL; + +	set_table_entry(&table[0], "min_interval", &sd->min_interval, +		sizeof(long), 0644, proc_doulongvec_minmax); +	set_table_entry(&table[1], "max_interval", &sd->max_interval, +		sizeof(long), 0644, proc_doulongvec_minmax); +	set_table_entry(&table[2], "busy_idx", &sd->busy_idx, +		sizeof(int), 0644, proc_dointvec_minmax); +	set_table_entry(&table[3], "idle_idx", &sd->idle_idx, +		sizeof(int), 0644, proc_dointvec_minmax); +	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, +		sizeof(int), 0644, proc_dointvec_minmax); +	set_table_entry(&table[5], "wake_idx", &sd->wake_idx, +		sizeof(int), 0644, proc_dointvec_minmax); +	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, +		sizeof(int), 0644, proc_dointvec_minmax); +	set_table_entry(&table[7], "busy_factor", &sd->busy_factor, +		sizeof(int), 0644, proc_dointvec_minmax); +	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, +		sizeof(int), 0644, proc_dointvec_minmax); +	set_table_entry(&table[9], "cache_nice_tries", +		&sd->cache_nice_tries, +		sizeof(int), 0644, proc_dointvec_minmax); +	set_table_entry(&table[10], "flags", &sd->flags, +		sizeof(int), 0644, proc_dointvec_minmax); +	set_table_entry(&table[11], "name", sd->name, +		CORENAME_MAX_SIZE, 0444, proc_dostring); +	/* &table[12] is terminator */ + +	return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ +	struct ctl_table *entry, *table; +	struct sched_domain *sd; +	int domain_num = 0, i; +	char buf[32]; + +	for_each_domain(cpu, sd) +		domain_num++; +	entry = table = sd_alloc_ctl_entry(domain_num + 1); +	if (table == NULL) +		return NULL; + +	i = 0; +	for_each_domain(cpu, sd) { +		snprintf(buf, 32, "domain%d", i); +		entry->procname = kstrdup(buf, GFP_KERNEL); +		entry->mode = 0555; +		entry->child = sd_alloc_ctl_domain_table(sd); +		entry++; +		i++; +	} +	return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void register_sched_domain_sysctl(void) +{ +	int i, cpu_num = num_possible_cpus(); +	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); +	char buf[32]; + +	WARN_ON(sd_ctl_dir[0].child); +	sd_ctl_dir[0].child = entry; + +	if (entry == NULL) +		return; + +	for_each_possible_cpu(i) { +		snprintf(buf, 32, "cpu%d", i); +		entry->procname = kstrdup(buf, GFP_KERNEL); +		entry->mode = 0555; +		entry->child = sd_alloc_ctl_cpu_table(i); +		entry++; +	} + +	WARN_ON(sd_sysctl_header); +	sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +static void unregister_sched_domain_sysctl(void) +{ +	if (sd_sysctl_header) +		unregister_sysctl_table(sd_sysctl_header); +	sd_sysctl_header = NULL; +	if (sd_ctl_dir[0].child) +		sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#else +static void register_sched_domain_sysctl(void) +{ +} +static void unregister_sched_domain_sysctl(void) +{ +} 
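The ctl_table code above exposes per-domain tunables under /proc/sys/kernel/sched_domain/cpuN/domainN/ when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are set; a small illustrative reader, assuming at least one sched domain exists so that the cpu0/domain0 path is present:

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/proc/sys/kernel/sched_domain/cpu0/domain0/name", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("cpu0 domain0 name: %s", buf);
	fclose(f);
	return 0;
}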
+#endif + +static void set_rq_online(struct rq *rq) +{ +	if (!rq->online) { +		const struct sched_class *class; + +		cpumask_set_cpu(rq->cpu, rq->rd->online); +		rq->online = 1; + +		for_each_class(class) { +			if (class->rq_online) +				class->rq_online(rq); +		} +	} +} + +static void set_rq_offline(struct rq *rq) +{ +	if (rq->online) { +		const struct sched_class *class; + +		for_each_class(class) { +			if (class->rq_offline) +				class->rq_offline(rq); +		} + +		cpumask_clear_cpu(rq->cpu, rq->rd->online); +		rq->online = 0; +	} +} + +/* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int __cpuinit +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ +	int cpu = (long)hcpu; +	unsigned long flags; +	struct rq *rq = cpu_rq(cpu); + +	switch (action & ~CPU_TASKS_FROZEN) { + +	case CPU_UP_PREPARE: +		rq->calc_load_update = calc_load_update; +		break; + +	case CPU_ONLINE: +		/* Update our root-domain */ +		raw_spin_lock_irqsave(&rq->lock, flags); +		if (rq->rd) { +			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + +			set_rq_online(rq); +		} +		raw_spin_unlock_irqrestore(&rq->lock, flags); +		break; + +#ifdef CONFIG_HOTPLUG_CPU +	case CPU_DYING: +		sched_ttwu_pending(); +		/* Update our root-domain */ +		raw_spin_lock_irqsave(&rq->lock, flags); +		if (rq->rd) { +			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); +			set_rq_offline(rq); +		} +		migrate_tasks(cpu); +		BUG_ON(rq->nr_running != 1); /* the migration thread */ +		raw_spin_unlock_irqrestore(&rq->lock, flags); + +		migrate_nr_uninterruptible(rq); +		calc_global_load_remove(rq); +		break; +#endif +	} + +	update_max_interval(); + +	return NOTIFY_OK; +} + +/* + * Register at high priority so that task migration (migrate_all_tasks) + * happens before everything else.  This has to be lower priority than + * the notifier in the perf_event subsystem, though. 
+ */ +static struct notifier_block __cpuinitdata migration_notifier = { +	.notifier_call = migration_call, +	.priority = CPU_PRI_MIGRATION, +}; + +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, +				      unsigned long action, void *hcpu) +{ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_ONLINE: +	case CPU_DOWN_FAILED: +		set_cpu_active((long)hcpu, true); +		return NOTIFY_OK; +	default: +		return NOTIFY_DONE; +	} +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, +					unsigned long action, void *hcpu) +{ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_DOWN_PREPARE: +		set_cpu_active((long)hcpu, false); +		return NOTIFY_OK; +	default: +		return NOTIFY_DONE; +	} +} + +static int __init migration_init(void) +{ +	void *cpu = (void *)(long)smp_processor_id(); +	int err; + +	/* Initialize migration for the boot CPU */ +	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); +	BUG_ON(err == NOTIFY_BAD); +	migration_call(&migration_notifier, CPU_ONLINE, cpu); +	register_cpu_notifier(&migration_notifier); + +	/* Register cpu active notifiers */ +	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); +	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + +	return 0; +} +early_initcall(migration_init); +#endif + +#ifdef CONFIG_SMP + +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + +#ifdef CONFIG_SCHED_DEBUG + +static __read_mostly int sched_domain_debug_enabled; + +static int __init sched_domain_debug_setup(char *str) +{ +	sched_domain_debug_enabled = 1; + +	return 0; +} +early_param("sched_debug", sched_domain_debug_setup); + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, +				  struct cpumask *groupmask) +{ +	struct sched_group *group = sd->groups; +	char str[256]; + +	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); +	cpumask_clear(groupmask); + +	printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + +	if (!(sd->flags & SD_LOAD_BALANCE)) { +		printk("does not load-balance\n"); +		if (sd->parent) +			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" +					" has parent"); +		return -1; +	} + +	printk(KERN_CONT "span %s level %s\n", str, sd->name); + +	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { +		printk(KERN_ERR "ERROR: domain->span does not contain " +				"CPU%d\n", cpu); +	} +	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { +		printk(KERN_ERR "ERROR: domain->groups does not contain" +				" CPU%d\n", cpu); +	} + +	printk(KERN_DEBUG "%*s groups:", level + 1, ""); +	do { +		if (!group) { +			printk("\n"); +			printk(KERN_ERR "ERROR: group is NULL\n"); +			break; +		} + +		if (!group->sgp->power) { +			printk(KERN_CONT "\n"); +			printk(KERN_ERR "ERROR: domain->cpu_power not " +					"set\n"); +			break; +		} + +		if (!cpumask_weight(sched_group_cpus(group))) { +			printk(KERN_CONT "\n"); +			printk(KERN_ERR "ERROR: empty group\n"); +			break; +		} + +		if (cpumask_intersects(groupmask, sched_group_cpus(group))) { +			printk(KERN_CONT "\n"); +			printk(KERN_ERR "ERROR: repeated CPUs\n"); +			break; +		} + +		cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + +		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); + +		printk(KERN_CONT " %s", str); +		if (group->sgp->power != SCHED_POWER_SCALE) { +			printk(KERN_CONT " (cpu_power = %d)", +				group->sgp->power); +		} + +		group = group->next; +	} while (group != sd->groups); +	printk(KERN_CONT "\n"); + +	if (!cpumask_equal(sched_domain_span(sd), groupmask)) +		printk(KERN_ERR "ERROR: 
groups don't span domain->span\n"); + +	if (sd->parent && +	    !cpumask_subset(groupmask, sched_domain_span(sd->parent))) +		printk(KERN_ERR "ERROR: parent span is not a superset " +			"of domain->span\n"); +	return 0; +} + +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ +	int level = 0; + +	if (!sched_domain_debug_enabled) +		return; + +	if (!sd) { +		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); +		return; +	} + +	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + +	for (;;) { +		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) +			break; +		level++; +		sd = sd->parent; +		if (!sd) +			break; +	} +} +#else /* !CONFIG_SCHED_DEBUG */ +# define sched_domain_debug(sd, cpu) do { } while (0) +#endif /* CONFIG_SCHED_DEBUG */ + +static int sd_degenerate(struct sched_domain *sd) +{ +	if (cpumask_weight(sched_domain_span(sd)) == 1) +		return 1; + +	/* Following flags need at least 2 groups */ +	if (sd->flags & (SD_LOAD_BALANCE | +			 SD_BALANCE_NEWIDLE | +			 SD_BALANCE_FORK | +			 SD_BALANCE_EXEC | +			 SD_SHARE_CPUPOWER | +			 SD_SHARE_PKG_RESOURCES)) { +		if (sd->groups != sd->groups->next) +			return 0; +	} + +	/* Following flags don't use groups */ +	if (sd->flags & (SD_WAKE_AFFINE)) +		return 0; + +	return 1; +} + +static int +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) +{ +	unsigned long cflags = sd->flags, pflags = parent->flags; + +	if (sd_degenerate(parent)) +		return 1; + +	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) +		return 0; + +	/* Flags needing groups don't count if only 1 group in parent */ +	if (parent->groups == parent->groups->next) { +		pflags &= ~(SD_LOAD_BALANCE | +				SD_BALANCE_NEWIDLE | +				SD_BALANCE_FORK | +				SD_BALANCE_EXEC | +				SD_SHARE_CPUPOWER | +				SD_SHARE_PKG_RESOURCES); +		if (nr_node_ids == 1) +			pflags &= ~SD_SERIALIZE; +	} +	if (~cflags & pflags) +		return 0; + +	return 1; +} + +static void free_rootdomain(struct rcu_head *rcu) +{ +	struct root_domain *rd = container_of(rcu, struct root_domain, rcu); + +	cpupri_cleanup(&rd->cpupri); +	free_cpumask_var(rd->rto_mask); +	free_cpumask_var(rd->online); +	free_cpumask_var(rd->span); +	kfree(rd); +} + +static void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ +	struct root_domain *old_rd = NULL; +	unsigned long flags; + +	raw_spin_lock_irqsave(&rq->lock, flags); + +	if (rq->rd) { +		old_rd = rq->rd; + +		if (cpumask_test_cpu(rq->cpu, old_rd->online)) +			set_rq_offline(rq); + +		cpumask_clear_cpu(rq->cpu, old_rd->span); + +		/* +		 * If we dont want to free the old_rt yet then +		 * set old_rd to NULL to skip the freeing later +		 * in this function: +		 */ +		if (!atomic_dec_and_test(&old_rd->refcount)) +			old_rd = NULL; +	} + +	atomic_inc(&rd->refcount); +	rq->rd = rd; + +	cpumask_set_cpu(rq->cpu, rd->span); +	if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) +		set_rq_online(rq); + +	raw_spin_unlock_irqrestore(&rq->lock, flags); + +	if (old_rd) +		call_rcu_sched(&old_rd->rcu, free_rootdomain); +} + +static int init_rootdomain(struct root_domain *rd) +{ +	memset(rd, 0, sizeof(*rd)); + +	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) +		goto out; +	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) +		goto free_span; +	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) +		goto free_online; + +	if (cpupri_init(&rd->cpupri) != 0) +		goto free_rto_mask; +	return 0; + +free_rto_mask: +	free_cpumask_var(rd->rto_mask); +free_online: +	free_cpumask_var(rd->online); +free_span: +	
free_cpumask_var(rd->span); +out: +	return -ENOMEM; +} + +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +struct root_domain def_root_domain; + +static void init_defrootdomain(void) +{ +	init_rootdomain(&def_root_domain); + +	atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ +	struct root_domain *rd; + +	rd = kmalloc(sizeof(*rd), GFP_KERNEL); +	if (!rd) +		return NULL; + +	if (init_rootdomain(rd) != 0) { +		kfree(rd); +		return NULL; +	} + +	return rd; +} + +static void free_sched_groups(struct sched_group *sg, int free_sgp) +{ +	struct sched_group *tmp, *first; + +	if (!sg) +		return; + +	first = sg; +	do { +		tmp = sg->next; + +		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) +			kfree(sg->sgp); + +		kfree(sg); +		sg = tmp; +	} while (sg != first); +} + +static void free_sched_domain(struct rcu_head *rcu) +{ +	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + +	/* +	 * If its an overlapping domain it has private groups, iterate and +	 * nuke them all. +	 */ +	if (sd->flags & SD_OVERLAP) { +		free_sched_groups(sd->groups, 1); +	} else if (atomic_dec_and_test(&sd->groups->ref)) { +		kfree(sd->groups->sgp); +		kfree(sd->groups); +	} +	kfree(sd); +} + +static void destroy_sched_domain(struct sched_domain *sd, int cpu) +{ +	call_rcu(&sd->rcu, free_sched_domain); +} + +static void destroy_sched_domains(struct sched_domain *sd, int cpu) +{ +	for (; sd; sd = sd->parent) +		destroy_sched_domain(sd, cpu); +} + +/* + * Keep a special pointer to the highest sched_domain that has + * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this + * allows us to avoid some pointer chasing select_idle_sibling(). + * + * Also keep a unique ID per domain (we use the first cpu number in + * the cpumask of the domain), this allows us to quickly tell if + * two cpus are in the same cache domain, see ttwu_share_cache(). + */ +DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(int, sd_llc_id); + +static void update_top_cache_domain(int cpu) +{ +	struct sched_domain *sd; +	int id = cpu; + +	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); +	if (sd) +		id = cpumask_first(sched_domain_span(sd)); + +	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); +	per_cpu(sd_llc_id, cpu) = id; +} + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +{ +	struct rq *rq = cpu_rq(cpu); +	struct sched_domain *tmp; + +	/* Remove the sched domains which do not contribute to scheduling. 
*/ +	for (tmp = sd; tmp; ) { +		struct sched_domain *parent = tmp->parent; +		if (!parent) +			break; + +		if (sd_parent_degenerate(tmp, parent)) { +			tmp->parent = parent->parent; +			if (parent->parent) +				parent->parent->child = tmp; +			destroy_sched_domain(parent, cpu); +		} else +			tmp = tmp->parent; +	} + +	if (sd && sd_degenerate(sd)) { +		tmp = sd; +		sd = sd->parent; +		destroy_sched_domain(tmp, cpu); +		if (sd) +			sd->child = NULL; +	} + +	sched_domain_debug(sd, cpu); + +	rq_attach_root(rq, rd); +	tmp = rq->sd; +	rcu_assign_pointer(rq->sd, sd); +	destroy_sched_domains(tmp, cpu); + +	update_top_cache_domain(cpu); +} + +/* cpus with isolated domains */ +static cpumask_var_t cpu_isolated_map; + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ +	alloc_bootmem_cpumask_var(&cpu_isolated_map); +	cpulist_parse(str, cpu_isolated_map); +	return 1; +} + +__setup("isolcpus=", isolated_cpu_setup); + +#ifdef CONFIG_NUMA + +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. + */ +static int find_next_best_node(int node, nodemask_t *used_nodes) +{ +	int i, n, val, min_val, best_node = -1; + +	min_val = INT_MAX; + +	for (i = 0; i < nr_node_ids; i++) { +		/* Start at @node */ +		n = (node + i) % nr_node_ids; + +		if (!nr_cpus_node(n)) +			continue; + +		/* Skip already used nodes */ +		if (node_isset(n, *used_nodes)) +			continue; + +		/* Simple min distance search */ +		val = node_distance(node, n); + +		if (val < min_val) { +			min_val = val; +			best_node = n; +		} +	} + +	if (best_node != -1) +		node_set(best_node, *used_nodes); +	return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @span: resulting cpumask + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. 
+ */ +static void sched_domain_node_span(int node, struct cpumask *span) +{ +	nodemask_t used_nodes; +	int i; + +	cpumask_clear(span); +	nodes_clear(used_nodes); + +	cpumask_or(span, span, cpumask_of_node(node)); +	node_set(node, used_nodes); + +	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { +		int next_node = find_next_best_node(node, &used_nodes); +		if (next_node < 0) +			break; +		cpumask_or(span, span, cpumask_of_node(next_node)); +	} +} + +static const struct cpumask *cpu_node_mask(int cpu) +{ +	lockdep_assert_held(&sched_domains_mutex); + +	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); + +	return sched_domains_tmpmask; +} + +static const struct cpumask *cpu_allnodes_mask(int cpu) +{ +	return cpu_possible_mask; +} +#endif /* CONFIG_NUMA */ + +static const struct cpumask *cpu_cpu_mask(int cpu) +{ +	return cpumask_of_node(cpu_to_node(cpu)); +} + +int sched_smt_power_savings = 0, sched_mc_power_savings = 0; + +struct sd_data { +	struct sched_domain **__percpu sd; +	struct sched_group **__percpu sg; +	struct sched_group_power **__percpu sgp; +}; + +struct s_data { +	struct sched_domain ** __percpu sd; +	struct root_domain	*rd; +}; + +enum s_alloc { +	sa_rootdomain, +	sa_sd, +	sa_sd_storage, +	sa_none, +}; + +struct sched_domain_topology_level; + +typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); + +#define SDTL_OVERLAP	0x01 + +struct sched_domain_topology_level { +	sched_domain_init_f init; +	sched_domain_mask_f mask; +	int		    flags; +	struct sd_data      data; +}; + +static int +build_overlap_sched_groups(struct sched_domain *sd, int cpu) +{ +	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; +	const struct cpumask *span = sched_domain_span(sd); +	struct cpumask *covered = sched_domains_tmpmask; +	struct sd_data *sdd = sd->private; +	struct sched_domain *child; +	int i; + +	cpumask_clear(covered); + +	for_each_cpu(i, span) { +		struct cpumask *sg_span; + +		if (cpumask_test_cpu(i, covered)) +			continue; + +		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), +				GFP_KERNEL, cpu_to_node(cpu)); + +		if (!sg) +			goto fail; + +		sg_span = sched_group_cpus(sg); + +		child = *per_cpu_ptr(sdd->sd, i); +		if (child->child) { +			child = child->child; +			cpumask_copy(sg_span, sched_domain_span(child)); +		} else +			cpumask_set_cpu(i, sg_span); + +		cpumask_or(covered, covered, sg_span); + +		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); +		atomic_inc(&sg->sgp->ref); + +		if (cpumask_test_cpu(cpu, sg_span)) +			groups = sg; + +		if (!first) +			first = sg; +		if (last) +			last->next = sg; +		last = sg; +		last->next = first; +	} +	sd->groups = groups; + +	return 0; + +fail: +	free_sched_groups(first, 0); + +	return -ENOMEM; +} + +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) +{ +	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); +	struct sched_domain *child = sd->child; + +	if (child) +		cpu = cpumask_first(sched_domain_span(child)); + +	if (sg) { +		*sg = *per_cpu_ptr(sdd->sg, cpu); +		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); +		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ +	} + +	return cpu; +} + +/* + * build_sched_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. 
+ * + * Assumes the sched_domain tree is fully constructed + */ +static int +build_sched_groups(struct sched_domain *sd, int cpu) +{ +	struct sched_group *first = NULL, *last = NULL; +	struct sd_data *sdd = sd->private; +	const struct cpumask *span = sched_domain_span(sd); +	struct cpumask *covered; +	int i; + +	get_group(cpu, sdd, &sd->groups); +	atomic_inc(&sd->groups->ref); + +	if (cpu != cpumask_first(sched_domain_span(sd))) +		return 0; + +	lockdep_assert_held(&sched_domains_mutex); +	covered = sched_domains_tmpmask; + +	cpumask_clear(covered); + +	for_each_cpu(i, span) { +		struct sched_group *sg; +		int group = get_group(i, sdd, &sg); +		int j; + +		if (cpumask_test_cpu(i, covered)) +			continue; + +		cpumask_clear(sched_group_cpus(sg)); +		sg->sgp->power = 0; + +		for_each_cpu(j, span) { +			if (get_group(j, sdd, NULL) != group) +				continue; + +			cpumask_set_cpu(j, covered); +			cpumask_set_cpu(j, sched_group_cpus(sg)); +		} + +		if (!first) +			first = sg; +		if (last) +			last->next = sg; +		last = sg; +	} +	last->next = first; + +	return 0; +} + +/* + * Initialize sched groups cpu_power. + * + * cpu_power indicates the capacity of sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_power for all the groups in a sched domain will be same unless + * there are asymmetries in the topology. If there are asymmetries, group + * having more cpu_power will pickup more load compared to the group having + * less cpu_power. + */ +static void init_sched_groups_power(int cpu, struct sched_domain *sd) +{ +	struct sched_group *sg = sd->groups; + +	WARN_ON(!sd || !sg); + +	do { +		sg->group_weight = cpumask_weight(sched_group_cpus(sg)); +		sg = sg->next; +	} while (sg != sd->groups); + +	if (cpu != group_first_cpu(sg)) +		return; + +	update_group_power(sd, cpu); +	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); +} + +int __weak arch_sd_sibling_asym_packing(void) +{ +       return 0*SD_ASYM_PACKING; +} + +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type)		sd->name = #type +#else +# define SD_INIT_NAME(sd, type)		do { } while (0) +#endif + +#define SD_INIT_FUNC(type)						\ +static noinline struct sched_domain *					\ +sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\ +{									\ +	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\ +	*sd = SD_##type##_INIT;						\ +	SD_INIT_NAME(sd, type);						\ +	sd->private = &tl->data;					\ +	return sd;							\ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_NUMA + SD_INIT_FUNC(ALLNODES) + SD_INIT_FUNC(NODE) +#endif +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif +#ifdef CONFIG_SCHED_BOOK + SD_INIT_FUNC(BOOK) +#endif + +static int default_relax_domain_level = -1; +int sched_domain_level_max; + +static int __init setup_relax_domain_level(char *str) +{ +	unsigned long val; + +	val = simple_strtoul(str, NULL, 0); +	if (val < sched_domain_level_max) +		default_relax_domain_level = val; + +	return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, +				 struct sched_domain_attr *attr) +{ +	int request; + +	if (!attr || attr->relax_domain_level < 0) { +		if (default_relax_domain_level < 0) +			return; +		else +			request = default_relax_domain_level; +	} else +		request = attr->relax_domain_level; +	
if (request < sd->level) { +		/* turn off idle balance on this domain */ +		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); +	} else { +		/* turn on idle balance on this domain */ +		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); +	} +} + +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, +				 const struct cpumask *cpu_map) +{ +	switch (what) { +	case sa_rootdomain: +		if (!atomic_read(&d->rd->refcount)) +			free_rootdomain(&d->rd->rcu); /* fall through */ +	case sa_sd: +		free_percpu(d->sd); /* fall through */ +	case sa_sd_storage: +		__sdt_free(cpu_map); /* fall through */ +	case sa_none: +		break; +	} +} + +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, +						   const struct cpumask *cpu_map) +{ +	memset(d, 0, sizeof(*d)); + +	if (__sdt_alloc(cpu_map)) +		return sa_sd_storage; +	d->sd = alloc_percpu(struct sched_domain *); +	if (!d->sd) +		return sa_sd_storage; +	d->rd = alloc_rootdomain(); +	if (!d->rd) +		return sa_sd; +	return sa_rootdomain; +} + +/* + * NULL the sd_data elements we've used to build the sched_domain and + * sched_group structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) +{ +	struct sd_data *sdd = sd->private; + +	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); +	*per_cpu_ptr(sdd->sd, cpu) = NULL; + +	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) +		*per_cpu_ptr(sdd->sg, cpu) = NULL; + +	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) +		*per_cpu_ptr(sdd->sgp, cpu) = NULL; +} + +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *cpu_smt_mask(int cpu) +{ +	return topology_thread_cpumask(cpu); +} +#endif + +/* + * Topology list, bottom-up. 
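+ *
+ * For example, with CONFIG_SCHED_SMT, CONFIG_SCHED_MC and CONFIG_NUMA all
+ * enabled (and no BOOK level), the table below yields, per cpu, roughly:
+ *
+ *	SIBLING (hw threads) -> MC (cores in a package) -> CPU (the node)
+ *	-> NODE -> ALLNODES
+ *
+ * with each level spanning at least as many cpus as the one beneath it.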
+ */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT +	{ sd_init_SIBLING, cpu_smt_mask, }, +#endif +#ifdef CONFIG_SCHED_MC +	{ sd_init_MC, cpu_coregroup_mask, }, +#endif +#ifdef CONFIG_SCHED_BOOK +	{ sd_init_BOOK, cpu_book_mask, }, +#endif +	{ sd_init_CPU, cpu_cpu_mask, }, +#ifdef CONFIG_NUMA +	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, +	{ sd_init_ALLNODES, cpu_allnodes_mask, }, +#endif +	{ NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = default_topology; + +static int __sdt_alloc(const struct cpumask *cpu_map) +{ +	struct sched_domain_topology_level *tl; +	int j; + +	for (tl = sched_domain_topology; tl->init; tl++) { +		struct sd_data *sdd = &tl->data; + +		sdd->sd = alloc_percpu(struct sched_domain *); +		if (!sdd->sd) +			return -ENOMEM; + +		sdd->sg = alloc_percpu(struct sched_group *); +		if (!sdd->sg) +			return -ENOMEM; + +		sdd->sgp = alloc_percpu(struct sched_group_power *); +		if (!sdd->sgp) +			return -ENOMEM; + +		for_each_cpu(j, cpu_map) { +			struct sched_domain *sd; +			struct sched_group *sg; +			struct sched_group_power *sgp; + +		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), +					GFP_KERNEL, cpu_to_node(j)); +			if (!sd) +				return -ENOMEM; + +			*per_cpu_ptr(sdd->sd, j) = sd; + +			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), +					GFP_KERNEL, cpu_to_node(j)); +			if (!sg) +				return -ENOMEM; + +			*per_cpu_ptr(sdd->sg, j) = sg; + +			sgp = kzalloc_node(sizeof(struct sched_group_power), +					GFP_KERNEL, cpu_to_node(j)); +			if (!sgp) +				return -ENOMEM; + +			*per_cpu_ptr(sdd->sgp, j) = sgp; +		} +	} + +	return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ +	struct sched_domain_topology_level *tl; +	int j; + +	for (tl = sched_domain_topology; tl->init; tl++) { +		struct sd_data *sdd = &tl->data; + +		for_each_cpu(j, cpu_map) { +			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); +			if (sd && (sd->flags & SD_OVERLAP)) +				free_sched_groups(sd->groups, 0); +			kfree(*per_cpu_ptr(sdd->sd, j)); +			kfree(*per_cpu_ptr(sdd->sg, j)); +			kfree(*per_cpu_ptr(sdd->sgp, j)); +		} +		free_percpu(sdd->sd); +		free_percpu(sdd->sg); +		free_percpu(sdd->sgp); +	} +} + +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, +		struct s_data *d, const struct cpumask *cpu_map, +		struct sched_domain_attr *attr, struct sched_domain *child, +		int cpu) +{ +	struct sched_domain *sd = tl->init(tl, cpu); +	if (!sd) +		return child; + +	set_domain_attribute(sd, attr); +	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); +	if (child) { +		sd->level = child->level + 1; +		sched_domain_level_max = max(sched_domain_level_max, sd->level); +		child->parent = sd; +	} +	sd->child = child; + +	return sd; +} + +/* + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus + */ +static int build_sched_domains(const struct cpumask *cpu_map, +			       struct sched_domain_attr *attr) +{ +	enum s_alloc alloc_state = sa_none; +	struct sched_domain *sd; +	struct s_data d; +	int i, ret = -ENOMEM; + +	alloc_state = __visit_domain_allocation_hell(&d, cpu_map); +	if (alloc_state != sa_rootdomain) +		goto error; + +	/* Set up domains for cpus specified by the cpu_map. 
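+	 * For each cpu this walks sched_domain_topology bottom-up, chaining
+	 * the levels via ->child/->parent, and stops early once a level
+	 * already spans the whole cpu_map (nothing larger could add any
+	 * cpus).  The per-cpu pointer d.sd is then left at the lowest
+	 * domain of the chain.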
*/ +	for_each_cpu(i, cpu_map) { +		struct sched_domain_topology_level *tl; + +		sd = NULL; +		for (tl = sched_domain_topology; tl->init; tl++) { +			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); +			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) +				sd->flags |= SD_OVERLAP; +			if (cpumask_equal(cpu_map, sched_domain_span(sd))) +				break; +		} + +		while (sd->child) +			sd = sd->child; + +		*per_cpu_ptr(d.sd, i) = sd; +	} + +	/* Build the groups for the domains */ +	for_each_cpu(i, cpu_map) { +		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { +			sd->span_weight = cpumask_weight(sched_domain_span(sd)); +			if (sd->flags & SD_OVERLAP) { +				if (build_overlap_sched_groups(sd, i)) +					goto error; +			} else { +				if (build_sched_groups(sd, i)) +					goto error; +			} +		} +	} + +	/* Calculate CPU power for physical packages and nodes */ +	for (i = nr_cpumask_bits-1; i >= 0; i--) { +		if (!cpumask_test_cpu(i, cpu_map)) +			continue; + +		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { +			claim_allocations(i, sd); +			init_sched_groups_power(i, sd); +		} +	} + +	/* Attach the domains */ +	rcu_read_lock(); +	for_each_cpu(i, cpu_map) { +		sd = *per_cpu_ptr(d.sd, i); +		cpu_attach_domain(sd, d.rd, i); +	} +	rcu_read_unlock(); + +	ret = 0; +error: +	__free_domain_allocs(&d, alloc_state, cpu_map); +	return ret; +} + +static cpumask_var_t *doms_cur;	/* current sched domains */ +static int ndoms_cur;		/* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; +				/* attribues of custom domains in 'doms_cur' */ + +/* + * Special case: If a kmalloc of a doms_cur partition (array of + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. + */ +static cpumask_var_t fallback_doms; + +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __attribute__((weak)) arch_update_cpu_topology(void) +{ +	return 0; +} + +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ +	int i; +	cpumask_var_t *doms; + +	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); +	if (!doms) +		return NULL; +	for (i = 0; i < ndoms; i++) { +		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { +			free_sched_domains(doms, i); +			return NULL; +		} +	} +	return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ +	unsigned int i; +	for (i = 0; i < ndoms; i++) +		free_cpumask_var(doms[i]); +	kfree(doms); +} + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. 
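+ *
+ * For example, booting a 4-cpu box with isolcpus=3 removes cpu 3 from
+ * doms_cur[0] via the cpumask_andnot() below, so that cpu keeps a NULL
+ * sched domain and is never load balanced; tasks run there only when
+ * explicitly affined to it.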
+ */ +static int init_sched_domains(const struct cpumask *cpu_map) +{ +	int err; + +	arch_update_cpu_topology(); +	ndoms_cur = 1; +	doms_cur = alloc_sched_domains(ndoms_cur); +	if (!doms_cur) +		doms_cur = &fallback_doms; +	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); +	dattr_cur = NULL; +	err = build_sched_domains(doms_cur[0], NULL); +	register_sched_domain_sysctl(); + +	return err; +} + +/* + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain + */ +static void detach_destroy_domains(const struct cpumask *cpu_map) +{ +	int i; + +	rcu_read_lock(); +	for_each_cpu(i, cpu_map) +		cpu_attach_domain(NULL, &def_root_domain, i); +	rcu_read_unlock(); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, +			struct sched_domain_attr *new, int idx_new) +{ +	struct sched_domain_attr tmp; + +	/* fast path */ +	if (!new && !cur) +		return 1; + +	tmp = SD_ATTR_INIT; +	return !memcmp(cur ? (cur + idx_cur) : &tmp, +			new ? (new + idx_new) : &tmp, +			sizeof(struct sched_domain_attr)); +} + +/* + * Partition sched domains as specified by the 'ndoms_new' + * cpumasks in the array doms_new[] of cpumasks. This compares + * doms_new[] to the current sched domain partitioning, doms_cur[]. + * It destroys each deleted domain and builds each new domain. + * + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the + * current 'doms_cur' domains and in the new 'doms_new', we can leave + * it as it is. + * + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains.  This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. + * + * If doms_new == NULL it will be replaced with cpu_online_mask. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. + * + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], +			     struct sched_domain_attr *dattr_new) +{ +	int i, j, n; +	int new_topology; + +	mutex_lock(&sched_domains_mutex); + +	/* always unregister in case we don't destroy any domains */ +	unregister_sched_domain_sysctl(); + +	/* Let architecture update cpu core mappings. */ +	new_topology = arch_update_cpu_topology(); + +	n = doms_new ? 
ndoms_new : 0; + +	/* Destroy deleted domains */ +	for (i = 0; i < ndoms_cur; i++) { +		for (j = 0; j < n && !new_topology; j++) { +			if (cpumask_equal(doms_cur[i], doms_new[j]) +			    && dattrs_equal(dattr_cur, i, dattr_new, j)) +				goto match1; +		} +		/* no match - a current sched domain not in new doms_new[] */ +		detach_destroy_domains(doms_cur[i]); +match1: +		; +	} + +	if (doms_new == NULL) { +		ndoms_cur = 0; +		doms_new = &fallback_doms; +		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); +		WARN_ON_ONCE(dattr_new); +	} + +	/* Build new domains */ +	for (i = 0; i < ndoms_new; i++) { +		for (j = 0; j < ndoms_cur && !new_topology; j++) { +			if (cpumask_equal(doms_new[i], doms_cur[j]) +			    && dattrs_equal(dattr_new, i, dattr_cur, j)) +				goto match2; +		} +		/* no match - add a new doms_new */ +		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); +match2: +		; +	} + +	/* Remember the new sched domains */ +	if (doms_cur != &fallback_doms) +		free_sched_domains(doms_cur, ndoms_cur); +	kfree(dattr_cur);	/* kfree(NULL) is safe */ +	doms_cur = doms_new; +	dattr_cur = dattr_new; +	ndoms_cur = ndoms_new; + +	register_sched_domain_sysctl(); + +	mutex_unlock(&sched_domains_mutex); +} + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +static void reinit_sched_domains(void) +{ +	get_online_cpus(); + +	/* Destroy domains first to force the rebuild */ +	partition_sched_domains(0, NULL, NULL); + +	rebuild_sched_domains(); +	put_online_cpus(); +} + +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) +{ +	unsigned int level = 0; + +	if (sscanf(buf, "%u", &level) != 1) +		return -EINVAL; + +	/* +	 * level is always be positive so don't check for +	 * level < POWERSAVINGS_BALANCE_NONE which is 0 +	 * What happens on 0 or 1 byte write, +	 * need to check for count as well? 
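+	 * (A zero-length or non-numeric write already fails the sscanf()
+	 * "%u" conversion above and returns -EINVAL, so count itself does
+	 * not appear to need an extra check.)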
+	 */ + +	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) +		return -EINVAL; + +	if (smt) +		sched_smt_power_savings = level; +	else +		sched_mc_power_savings = level; + +	reinit_sched_domains(); + +	return count; +} + +#ifdef CONFIG_SCHED_MC +static ssize_t sched_mc_power_savings_show(struct device *dev, +					   struct device_attribute *attr, +					   char *buf) +{ +	return sprintf(buf, "%u\n", sched_mc_power_savings); +} +static ssize_t sched_mc_power_savings_store(struct device *dev, +					    struct device_attribute *attr, +					    const char *buf, size_t count) +{ +	return sched_power_savings_store(buf, count, 0); +} +static DEVICE_ATTR(sched_mc_power_savings, 0644, +		   sched_mc_power_savings_show, +		   sched_mc_power_savings_store); +#endif + +#ifdef CONFIG_SCHED_SMT +static ssize_t sched_smt_power_savings_show(struct device *dev, +					    struct device_attribute *attr, +					    char *buf) +{ +	return sprintf(buf, "%u\n", sched_smt_power_savings); +} +static ssize_t sched_smt_power_savings_store(struct device *dev, +					    struct device_attribute *attr, +					     const char *buf, size_t count) +{ +	return sched_power_savings_store(buf, count, 1); +} +static DEVICE_ATTR(sched_smt_power_savings, 0644, +		   sched_smt_power_savings_show, +		   sched_smt_power_savings_store); +#endif + +int __init sched_create_sysfs_power_savings_entries(struct device *dev) +{ +	int err = 0; + +#ifdef CONFIG_SCHED_SMT +	if (smt_capable()) +		err = device_create_file(dev, &dev_attr_sched_smt_power_savings); +#endif +#ifdef CONFIG_SCHED_MC +	if (!err && mc_capable()) +		err = device_create_file(dev, &dev_attr_sched_mc_power_savings); +#endif +	return err; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + +/* + * Update cpusets according to cpu_active mask.  If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). 
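+ *
+ * The two notifiers below are registered in sched_init_smp() at the
+ * CPU_PRI_CPUSET_ACTIVE/INACTIVE priorities, so the domains get rebuilt
+ * after a cpu has come online (CPU_ONLINE, or CPU_DOWN_FAILED undoing a
+ * failed offline) and just before one is taken down (CPU_DOWN_PREPARE).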
+ */ +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, +			     void *hcpu) +{ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_ONLINE: +	case CPU_DOWN_FAILED: +		cpuset_update_active_cpus(); +		return NOTIFY_OK; +	default: +		return NOTIFY_DONE; +	} +} + +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, +			       void *hcpu) +{ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_DOWN_PREPARE: +		cpuset_update_active_cpus(); +		return NOTIFY_OK; +	default: +		return NOTIFY_DONE; +	} +} + +void __init sched_init_smp(void) +{ +	cpumask_var_t non_isolated_cpus; + +	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); +	alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + +	get_online_cpus(); +	mutex_lock(&sched_domains_mutex); +	init_sched_domains(cpu_active_mask); +	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); +	if (cpumask_empty(non_isolated_cpus)) +		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); +	mutex_unlock(&sched_domains_mutex); +	put_online_cpus(); + +	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); +	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); + +	/* RT runtime code needs to handle some hotplug events */ +	hotcpu_notifier(update_runtime, 0); + +	init_hrtick(); + +	/* Move init over to a non-isolated CPU */ +	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) +		BUG(); +	sched_init_granularity(); +	free_cpumask_var(non_isolated_cpus); + +	init_sched_rt_class(); +} +#else +void __init sched_init_smp(void) +{ +	sched_init_granularity(); +} +#endif /* CONFIG_SMP */ + +const_debug unsigned int sysctl_timer_migration = 1; + +int in_sched_functions(unsigned long addr) +{ +	return in_lock_functions(addr) || +		(addr >= (unsigned long)__sched_text_start +		&& addr < (unsigned long)__sched_text_end); +} + +#ifdef CONFIG_CGROUP_SCHED +struct task_group root_task_group; +#endif + +DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); + +void __init sched_init(void) +{ +	int i, j; +	unsigned long alloc_size = 0, ptr; + +#ifdef CONFIG_FAIR_GROUP_SCHED +	alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED +	alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_CPUMASK_OFFSTACK +	alloc_size += num_possible_cpus() * cpumask_size(); +#endif +	if (alloc_size) { +		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); + +#ifdef CONFIG_FAIR_GROUP_SCHED +		root_task_group.se = (struct sched_entity **)ptr; +		ptr += nr_cpu_ids * sizeof(void **); + +		root_task_group.cfs_rq = (struct cfs_rq **)ptr; +		ptr += nr_cpu_ids * sizeof(void **); + +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED +		root_task_group.rt_se = (struct sched_rt_entity **)ptr; +		ptr += nr_cpu_ids * sizeof(void **); + +		root_task_group.rt_rq = (struct rt_rq **)ptr; +		ptr += nr_cpu_ids * sizeof(void **); + +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_CPUMASK_OFFSTACK +		for_each_possible_cpu(i) { +			per_cpu(load_balance_tmpmask, i) = (void *)ptr; +			ptr += cpumask_size(); +		} +#endif /* CONFIG_CPUMASK_OFFSTACK */ +	} + +#ifdef CONFIG_SMP +	init_defrootdomain(); +#endif + +	init_rt_bandwidth(&def_rt_bandwidth, +			global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED +	init_rt_bandwidth(&root_task_group.rt_bandwidth, +			global_rt_period(), global_rt_runtime()); +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_CGROUP_SCHED +	list_add(&root_task_group.list, &task_groups); +	
INIT_LIST_HEAD(&root_task_group.children); +	INIT_LIST_HEAD(&root_task_group.siblings); +	autogroup_init(&init_task); + +#endif /* CONFIG_CGROUP_SCHED */ + +#ifdef CONFIG_CGROUP_CPUACCT +	root_cpuacct.cpustat = &kernel_cpustat; +	root_cpuacct.cpuusage = alloc_percpu(u64); +	/* Too early, not expected to fail */ +	BUG_ON(!root_cpuacct.cpuusage); +#endif +	for_each_possible_cpu(i) { +		struct rq *rq; + +		rq = cpu_rq(i); +		raw_spin_lock_init(&rq->lock); +		rq->nr_running = 0; +		rq->calc_load_active = 0; +		rq->calc_load_update = jiffies + LOAD_FREQ; +		init_cfs_rq(&rq->cfs); +		init_rt_rq(&rq->rt, rq); +#ifdef CONFIG_FAIR_GROUP_SCHED +		root_task_group.shares = ROOT_TASK_GROUP_LOAD; +		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); +		/* +		 * How much cpu bandwidth does root_task_group get? +		 * +		 * In case of task-groups formed thr' the cgroup filesystem, it +		 * gets 100% of the cpu resources in the system. This overall +		 * system cpu resource is divided among the tasks of +		 * root_task_group and its child task-groups in a fair manner, +		 * based on each entity's (task or task-group's) weight +		 * (se->load.weight). +		 * +		 * In other words, if root_task_group has 10 tasks of weight +		 * 1024) and two child groups A0 and A1 (of weight 1024 each), +		 * then A0's share of the cpu resource is: +		 * +		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% +		 * +		 * We achieve this by letting root_task_group's tasks sit +		 * directly in rq->cfs (i.e root_task_group->se[] = NULL). +		 */ +		init_cfs_bandwidth(&root_task_group.cfs_bandwidth); +		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; +#ifdef CONFIG_RT_GROUP_SCHED +		INIT_LIST_HEAD(&rq->leaf_rt_rq_list); +		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); +#endif + +		for (j = 0; j < CPU_LOAD_IDX_MAX; j++) +			rq->cpu_load[j] = 0; + +		rq->last_load_update_tick = jiffies; + +#ifdef CONFIG_SMP +		rq->sd = NULL; +		rq->rd = NULL; +		rq->cpu_power = SCHED_POWER_SCALE; +		rq->post_schedule = 0; +		rq->active_balance = 0; +		rq->next_balance = jiffies; +		rq->push_cpu = 0; +		rq->cpu = i; +		rq->online = 0; +		rq->idle_stamp = 0; +		rq->avg_idle = 2*sysctl_sched_migration_cost; +		rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ +		rq->nohz_flags = 0; +#endif +#endif +		init_rq_hrtick(rq); +		atomic_set(&rq->nr_iowait, 0); +	} + +	set_load_weight(&init_task); + +#ifdef CONFIG_PREEMPT_NOTIFIERS +	INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + +#ifdef CONFIG_RT_MUTEXES +	plist_head_init(&init_task.pi_waiters); +#endif + +	/* +	 * The boot idle thread does lazy MMU switching as well: +	 */ +	atomic_inc(&init_mm.mm_count); +	enter_lazy_tlb(&init_mm, current); + +	/* +	 * Make us the idle thread. Technically, schedule() should not be +	 * called from this thread, however somewhere below it might be, +	 * but because we are the idle thread, we just pick up running again +	 * when this runqueue becomes "idle". 
+	 */ +	init_idle(current, smp_processor_id()); + +	calc_load_update = jiffies + LOAD_FREQ; + +	/* +	 * During early bootup we pretend to be a normal task: +	 */ +	current->sched_class = &fair_sched_class; + +#ifdef CONFIG_SMP +	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); +	/* May be allocated at isolcpus cmdline parse time */ +	if (cpu_isolated_map == NULL) +		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); +#endif +	init_sched_fair_class(); + +	scheduler_running = 1; +} + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +static inline int preempt_count_equals(int preempt_offset) +{ +	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + +	return (nested == preempt_offset); +} + +void __might_sleep(const char *file, int line, int preempt_offset) +{ +	static unsigned long prev_jiffy;	/* ratelimiting */ + +	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ +	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || +	    system_state != SYSTEM_RUNNING || oops_in_progress) +		return; +	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) +		return; +	prev_jiffy = jiffies; + +	printk(KERN_ERR +		"BUG: sleeping function called from invalid context at %s:%d\n", +			file, line); +	printk(KERN_ERR +		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", +			in_atomic(), irqs_disabled(), +			current->pid, current->comm); + +	debug_show_held_locks(current); +	if (irqs_disabled()) +		print_irqtrace_events(current); +	dump_stack(); +} +EXPORT_SYMBOL(__might_sleep); +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +static void normalize_task(struct rq *rq, struct task_struct *p) +{ +	const struct sched_class *prev_class = p->sched_class; +	int old_prio = p->prio; +	int on_rq; + +	on_rq = p->on_rq; +	if (on_rq) +		deactivate_task(rq, p, 0); +	__setscheduler(rq, p, SCHED_NORMAL, 0); +	if (on_rq) { +		activate_task(rq, p, 0); +		resched_task(rq->curr); +	} + +	check_class_changed(rq, p, prev_class, old_prio); +} + +void normalize_rt_tasks(void) +{ +	struct task_struct *g, *p; +	unsigned long flags; +	struct rq *rq; + +	read_lock_irqsave(&tasklist_lock, flags); +	do_each_thread(g, p) { +		/* +		 * Only normalize user tasks: +		 */ +		if (!p->mm) +			continue; + +		p->se.exec_start		= 0; +#ifdef CONFIG_SCHEDSTATS +		p->se.statistics.wait_start	= 0; +		p->se.statistics.sleep_start	= 0; +		p->se.statistics.block_start	= 0; +#endif + +		if (!rt_task(p)) { +			/* +			 * Renice negative nice level userspace +			 * tasks back to 0: +			 */ +			if (TASK_NICE(p) < 0 && p->mm) +				set_user_nice(p, 0); +			continue; +		} + +		raw_spin_lock(&p->pi_lock); +		rq = __task_rq_lock(p); + +		normalize_task(rq, p); + +		__task_rq_unlock(rq); +		raw_spin_unlock(&p->pi_lock); +	} while_each_thread(g, p); + +	read_unlock_irqrestore(&tasklist_lock, flags); +} + +#endif /* CONFIG_MAGIC_SYSRQ */ + +#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) +/* + * These functions are only useful for the IA64 MCA handling, or kdb. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
+ */ +struct task_struct *curr_task(int cpu) +{ +	return cpu_curr(cpu); +} + +#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ + +#ifdef CONFIG_IA64 +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with all CPU's synchronized, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, struct task_struct *p) +{ +	cpu_curr(cpu) = p; +} + +#endif + +#ifdef CONFIG_CGROUP_SCHED +/* task_group_lock serializes the addition/removal of task groups */ +static DEFINE_SPINLOCK(task_group_lock); + +static void free_sched_group(struct task_group *tg) +{ +	free_fair_sched_group(tg); +	free_rt_sched_group(tg); +	autogroup_free(tg); +	kfree(tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(struct task_group *parent) +{ +	struct task_group *tg; +	unsigned long flags; + +	tg = kzalloc(sizeof(*tg), GFP_KERNEL); +	if (!tg) +		return ERR_PTR(-ENOMEM); + +	if (!alloc_fair_sched_group(tg, parent)) +		goto err; + +	if (!alloc_rt_sched_group(tg, parent)) +		goto err; + +	spin_lock_irqsave(&task_group_lock, flags); +	list_add_rcu(&tg->list, &task_groups); + +	WARN_ON(!parent); /* root should already exist */ + +	tg->parent = parent; +	INIT_LIST_HEAD(&tg->children); +	list_add_rcu(&tg->siblings, &parent->children); +	spin_unlock_irqrestore(&task_group_lock, flags); + +	return tg; + +err: +	free_sched_group(tg); +	return ERR_PTR(-ENOMEM); +} + +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group_rcu(struct rcu_head *rhp) +{ +	/* now it should be safe to free those cfs_rqs */ +	free_sched_group(container_of(rhp, struct task_group, rcu)); +} + +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_group *tg) +{ +	unsigned long flags; +	int i; + +	/* end participation in shares distribution */ +	for_each_possible_cpu(i) +		unregister_fair_sched_group(tg, i); + +	spin_lock_irqsave(&task_group_lock, flags); +	list_del_rcu(&tg->list); +	list_del_rcu(&tg->siblings); +	spin_unlock_irqrestore(&task_group_lock, flags); + +	/* wait for possible concurrent references to cfs_rqs complete */ +	call_rcu(&tg->rcu, free_sched_group_rcu); +} + +/* change task's runqueue when it moves between groups. + *	The caller of this function should have put the task in its new group + *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + *	reflect its new group. 
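+ *
+ *	It is invoked from the cpu cgroup's ->attach() and ->exit()
+ *	callbacks (and by the autogroup code when a task's autogroup
+ *	changes); the cgroup attach path further down is simply:
+ *
+ *		cgroup_taskset_for_each(task, cgrp, tset)
+ *			sched_move_task(task);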
+ */ +void sched_move_task(struct task_struct *tsk) +{ +	int on_rq, running; +	unsigned long flags; +	struct rq *rq; + +	rq = task_rq_lock(tsk, &flags); + +	running = task_current(rq, tsk); +	on_rq = tsk->on_rq; + +	if (on_rq) +		dequeue_task(rq, tsk, 0); +	if (unlikely(running)) +		tsk->sched_class->put_prev_task(rq, tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +	if (tsk->sched_class->task_move_group) +		tsk->sched_class->task_move_group(tsk, on_rq); +	else +#endif +		set_task_rq(tsk, task_cpu(tsk)); + +	if (unlikely(running)) +		tsk->sched_class->set_curr_task(rq); +	if (on_rq) +		enqueue_task(rq, tsk, 0); + +	task_rq_unlock(rq, tsk, &flags); +} +#endif /* CONFIG_CGROUP_SCHED */ + +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) +static unsigned long to_ratio(u64 period, u64 runtime) +{ +	if (runtime == RUNTIME_INF) +		return 1ULL << 20; + +	return div64_u64(runtime << 20, period); +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex); + +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) +{ +	struct task_struct *g, *p; + +	do_each_thread(g, p) { +		if (rt_task(p) && task_rq(p)->rt.tg == tg) +			return 1; +	} while_each_thread(g, p); + +	return 0; +} + +struct rt_schedulable_data { +	struct task_group *tg; +	u64 rt_period; +	u64 rt_runtime; +}; + +static int tg_rt_schedulable(struct task_group *tg, void *data) +{ +	struct rt_schedulable_data *d = data; +	struct task_group *child; +	unsigned long total, sum = 0; +	u64 period, runtime; + +	period = ktime_to_ns(tg->rt_bandwidth.rt_period); +	runtime = tg->rt_bandwidth.rt_runtime; + +	if (tg == d->tg) { +		period = d->rt_period; +		runtime = d->rt_runtime; +	} + +	/* +	 * Cannot have more runtime than the period. +	 */ +	if (runtime > period && runtime != RUNTIME_INF) +		return -EINVAL; + +	/* +	 * Ensure we don't starve existing RT tasks. +	 */ +	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) +		return -EBUSY; + +	total = to_ratio(period, runtime); + +	/* +	 * Nobody can have more than the global setting allows. +	 */ +	if (total > to_ratio(global_rt_period(), global_rt_runtime())) +		return -EINVAL; + +	/* +	 * The sum of our children's runtime should not exceed our own. 
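+	 *
+	 * The comparison is done on the fixed-point ratio produced by
+	 * to_ratio(), i.e. runtime/period scaled by 2^20.  For instance the
+	 * default global limits (1s period, 0.95s runtime) give
+	 *
+	 *	to_ratio(1000000000, 950000000) = (950000000 << 20) / 1000000000
+	 *	                                ~= 996147 (~0.95 * 2^20)
+	 *
+	 * and the children's ratios summed below must not exceed this
+	 * group's own ratio (total).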
+	 */ +	list_for_each_entry_rcu(child, &tg->children, siblings) { +		period = ktime_to_ns(child->rt_bandwidth.rt_period); +		runtime = child->rt_bandwidth.rt_runtime; + +		if (child == d->tg) { +			period = d->rt_period; +			runtime = d->rt_runtime; +		} + +		sum += to_ratio(period, runtime); +	} + +	if (sum > total) +		return -EINVAL; + +	return 0; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ +	int ret; + +	struct rt_schedulable_data data = { +		.tg = tg, +		.rt_period = period, +		.rt_runtime = runtime, +	}; + +	rcu_read_lock(); +	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); +	rcu_read_unlock(); + +	return ret; +} + +static int tg_set_rt_bandwidth(struct task_group *tg, +		u64 rt_period, u64 rt_runtime) +{ +	int i, err = 0; + +	mutex_lock(&rt_constraints_mutex); +	read_lock(&tasklist_lock); +	err = __rt_schedulable(tg, rt_period, rt_runtime); +	if (err) +		goto unlock; + +	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); +	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); +	tg->rt_bandwidth.rt_runtime = rt_runtime; + +	for_each_possible_cpu(i) { +		struct rt_rq *rt_rq = tg->rt_rq[i]; + +		raw_spin_lock(&rt_rq->rt_runtime_lock); +		rt_rq->rt_runtime = rt_runtime; +		raw_spin_unlock(&rt_rq->rt_runtime_lock); +	} +	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); +unlock: +	read_unlock(&tasklist_lock); +	mutex_unlock(&rt_constraints_mutex); + +	return err; +} + +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ +	u64 rt_runtime, rt_period; + +	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); +	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; +	if (rt_runtime_us < 0) +		rt_runtime = RUNTIME_INF; + +	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_runtime(struct task_group *tg) +{ +	u64 rt_runtime_us; + +	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) +		return -1; + +	rt_runtime_us = tg->rt_bandwidth.rt_runtime; +	do_div(rt_runtime_us, NSEC_PER_USEC); +	return rt_runtime_us; +} + +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +{ +	u64 rt_runtime, rt_period; + +	rt_period = (u64)rt_period_us * NSEC_PER_USEC; +	rt_runtime = tg->rt_bandwidth.rt_runtime; + +	if (rt_period == 0) +		return -EINVAL; + +	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ +	u64 rt_period_us; + +	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); +	do_div(rt_period_us, NSEC_PER_USEC); +	return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ +	u64 runtime, period; +	int ret = 0; + +	if (sysctl_sched_rt_period <= 0) +		return -EINVAL; + +	runtime = global_rt_runtime(); +	period = global_rt_period(); + +	/* +	 * Sanity check on the sysctl variables. 
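+	 *
+	 * With the default values (sched_rt_period_us = 1000000,
+	 * sched_rt_runtime_us = 950000) realtime tasks may consume at most
+	 * 95% of each period; a runtime of -1 maps to RUNTIME_INF and lifts
+	 * the limit entirely, which is why RUNTIME_INF is exempted from the
+	 * runtime > period check below.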
+	 */ +	if (runtime > period && runtime != RUNTIME_INF) +		return -EINVAL; + +	mutex_lock(&rt_constraints_mutex); +	read_lock(&tasklist_lock); +	ret = __rt_schedulable(NULL, 0, 0); +	read_unlock(&tasklist_lock); +	mutex_unlock(&rt_constraints_mutex); + +	return ret; +} + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ +	/* Don't accept realtime tasks when there is no way for them to run */ +	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) +		return 0; + +	return 1; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ +static int sched_rt_global_constraints(void) +{ +	unsigned long flags; +	int i; + +	if (sysctl_sched_rt_period <= 0) +		return -EINVAL; + +	/* +	 * There's always some RT tasks in the root group +	 * -- migration, kstopmachine etc.. +	 */ +	if (sysctl_sched_rt_runtime == 0) +		return -EBUSY; + +	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); +	for_each_possible_cpu(i) { +		struct rt_rq *rt_rq = &cpu_rq(i)->rt; + +		raw_spin_lock(&rt_rq->rt_runtime_lock); +		rt_rq->rt_runtime = global_rt_runtime(); +		raw_spin_unlock(&rt_rq->rt_runtime_lock); +	} +	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + +	return 0; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +int sched_rt_handler(struct ctl_table *table, int write, +		void __user *buffer, size_t *lenp, +		loff_t *ppos) +{ +	int ret; +	int old_period, old_runtime; +	static DEFINE_MUTEX(mutex); + +	mutex_lock(&mutex); +	old_period = sysctl_sched_rt_period; +	old_runtime = sysctl_sched_rt_runtime; + +	ret = proc_dointvec(table, write, buffer, lenp, ppos); + +	if (!ret && write) { +		ret = sched_rt_global_constraints(); +		if (ret) { +			sysctl_sched_rt_period = old_period; +			sysctl_sched_rt_runtime = old_runtime; +		} else { +			def_rt_bandwidth.rt_runtime = global_rt_runtime(); +			def_rt_bandwidth.rt_period = +				ns_to_ktime(global_rt_period()); +		} +	} +	mutex_unlock(&mutex); + +	return ret; +} + +#ifdef CONFIG_CGROUP_SCHED + +/* return corresponding task_group object of a cgroup */ +static inline struct task_group *cgroup_tg(struct cgroup *cgrp) +{ +	return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), +			    struct task_group, css); +} + +static struct cgroup_subsys_state * +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ +	struct task_group *tg, *parent; + +	if (!cgrp->parent) { +		/* This is early initialization for the top cgroup */ +		return &root_task_group.css; +	} + +	parent = cgroup_tg(cgrp->parent); +	tg = sched_create_group(parent); +	if (IS_ERR(tg)) +		return ERR_PTR(-ENOMEM); + +	return &tg->css; +} + +static void +cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ +	struct task_group *tg = cgroup_tg(cgrp); + +	sched_destroy_group(tg); +} + +static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +				 struct cgroup_taskset *tset) +{ +	struct task_struct *task; + +	cgroup_taskset_for_each(task, cgrp, tset) { +#ifdef CONFIG_RT_GROUP_SCHED +		if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) +			return -EINVAL; +#else +		/* We don't support RT-tasks being in separate groups */ +		if (task->sched_class != &fair_sched_class) +			return -EINVAL; +#endif +	} +	return 0; +} + +static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +			      struct cgroup_taskset *tset) +{ +	struct task_struct *task; + +	cgroup_taskset_for_each(task, cgrp, tset) +		sched_move_task(task); +} + +static void +cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, +		struct 
cgroup *old_cgrp, struct task_struct *task) +{ +	/* +	 * cgroup_exit() is called in the copy_process() failure path. +	 * Ignore this case since the task hasn't ran yet, this avoids +	 * trying to poke a half freed task state from generic code. +	 */ +	if (!(task->flags & PF_EXITING)) +		return; + +	sched_move_task(task); +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, +				u64 shareval) +{ +	return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); +} + +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ +	struct task_group *tg = cgroup_tg(cgrp); + +	return (u64) scale_load_down(tg->shares); +} + +#ifdef CONFIG_CFS_BANDWIDTH +static DEFINE_MUTEX(cfs_constraints_mutex); + +const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ +const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); + +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +{ +	int i, ret = 0, runtime_enabled, runtime_was_enabled; +	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + +	if (tg == &root_task_group) +		return -EINVAL; + +	/* +	 * Ensure we have at some amount of bandwidth every period.  This is +	 * to prevent reaching a state of large arrears when throttled via +	 * entity_tick() resulting in prolonged exit starvation. +	 */ +	if (quota < min_cfs_quota_period || period < min_cfs_quota_period) +		return -EINVAL; + +	/* +	 * Likewise, bound things on the otherside by preventing insane quota +	 * periods.  This also allows us to normalize in computing quota +	 * feasibility. +	 */ +	if (period > max_cfs_quota_period) +		return -EINVAL; + +	mutex_lock(&cfs_constraints_mutex); +	ret = __cfs_schedulable(tg, period, quota); +	if (ret) +		goto out_unlock; + +	runtime_enabled = quota != RUNTIME_INF; +	runtime_was_enabled = cfs_b->quota != RUNTIME_INF; +	account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); +	raw_spin_lock_irq(&cfs_b->lock); +	cfs_b->period = ns_to_ktime(period); +	cfs_b->quota = quota; + +	__refill_cfs_bandwidth_runtime(cfs_b); +	/* restart the period timer (if active) to handle new period expiry */ +	if (runtime_enabled && cfs_b->timer_active) { +		/* force a reprogram */ +		cfs_b->timer_active = 0; +		__start_cfs_bandwidth(cfs_b); +	} +	raw_spin_unlock_irq(&cfs_b->lock); + +	for_each_possible_cpu(i) { +		struct cfs_rq *cfs_rq = tg->cfs_rq[i]; +		struct rq *rq = cfs_rq->rq; + +		raw_spin_lock_irq(&rq->lock); +		cfs_rq->runtime_enabled = runtime_enabled; +		cfs_rq->runtime_remaining = 0; + +		if (cfs_rq->throttled) +			unthrottle_cfs_rq(cfs_rq); +		raw_spin_unlock_irq(&rq->lock); +	} +out_unlock: +	mutex_unlock(&cfs_constraints_mutex); + +	return ret; +} + +int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +{ +	u64 quota, period; + +	period = ktime_to_ns(tg->cfs_bandwidth.period); +	if (cfs_quota_us < 0) +		quota = RUNTIME_INF; +	else +		quota = (u64)cfs_quota_us * NSEC_PER_USEC; + +	return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_quota(struct task_group *tg) +{ +	u64 quota_us; + +	if (tg->cfs_bandwidth.quota == RUNTIME_INF) +		return -1; + +	quota_us = tg->cfs_bandwidth.quota; +	do_div(quota_us, NSEC_PER_USEC); + +	return quota_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ +	u64 quota, period; + +	period = (u64)cfs_period_us * NSEC_PER_USEC; +	quota = tg->cfs_bandwidth.quota; + +	return 
tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_period(struct task_group *tg) +{ +	u64 cfs_period_us; + +	cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); +	do_div(cfs_period_us, NSEC_PER_USEC); + +	return cfs_period_us; +} + +static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ +	return tg_get_cfs_quota(cgroup_tg(cgrp)); +} + +static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, +				s64 cfs_quota_us) +{ +	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ +	return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, +				u64 cfs_period_us) +{ +	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +struct cfs_schedulable_data { +	struct task_group *tg; +	u64 period, quota; +}; + +/* + * normalize group quota/period to be quota/max_period + * note: units are usecs + */ +static u64 normalize_cfs_quota(struct task_group *tg, +			       struct cfs_schedulable_data *d) +{ +	u64 quota, period; + +	if (tg == d->tg) { +		period = d->period; +		quota = d->quota; +	} else { +		period = tg_get_cfs_period(tg); +		quota = tg_get_cfs_quota(tg); +	} + +	/* note: these should typically be equivalent */ +	if (quota == RUNTIME_INF || quota == -1) +		return RUNTIME_INF; + +	return to_ratio(period, quota); +} + +static int tg_cfs_schedulable_down(struct task_group *tg, void *data) +{ +	struct cfs_schedulable_data *d = data; +	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; +	s64 quota = 0, parent_quota = -1; + +	if (!tg->parent) { +		quota = RUNTIME_INF; +	} else { +		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; + +		quota = normalize_cfs_quota(tg, d); +		parent_quota = parent_b->hierarchal_quota; + +		/* +		 * ensure max(child_quota) <= parent_quota, inherit when no +		 * limit is set +		 */ +		if (quota == RUNTIME_INF) +			quota = parent_quota; +		else if (parent_quota != RUNTIME_INF && quota > parent_quota) +			return -EINVAL; +	} +	cfs_b->hierarchal_quota = quota; + +	return 0; +} + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) +{ +	int ret; +	struct cfs_schedulable_data data = { +		.tg = tg, +		.period = period, +		.quota = quota, +	}; + +	if (quota != RUNTIME_INF) { +		do_div(data.period, NSEC_PER_USEC); +		do_div(data.quota, NSEC_PER_USEC); +	} + +	rcu_read_lock(); +	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); +	rcu_read_unlock(); + +	return ret; +} + +static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, +		struct cgroup_map_cb *cb) +{ +	struct task_group *tg = cgroup_tg(cgrp); +	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + +	cb->fill(cb, "nr_periods", cfs_b->nr_periods); +	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); +	cb->fill(cb, "throttled_time", cfs_b->throttled_time); + +	return 0; +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, +				s64 val) +{ +	return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); +} + +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +{ +	return sched_group_rt_runtime(cgroup_tg(cgrp)); +} + +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, +		u64 rt_period_us) +{ +	return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} + +static u64 
cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ +	return sched_group_rt_period(cgroup_tg(cgrp)); +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED +	{ +		.name = "shares", +		.read_u64 = cpu_shares_read_u64, +		.write_u64 = cpu_shares_write_u64, +	}, +#endif +#ifdef CONFIG_CFS_BANDWIDTH +	{ +		.name = "cfs_quota_us", +		.read_s64 = cpu_cfs_quota_read_s64, +		.write_s64 = cpu_cfs_quota_write_s64, +	}, +	{ +		.name = "cfs_period_us", +		.read_u64 = cpu_cfs_period_read_u64, +		.write_u64 = cpu_cfs_period_write_u64, +	}, +	{ +		.name = "stat", +		.read_map = cpu_stats_show, +	}, +#endif +#ifdef CONFIG_RT_GROUP_SCHED +	{ +		.name = "rt_runtime_us", +		.read_s64 = cpu_rt_runtime_read, +		.write_s64 = cpu_rt_runtime_write, +	}, +	{ +		.name = "rt_period_us", +		.read_u64 = cpu_rt_period_read_uint, +		.write_u64 = cpu_rt_period_write_uint, +	}, +#endif +}; + +static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ +	return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); +} + +struct cgroup_subsys cpu_cgroup_subsys = { +	.name		= "cpu", +	.create		= cpu_cgroup_create, +	.destroy	= cpu_cgroup_destroy, +	.can_attach	= cpu_cgroup_can_attach, +	.attach		= cpu_cgroup_attach, +	.exit		= cpu_cgroup_exit, +	.populate	= cpu_cgroup_populate, +	.subsys_id	= cpu_cgroup_subsys_id, +	.early_init	= 1, +}; + +#endif	/* CONFIG_CGROUP_SCHED */ + +#ifdef CONFIG_CGROUP_CPUACCT + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_create( +	struct cgroup_subsys *ss, struct cgroup *cgrp) +{ +	struct cpuacct *ca; + +	if (!cgrp->parent) +		return &root_cpuacct.css; + +	ca = kzalloc(sizeof(*ca), GFP_KERNEL); +	if (!ca) +		goto out; + +	ca->cpuusage = alloc_percpu(u64); +	if (!ca->cpuusage) +		goto out_free_ca; + +	ca->cpustat = alloc_percpu(struct kernel_cpustat); +	if (!ca->cpustat) +		goto out_free_cpuusage; + +	return &ca->css; + +out_free_cpuusage: +	free_percpu(ca->cpuusage); +out_free_ca: +	kfree(ca); +out: +	return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); + +	free_percpu(ca->cpustat); +	free_percpu(ca->cpuusage); +	kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ +	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); +	u64 data; + +#ifndef CONFIG_64BIT +	/* +	 * Take rq->lock to make 64-bit read safe on 32-bit platforms. +	 */ +	raw_spin_lock_irq(&cpu_rq(cpu)->lock); +	data = *cpuusage; +	raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else +	data = *cpuusage; +#endif + +	return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ +	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT +	/* +	 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 
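+	 * A 32-bit cpu cannot store a u64 in a single instruction, so
+	 * without the lock a concurrent cpuacct_cpuusage_read() could
+	 * observe a torn, half-updated value.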
+	 */ +	raw_spin_lock_irq(&cpu_rq(cpu)->lock); +	*cpuusage = val; +	raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else +	*cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); +	u64 totalcpuusage = 0; +	int i; + +	for_each_present_cpu(i) +		totalcpuusage += cpuacct_cpuusage_read(ca, i); + +	return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, +								u64 reset) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); +	int err = 0; +	int i; + +	if (reset) { +		err = -EINVAL; +		goto out; +	} + +	for_each_present_cpu(i) +		cpuacct_cpuusage_write(ca, i, 0); + +out: +	return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, +				   struct seq_file *m) +{ +	struct cpuacct *ca = cgroup_ca(cgroup); +	u64 percpu; +	int i; + +	for_each_present_cpu(i) { +		percpu = cpuacct_cpuusage_read(ca, i); +		seq_printf(m, "%llu ", (unsigned long long) percpu); +	} +	seq_printf(m, "\n"); +	return 0; +} + +static const char *cpuacct_stat_desc[] = { +	[CPUACCT_STAT_USER] = "user", +	[CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, +			      struct cgroup_map_cb *cb) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); +	int cpu; +	s64 val = 0; + +	for_each_online_cpu(cpu) { +		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); +		val += kcpustat->cpustat[CPUTIME_USER]; +		val += kcpustat->cpustat[CPUTIME_NICE]; +	} +	val = cputime64_to_clock_t(val); +	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + +	val = 0; +	for_each_online_cpu(cpu) { +		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); +		val += kcpustat->cpustat[CPUTIME_SYSTEM]; +		val += kcpustat->cpustat[CPUTIME_IRQ]; +		val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; +	} + +	val = cputime64_to_clock_t(val); +	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + +	return 0; +} + +static struct cftype files[] = { +	{ +		.name = "usage", +		.read_u64 = cpuusage_read, +		.write_u64 = cpuusage_write, +	}, +	{ +		.name = "usage_percpu", +		.read_seq_string = cpuacct_percpu_seq_read, +	}, +	{ +		.name = "stat", +		.read_map = cpuacct_stats_show, +	}, +}; + +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ +	return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); +} + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. 
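+ *
+ * The charge is hierarchical: the loop below walks from the task's group
+ * up through parent_ca() to the root, adding @cputime to the per-cpu
+ * counter of each group on the path, so a parent's usage always includes
+ * that of its children.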
+ */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +	struct cpuacct *ca; +	int cpu; + +	if (unlikely(!cpuacct_subsys.active)) +		return; + +	cpu = task_cpu(tsk); + +	rcu_read_lock(); + +	ca = task_ca(tsk); + +	for (; ca; ca = parent_ca(ca)) { +		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); +		*cpuusage += cputime; +	} + +	rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { +	.name = "cpuacct", +	.create = cpuacct_create, +	.destroy = cpuacct_destroy, +	.populate = cpuacct_populate, +	.subsys_id = cpuacct_subsys_id, +}; +#endif	/* CONFIG_CGROUP_CPUACCT */ diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c new file mode 100644 index 00000000000..b0d798eaf13 --- /dev/null +++ b/kernel/sched/cpupri.c @@ -0,0 +1,241 @@ +/* + *  kernel/sched/cpupri.c + * + *  CPU priority management + * + *  Copyright (C) 2007-2008 Novell + * + *  Author: Gregory Haskins <ghaskins@novell.com> + * + *  This code tracks the priority of each CPU so that global migration + *  decisions are easy to calculate.  Each CPU can be in a state as follows: + * + *                 (INVALID), IDLE, NORMAL, RT1, ... RT99 + * + *  going from the lowest priority to the highest.  CPUs in the INVALID state + *  are not eligible for routing.  The system maintains this state with + *  a 2 dimensional bitmap (the first for priority class, the second for cpus + *  in that class).  Therefore a typical application without affinity + *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit + *  searches).  For tasks with affinity restrictions, the algorithm has a + *  worst case complexity of O(min(102, nr_domcpus)), though the scenario that + *  yields the worst case search is fairly contrived. + * + *  This program is free software; you can redistribute it and/or + *  modify it under the terms of the GNU General Public License + *  as published by the Free Software Foundation; version 2 + *  of the License. + */ + +#include <linux/gfp.h> +#include "cpupri.h" + +/* Convert between a 140 based task->prio, and our 102 based cpupri */ +static int convert_prio(int prio) +{ +	int cpupri; + +	if (prio == CPUPRI_INVALID) +		cpupri = CPUPRI_INVALID; +	else if (prio == MAX_PRIO) +		cpupri = CPUPRI_IDLE; +	else if (prio >= MAX_RT_PRIO) +		cpupri = CPUPRI_NORMAL; +	else +		cpupri = MAX_RT_PRIO - prio + 1; + +	return cpupri; +} + +/** + * cpupri_find - find the best (lowest-pri) CPU in the system + * @cp: The cpupri context + * @p: The task + * @lowest_mask: A mask to fill in with selected CPUs (or NULL) + * + * Note: This function returns the recommended CPUs as calculated during the + * current invocation.  By the time the call returns, the CPUs may have in + * fact changed priorities any number of times.  While not ideal, it is not + * an issue of correctness since the normal rebalancer logic will correct + * any discrepancies created by racing against the uncertainty of the current + * priority configuration. + * + * Returns: (int)bool - CPUs were found + */ +int cpupri_find(struct cpupri *cp, struct task_struct *p, +		struct cpumask *lowest_mask) +{ +	int                  idx      = 0; +	int                  task_pri = convert_prio(p->prio); + +	if (task_pri >= MAX_RT_PRIO) +		return 0; + +	for (idx = 0; idx < task_pri; idx++) { +		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx]; +		int skip = 0; + +		if (!atomic_read(&(vec)->count)) +			skip = 1; +		/* +		 * When looking at the vector, we need to read the counter, +		 * do a memory barrier, then read the mask. 
+		 * +		 * Note: This is still all racey, but we can deal with it. +		 *  Ideally, we only want to look at masks that are set. +		 * +		 *  If a mask is not set, then the only thing wrong is that we +		 *  did a little more work than necessary. +		 * +		 *  If we read a zero count but the mask is set, because of the +		 *  memory barriers, that can only happen when the highest prio +		 *  task for a run queue has left the run queue, in which case, +		 *  it will be followed by a pull. If the task we are processing +		 *  fails to find a proper place to go, that pull request will +		 *  pull this task if the run queue is running at a lower +		 *  priority. +		 */ +		smp_rmb(); + +		/* Need to do the rmb for every iteration */ +		if (skip) +			continue; + +		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) +			continue; + +		if (lowest_mask) { +			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + +			/* +			 * We have to ensure that we have at least one bit +			 * still set in the array, since the map could have +			 * been concurrently emptied between the first and +			 * second reads of vec->mask.  If we hit this +			 * condition, simply act as though we never hit this +			 * priority level and continue on. +			 */ +			if (cpumask_any(lowest_mask) >= nr_cpu_ids) +				continue; +		} + +		return 1; +	} + +	return 0; +} + +/** + * cpupri_set - update the cpu priority setting + * @cp: The cpupri context + * @cpu: The target cpu + * @pri: The priority (INVALID-RT99) to assign to this CPU + * + * Note: Assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpupri_set(struct cpupri *cp, int cpu, int newpri) +{ +	int                 *currpri = &cp->cpu_to_pri[cpu]; +	int                  oldpri  = *currpri; +	int                  do_mb = 0; + +	newpri = convert_prio(newpri); + +	BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); + +	if (newpri == oldpri) +		return; + +	/* +	 * If the cpu was currently mapped to a different value, we +	 * need to map it to the new value then remove the old value. +	 * Note, we must add the new value first, otherwise we risk the +	 * cpu being missed by the priority loop in cpupri_find. +	 */ +	if (likely(newpri != CPUPRI_INVALID)) { +		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; + +		cpumask_set_cpu(cpu, vec->mask); +		/* +		 * When adding a new vector, we update the mask first, +		 * do a write memory barrier, and then update the count, to +		 * make sure the vector is visible when count is set. +		 */ +		smp_mb__before_atomic_inc(); +		atomic_inc(&(vec)->count); +		do_mb = 1; +	} +	if (likely(oldpri != CPUPRI_INVALID)) { +		struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri]; + +		/* +		 * Because the order of modification of the vec->count +		 * is important, we must make sure that the update +		 * of the new prio is seen before we decrement the +		 * old prio. This makes sure that the loop sees +		 * one or the other when we raise the priority of +		 * the run queue. We don't care about when we lower the +		 * priority, as that will trigger an rt pull anyway. +		 * +		 * We only need to do a memory barrier if we updated +		 * the new priority vec. +		 */ +		if (do_mb) +			smp_mb__after_atomic_inc(); + +		/* +		 * When removing from the vector, we decrement the counter first +		 * do a memory barrier and then clear the mask. 
+		 */ +		atomic_dec(&(vec)->count); +		smp_mb__after_atomic_inc(); +		cpumask_clear_cpu(cpu, vec->mask); +	} + +	*currpri = newpri; +} + +/** + * cpupri_init - initialize the cpupri structure + * @cp: The cpupri context + * @bootmem: true if allocations need to use bootmem + * + * Returns: -ENOMEM if memory fails. + */ +int cpupri_init(struct cpupri *cp) +{ +	int i; + +	memset(cp, 0, sizeof(*cp)); + +	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { +		struct cpupri_vec *vec = &cp->pri_to_cpu[i]; + +		atomic_set(&vec->count, 0); +		if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) +			goto cleanup; +	} + +	for_each_possible_cpu(i) +		cp->cpu_to_pri[i] = CPUPRI_INVALID; +	return 0; + +cleanup: +	for (i--; i >= 0; i--) +		free_cpumask_var(cp->pri_to_cpu[i].mask); +	return -ENOMEM; +} + +/** + * cpupri_cleanup - clean up the cpupri structure + * @cp: The cpupri context + */ +void cpupri_cleanup(struct cpupri *cp) +{ +	int i; + +	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) +		free_cpumask_var(cp->pri_to_cpu[i].mask); +} diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h new file mode 100644 index 00000000000..f6d75617349 --- /dev/null +++ b/kernel/sched/cpupri.h @@ -0,0 +1,34 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include <linux/sched.h> + +#define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO + 2) + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE     0 +#define CPUPRI_NORMAL   1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { +	atomic_t	count; +	cpumask_var_t	mask; +}; + +struct cpupri { +	struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; +	int               cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int  cpupri_find(struct cpupri *cp, +		 struct task_struct *p, struct cpumask *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +int cpupri_init(struct cpupri *cp); +void cpupri_cleanup(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c new file mode 100644 index 00000000000..2a075e10004 --- /dev/null +++ b/kernel/sched/debug.c @@ -0,0 +1,510 @@ +/* + * kernel/sched/debug.c + * + * Print the CFS rbtree + * + * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/utsname.h> + +#include "sched.h" + +static DEFINE_SPINLOCK(sched_debug_lock); + +/* + * This allows printing both to /proc/sched_debug and + * to the console + */ +#define SEQ_printf(m, x...)			
\ + do {						\ +	if (m)					\ +		seq_printf(m, x);		\ +	else					\ +		printk(x);			\ + } while (0) + +/* + * Ease the printing of nsec fields: + */ +static long long nsec_high(unsigned long long nsec) +{ +	if ((long long)nsec < 0) { +		nsec = -nsec; +		do_div(nsec, 1000000); +		return -nsec; +	} +	do_div(nsec, 1000000); + +	return nsec; +} + +static unsigned long nsec_low(unsigned long long nsec) +{ +	if ((long long)nsec < 0) +		nsec = -nsec; + +	return do_div(nsec, 1000000); +} + +#define SPLIT_NS(x) nsec_high(x), nsec_low(x) + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) +{ +	struct sched_entity *se = tg->se[cpu]; +	if (!se) +		return; + +#define P(F) \ +	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F) +#define PN(F) \ +	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + +	PN(se->exec_start); +	PN(se->vruntime); +	PN(se->sum_exec_runtime); +#ifdef CONFIG_SCHEDSTATS +	PN(se->statistics.wait_start); +	PN(se->statistics.sleep_start); +	PN(se->statistics.block_start); +	PN(se->statistics.sleep_max); +	PN(se->statistics.block_max); +	PN(se->statistics.exec_max); +	PN(se->statistics.slice_max); +	PN(se->statistics.wait_max); +	PN(se->statistics.wait_sum); +	P(se->statistics.wait_count); +#endif +	P(se->load.weight); +#undef PN +#undef P +} +#endif + +#ifdef CONFIG_CGROUP_SCHED +static char group_path[PATH_MAX]; + +static char *task_group_path(struct task_group *tg) +{ +	if (autogroup_path(tg, group_path, PATH_MAX)) +		return group_path; + +	/* +	 * May be NULL if the underlying cgroup isn't fully-created yet +	 */ +	if (!tg->css.cgroup) { +		group_path[0] = '\0'; +		return group_path; +	} +	cgroup_path(tg->css.cgroup, group_path, PATH_MAX); +	return group_path; +} +#endif + +static void +print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +{ +	if (rq->curr == p) +		SEQ_printf(m, "R"); +	else +		SEQ_printf(m, " "); + +	SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", +		p->comm, p->pid, +		SPLIT_NS(p->se.vruntime), +		(long long)(p->nvcsw + p->nivcsw), +		p->prio); +#ifdef CONFIG_SCHEDSTATS +	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", +		SPLIT_NS(p->se.vruntime), +		SPLIT_NS(p->se.sum_exec_runtime), +		SPLIT_NS(p->se.statistics.sum_sleep_runtime)); +#else +	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", +		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); +#endif +#ifdef CONFIG_CGROUP_SCHED +	SEQ_printf(m, " %s", task_group_path(task_group(p))); +#endif + +	SEQ_printf(m, "\n"); +} + +static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) +{ +	struct task_struct *g, *p; +	unsigned long flags; + +	SEQ_printf(m, +	"\nrunnable tasks:\n" +	"            task   PID         tree-key  switches  prio" +	"     exec-runtime         sum-exec        sum-sleep\n" +	"------------------------------------------------------" +	"----------------------------------------------------\n"); + +	read_lock_irqsave(&tasklist_lock, flags); + +	do_each_thread(g, p) { +		if (!p->on_rq || task_cpu(p) != rq_cpu) +			continue; + +		print_task(m, rq, p); +	} while_each_thread(g, p); + +	read_unlock_irqrestore(&tasklist_lock, flags); +} + +void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +{ +	s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, +		spread, rq0_min_vruntime, spread0; +	struct rq *rq = cpu_rq(cpu); +	struct sched_entity *last; +	unsigned long flags; + +#ifdef CONFIG_FAIR_GROUP_SCHED +	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); 
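+	/*
+	 * Group paths print as e.g. "cfs_rq[0]:/" for the root group or
+	 * "cfs_rq[0]:/autogroup-12" under autogrouping (the id being just
+	 * an illustrative example).
+	 */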
+#else +	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +#endif +	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock", +			SPLIT_NS(cfs_rq->exec_clock)); + +	raw_spin_lock_irqsave(&rq->lock, flags); +	if (cfs_rq->rb_leftmost) +		MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; +	last = __pick_last_entity(cfs_rq); +	if (last) +		max_vruntime = last->vruntime; +	min_vruntime = cfs_rq->min_vruntime; +	rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; +	raw_spin_unlock_irqrestore(&rq->lock, flags); +	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime", +			SPLIT_NS(MIN_vruntime)); +	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime", +			SPLIT_NS(min_vruntime)); +	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "max_vruntime", +			SPLIT_NS(max_vruntime)); +	spread = max_vruntime - MIN_vruntime; +	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread", +			SPLIT_NS(spread)); +	spread0 = min_vruntime - rq0_min_vruntime; +	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread0", +			SPLIT_NS(spread0)); +	SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over", +			cfs_rq->nr_spread_over); +	SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); +	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight); +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_SMP +	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_avg", +			SPLIT_NS(cfs_rq->load_avg)); +	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_period", +			SPLIT_NS(cfs_rq->load_period)); +	SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib", +			cfs_rq->load_contribution); +	SEQ_printf(m, "  .%-30s: %d\n", "load_tg", +			atomic_read(&cfs_rq->tg->load_weight)); +#endif + +	print_cfs_group_stats(m, cpu, cfs_rq->tg); +#endif +} + +void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) +{ +#ifdef CONFIG_RT_GROUP_SCHED +	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); +#else +	SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +#endif + +#define P(x) \ +	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) +#define PN(x) \ +	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) + +	P(rt_nr_running); +	P(rt_throttled); +	PN(rt_time); +	PN(rt_runtime); + +#undef PN +#undef P +} + +extern __read_mostly int sched_clock_running; + +static void print_cpu(struct seq_file *m, int cpu) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long flags; + +#ifdef CONFIG_X86 +	{ +		unsigned int freq = cpu_khz ? 
: 1; + +		SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", +			   cpu, freq / 1000, (freq % 1000)); +	} +#else +	SEQ_printf(m, "\ncpu#%d\n", cpu); +#endif + +#define P(x) \ +	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define PN(x) \ +	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) + +	P(nr_running); +	SEQ_printf(m, "  .%-30s: %lu\n", "load", +		   rq->load.weight); +	P(nr_switches); +	P(nr_load_updates); +	P(nr_uninterruptible); +	PN(next_balance); +	P(curr->pid); +	PN(clock); +	P(cpu_load[0]); +	P(cpu_load[1]); +	P(cpu_load[2]); +	P(cpu_load[3]); +	P(cpu_load[4]); +#undef P +#undef PN + +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n); +#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n); + +	P(yld_count); + +	P(sched_switch); +	P(sched_count); +	P(sched_goidle); +#ifdef CONFIG_SMP +	P64(avg_idle); +#endif + +	P(ttwu_count); +	P(ttwu_local); + +#undef P +#undef P64 +#endif +	spin_lock_irqsave(&sched_debug_lock, flags); +	print_cfs_stats(m, cpu); +	print_rt_stats(m, cpu); + +	rcu_read_lock(); +	print_rq(m, rq, cpu); +	rcu_read_unlock(); +	spin_unlock_irqrestore(&sched_debug_lock, flags); +} + +static const char *sched_tunable_scaling_names[] = { +	"none", +	"logaritmic", +	"linear" +}; + +static int sched_debug_show(struct seq_file *m, void *v) +{ +	u64 ktime, sched_clk, cpu_clk; +	unsigned long flags; +	int cpu; + +	local_irq_save(flags); +	ktime = ktime_to_ns(ktime_get()); +	sched_clk = sched_clock(); +	cpu_clk = local_clock(); +	local_irq_restore(flags); + +	SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", +		init_utsname()->release, +		(int)strcspn(init_utsname()->version, " "), +		init_utsname()->version); + +#define P(x) \ +	SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ +	SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) +	PN(ktime); +	PN(sched_clk); +	PN(cpu_clk); +	P(jiffies); +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +	P(sched_clock_stable); +#endif +#undef PN +#undef P + +	SEQ_printf(m, "\n"); +	SEQ_printf(m, "sysctl_sched\n"); + +#define P(x) \ +	SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ +	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) +	PN(sysctl_sched_latency); +	PN(sysctl_sched_min_granularity); +	PN(sysctl_sched_wakeup_granularity); +	P(sysctl_sched_child_runs_first); +	P(sysctl_sched_features); +#undef PN +#undef P + +	SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", +		sysctl_sched_tunable_scaling, +		sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + +	for_each_online_cpu(cpu) +		print_cpu(m, cpu); + +	SEQ_printf(m, "\n"); + +	return 0; +} + +void sysrq_sched_debug_show(void) +{ +	sched_debug_show(NULL, NULL); +} + +static int sched_debug_open(struct inode *inode, struct file *filp) +{ +	return single_open(filp, sched_debug_show, NULL); +} + +static const struct file_operations sched_debug_fops = { +	.open		= sched_debug_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; + +static int __init init_sched_debug_procfs(void) +{ +	struct proc_dir_entry *pe; + +	pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops); +	if (!pe) +		return -ENOMEM; +	return 0; +} + +__initcall(init_sched_debug_procfs); + +void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +{ +	unsigned long nr_switches; + +	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, +						get_nr_threads(p)); +	SEQ_printf(m, +		"---------------------------------------------------------\n"); +#define 
__P(F) \ +	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) +#define P(F) \ +	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) \ +	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) \ +	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + +	PN(se.exec_start); +	PN(se.vruntime); +	PN(se.sum_exec_runtime); + +	nr_switches = p->nvcsw + p->nivcsw; + +#ifdef CONFIG_SCHEDSTATS +	PN(se.statistics.wait_start); +	PN(se.statistics.sleep_start); +	PN(se.statistics.block_start); +	PN(se.statistics.sleep_max); +	PN(se.statistics.block_max); +	PN(se.statistics.exec_max); +	PN(se.statistics.slice_max); +	PN(se.statistics.wait_max); +	PN(se.statistics.wait_sum); +	P(se.statistics.wait_count); +	PN(se.statistics.iowait_sum); +	P(se.statistics.iowait_count); +	P(se.nr_migrations); +	P(se.statistics.nr_migrations_cold); +	P(se.statistics.nr_failed_migrations_affine); +	P(se.statistics.nr_failed_migrations_running); +	P(se.statistics.nr_failed_migrations_hot); +	P(se.statistics.nr_forced_migrations); +	P(se.statistics.nr_wakeups); +	P(se.statistics.nr_wakeups_sync); +	P(se.statistics.nr_wakeups_migrate); +	P(se.statistics.nr_wakeups_local); +	P(se.statistics.nr_wakeups_remote); +	P(se.statistics.nr_wakeups_affine); +	P(se.statistics.nr_wakeups_affine_attempts); +	P(se.statistics.nr_wakeups_passive); +	P(se.statistics.nr_wakeups_idle); + +	{ +		u64 avg_atom, avg_per_cpu; + +		avg_atom = p->se.sum_exec_runtime; +		if (nr_switches) +			do_div(avg_atom, nr_switches); +		else +			avg_atom = -1LL; + +		avg_per_cpu = p->se.sum_exec_runtime; +		if (p->se.nr_migrations) { +			avg_per_cpu = div64_u64(avg_per_cpu, +						p->se.nr_migrations); +		} else { +			avg_per_cpu = -1LL; +		} + +		__PN(avg_atom); +		__PN(avg_per_cpu); +	} +#endif +	__P(nr_switches); +	SEQ_printf(m, "%-35s:%21Ld\n", +		   "nr_voluntary_switches", (long long)p->nvcsw); +	SEQ_printf(m, "%-35s:%21Ld\n", +		   "nr_involuntary_switches", (long long)p->nivcsw); + +	P(se.load.weight); +	P(policy); +	P(prio); +#undef PN +#undef __PN +#undef P +#undef __P + +	{ +		unsigned int this_cpu = raw_smp_processor_id(); +		u64 t0, t1; + +		t0 = cpu_clock(this_cpu); +		t1 = cpu_clock(this_cpu); +		SEQ_printf(m, "%-35s:%21Ld\n", +			   "clock-delta", (long long)(t1-t0)); +	} +} + +void proc_sched_set_task(struct task_struct *p) +{ +#ifdef CONFIG_SCHEDSTATS +	memset(&p->se.statistics, 0, sizeof(p->se.statistics)); +#endif +} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c new file mode 100644 index 00000000000..84adb2d66cb --- /dev/null +++ b/kernel/sched/fair.c @@ -0,0 +1,5596 @@ +/* + * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) + * + *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + *  Interactivity improvements by Mike Galbraith + *  (C) 2007 Mike Galbraith <efault@gmx.de> + * + *  Various enhancements by Dmitry Adamushko. 
+ *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com> + * + *  Group scheduling enhancements by Srivatsa Vaddagiri + *  Copyright IBM Corporation, 2007 + *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> + * + *  Scaled math optimizations by Thomas Gleixner + *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> + * + *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra + *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + */ + +#include <linux/latencytop.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/slab.h> +#include <linux/profile.h> +#include <linux/interrupt.h> + +#include <trace/events/sched.h> + +#include "sched.h" + +/* + * Targeted preemption latency for CPU-bound tasks: + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + * + * NOTE: this latency value is not the same as the concept of + * 'timeslice length' - timeslices in CFS are of variable length + * and have no persistent notion like in traditional, time-slice + * based scheduling concepts. + * + * (to see the precise effective timeslice length of your workload, + *  run vmstat and monitor the context-switches (cs) field) + */ +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; + +/* + * The initial- and re-scaling of tunables is configurable + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * + * Options are: + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + */ +enum sched_tunable_scaling sysctl_sched_tunable_scaling +	= SCHED_TUNABLESCALING_LOG; + +/* + * Minimal preemption granularity for CPU-bound tasks: + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; + +/* + * is kept at sysctl_sched_latency / sysctl_sched_min_granularity + */ +static unsigned int sched_nr_latency = 8; + +/* + * After fork, child runs first. If set to 0 (default) then + * parent will (try to) run first. + */ +unsigned int sysctl_sched_child_runs_first __read_mostly; + +/* + * SCHED_OTHER wake-up granularity. + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + * + * This option delays the preemption effects of decoupled workloads + * and reduces their over-scheduling. Synchronous workloads will still + * have immediate wakeup/sleep latencies. + */ +unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + +const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + +/* + * The exponential sliding  window over which load is averaged for shares + * distribution. + * (default: 10msec) + */ +unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; + +#ifdef CONFIG_CFS_BANDWIDTH +/* + * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool + * each time a cfs_rq requests quota. + * + * Note: in the case that the slice exceeds the runtime remaining (either due + * to consumption or the quota being specified to be smaller than the slice) + * we will always only issue the remaining available time. 
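+ *
+ * For example, with a 5 msec slice (the default below) a cfs_rq that
+ * exhausts its local runtime asks the global pool for roughly a slice's
+ * worth at a time; if the group only has 2 msec of quota left in the
+ * current period, it is handed just those 2 msec.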
+ * + * default: 5 msec, units: microseconds +  */ +unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. + * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static int get_update_sysctl_factor(void) +{ +	unsigned int cpus = min_t(int, num_online_cpus(), 8); +	unsigned int factor; + +	switch (sysctl_sched_tunable_scaling) { +	case SCHED_TUNABLESCALING_NONE: +		factor = 1; +		break; +	case SCHED_TUNABLESCALING_LINEAR: +		factor = cpus; +		break; +	case SCHED_TUNABLESCALING_LOG: +	default: +		factor = 1 + ilog2(cpus); +		break; +	} + +	return factor; +} + +static void update_sysctl(void) +{ +	unsigned int factor = get_update_sysctl_factor(); + +#define SET_SYSCTL(name) \ +	(sysctl_##name = (factor) * normalized_sysctl_##name) +	SET_SYSCTL(sched_min_granularity); +	SET_SYSCTL(sched_latency); +	SET_SYSCTL(sched_wakeup_granularity); +#undef SET_SYSCTL +} + +void sched_init_granularity(void) +{ +	update_sysctl(); +} + +#if BITS_PER_LONG == 32 +# define WMULT_CONST	(~0UL) +#else +# define WMULT_CONST	(1UL << 32) +#endif + +#define WMULT_SHIFT	32 + +/* + * Shift right and round: + */ +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +/* + * delta *= weight / lw + */ +static unsigned long +calc_delta_mine(unsigned long delta_exec, unsigned long weight, +		struct load_weight *lw) +{ +	u64 tmp; + +	/* +	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched +	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than +	 * 2^SCHED_LOAD_RESOLUTION. 
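+	 *
+	 * The function as a whole computes delta_exec * weight / lw->weight,
+	 * using a pre-computed shifted inverse of lw->weight rather than a
+	 * 64-bit division.  E.g. in calc_delta_fair() a nice-0 task
+	 * (weight 1024) sees its delta unchanged, while an entity of twice
+	 * that weight has its delta halved.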
+	 */ +	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) +		tmp = (u64)delta_exec * scale_load_down(weight); +	else +		tmp = (u64)delta_exec; + +	if (!lw->inv_weight) { +		unsigned long w = scale_load_down(lw->weight); + +		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) +			lw->inv_weight = 1; +		else if (unlikely(!w)) +			lw->inv_weight = WMULT_CONST; +		else +			lw->inv_weight = WMULT_CONST / w; +	} + +	/* +	 * Check whether we'd overflow the 64-bit multiplication: +	 */ +	if (unlikely(tmp > WMULT_CONST)) +		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, +			WMULT_SHIFT/2); +	else +		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + +	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); +} + + +const struct sched_class fair_sched_class; + +/************************************************************** + * CFS operations on generic schedulable entities: + */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* cpu runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ +	return cfs_rq->rq; +} + +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se)	(!se->my_q) + +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG +	WARN_ON_ONCE(!entity_is_task(se)); +#endif +	return container_of(se, struct task_struct, se); +} + +/* Walk up scheduling entities hierarchy */ +#define for_each_sched_entity(se) \ +		for (; se; se = se->parent) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ +	return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ +	return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ +	return grp->my_q; +} + +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +	if (!cfs_rq->on_list) { +		/* +		 * Ensure we either appear before our parent (if already +		 * enqueued) or force our parent to appear after us when it is +		 * enqueued.  The fact that we always enqueue bottom-up +		 * reduces this to two cases. +		 */ +		if (cfs_rq->tg->parent && +		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { +			list_add_rcu(&cfs_rq->leaf_cfs_rq_list, +				&rq_of(cfs_rq)->leaf_cfs_rq_list); +		} else { +			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, +				&rq_of(cfs_rq)->leaf_cfs_rq_list); +		} + +		cfs_rq->on_list = 1; +	} +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +	if (cfs_rq->on_list) { +		list_del_rcu(&cfs_rq->leaf_cfs_rq_list); +		cfs_rq->on_list = 0; +	} +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ +	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + +/* Do the two (enqueued) entities belong to the same group ? 
*/ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ +	if (se->cfs_rq == pse->cfs_rq) +		return 1; + +	return 0; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ +	return se->parent; +} + +/* return depth at which a sched entity is present in the hierarchy */ +static inline int depth_se(struct sched_entity *se) +{ +	int depth = 0; + +	for_each_sched_entity(se) +		depth++; + +	return depth; +} + +static void +find_matching_se(struct sched_entity **se, struct sched_entity **pse) +{ +	int se_depth, pse_depth; + +	/* +	 * preemption test can be made between sibling entities who are in the +	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of +	 * both tasks until we find their ancestors who are siblings of common +	 * parent. +	 */ + +	/* First walk up until both entities are at same depth */ +	se_depth = depth_se(*se); +	pse_depth = depth_se(*pse); + +	while (se_depth > pse_depth) { +		se_depth--; +		*se = parent_entity(*se); +	} + +	while (pse_depth > se_depth) { +		pse_depth--; +		*pse = parent_entity(*pse); +	} + +	while (!is_same_group(*se, *pse)) { +		*se = parent_entity(*se); +		*pse = parent_entity(*pse); +	} +} + +#else	/* !CONFIG_FAIR_GROUP_SCHED */ + +static inline struct task_struct *task_of(struct sched_entity *se) +{ +	return container_of(se, struct task_struct, se); +} + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ +	return container_of(cfs_rq, struct rq, cfs); +} + +#define entity_is_task(se)	1 + +#define for_each_sched_entity(se) \ +		for (; se; se = NULL) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ +	return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ +	struct task_struct *p = task_of(se); +	struct rq *rq = task_rq(p); + +	return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ +	return NULL; +} + +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ +		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) + +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ +	return 1; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ +	return NULL; +} + +static inline void +find_matching_se(struct sched_entity **se, struct sched_entity **pse) +{ +} + +#endif	/* CONFIG_FAIR_GROUP_SCHED */ + +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, +				   unsigned long delta_exec); + +/************************************************************** + * Scheduling class tree data structure manipulation methods: + */ + +static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) +{ +	s64 delta = (s64)(vruntime - min_vruntime); +	if (delta > 0) +		min_vruntime = vruntime; + +	return min_vruntime; +} + +static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +{ +	s64 delta = (s64)(vruntime - min_vruntime); +	if (delta < 0) +		min_vruntime = vruntime; + +	return min_vruntime; +} + +static inline int entity_before(struct sched_entity *a, +				struct sched_entity *b) +{ +	return (s64)(a->vruntime - b->vruntime) < 0; +} + +static void update_min_vruntime(struct cfs_rq *cfs_rq) +{ +	u64 vruntime = cfs_rq->min_vruntime; + +	if (cfs_rq->curr) +		vruntime = cfs_rq->curr->vruntime; + +	if (cfs_rq->rb_leftmost) { +		struct sched_entity *se = 
rb_entry(cfs_rq->rb_leftmost, +						   struct sched_entity, +						   run_node); + +		if (!cfs_rq->curr) +			vruntime = se->vruntime; +		else +			vruntime = min_vruntime(vruntime, se->vruntime); +	} + +	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +#ifndef CONFIG_64BIT +	smp_wmb(); +	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif +} + +/* + * Enqueue an entity into the rb-tree: + */ +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; +	struct rb_node *parent = NULL; +	struct sched_entity *entry; +	int leftmost = 1; + +	/* +	 * Find the right place in the rbtree: +	 */ +	while (*link) { +		parent = *link; +		entry = rb_entry(parent, struct sched_entity, run_node); +		/* +		 * We dont care about collisions. Nodes with +		 * the same key stay together. +		 */ +		if (entity_before(se, entry)) { +			link = &parent->rb_left; +		} else { +			link = &parent->rb_right; +			leftmost = 0; +		} +	} + +	/* +	 * Maintain a cache of leftmost tree entries (it is frequently +	 * used): +	 */ +	if (leftmost) +		cfs_rq->rb_leftmost = &se->run_node; + +	rb_link_node(&se->run_node, parent, link); +	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); +} + +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	if (cfs_rq->rb_leftmost == &se->run_node) { +		struct rb_node *next_node; + +		next_node = rb_next(&se->run_node); +		cfs_rq->rb_leftmost = next_node; +	} + +	rb_erase(&se->run_node, &cfs_rq->tasks_timeline); +} + +struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +{ +	struct rb_node *left = cfs_rq->rb_leftmost; + +	if (!left) +		return NULL; + +	return rb_entry(left, struct sched_entity, run_node); +} + +static struct sched_entity *__pick_next_entity(struct sched_entity *se) +{ +	struct rb_node *next = rb_next(&se->run_node); + +	if (!next) +		return NULL; + +	return rb_entry(next, struct sched_entity, run_node); +} + +#ifdef CONFIG_SCHED_DEBUG +struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +{ +	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); + +	if (!last) +		return NULL; + +	return rb_entry(last, struct sched_entity, run_node); +} + +/************************************************************** + * Scheduling class statistics methods: + */ + +int sched_proc_update_handler(struct ctl_table *table, int write, +		void __user *buffer, size_t *lenp, +		loff_t *ppos) +{ +	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); +	int factor = get_update_sysctl_factor(); + +	if (ret || !write) +		return ret; + +	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, +					sysctl_sched_min_granularity); + +#define WRT_SYSCTL(name) \ +	(normalized_sysctl_##name = sysctl_##name / (factor)) +	WRT_SYSCTL(sched_min_granularity); +	WRT_SYSCTL(sched_latency); +	WRT_SYSCTL(sched_wakeup_granularity); +#undef WRT_SYSCTL + +	return 0; +} +#endif + +/* + * delta /= w + */ +static inline unsigned long +calc_delta_fair(unsigned long delta, struct sched_entity *se) +{ +	if (unlikely(se->load.weight != NICE_0_LOAD)) +		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); + +	return delta; +} + +/* + * The idea is to set a period in which each task runs once. + * + * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch + * this period because otherwise the slices get too small. + * + * p = (nr <= nl) ? 
l : l*nr/nl + */ +static u64 __sched_period(unsigned long nr_running) +{ +	u64 period = sysctl_sched_latency; +	unsigned long nr_latency = sched_nr_latency; + +	if (unlikely(nr_running > nr_latency)) { +		period = sysctl_sched_min_granularity; +		period *= nr_running; +	} + +	return period; +} + +/* + * We calculate the wall-time slice from the period by taking a part + * proportional to the weight. + * + * s = p*P[w/rw] + */ +static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); + +	for_each_sched_entity(se) { +		struct load_weight *load; +		struct load_weight lw; + +		cfs_rq = cfs_rq_of(se); +		load = &cfs_rq->load; + +		if (unlikely(!se->on_rq)) { +			lw = cfs_rq->load; + +			update_load_add(&lw, se->load.weight); +			load = &lw; +		} +		slice = calc_delta_mine(slice, se->load.weight, load); +	} +	return slice; +} + +/* + * We calculate the vruntime slice of a to be inserted task + * + * vs = s/w + */ +static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	return calc_delta_fair(sched_slice(cfs_rq, se), se); +} + +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); +static void update_cfs_shares(struct cfs_rq *cfs_rq); + +/* + * Update the current task's runtime statistics. Skip current tasks that + * are not in our scheduling class. + */ +static inline void +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, +	      unsigned long delta_exec) +{ +	unsigned long delta_exec_weighted; + +	schedstat_set(curr->statistics.exec_max, +		      max((u64)delta_exec, curr->statistics.exec_max)); + +	curr->sum_exec_runtime += delta_exec; +	schedstat_add(cfs_rq, exec_clock, delta_exec); +	delta_exec_weighted = calc_delta_fair(delta_exec, curr); + +	curr->vruntime += delta_exec_weighted; +	update_min_vruntime(cfs_rq); + +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +	cfs_rq->load_unacc_exec_time += delta_exec; +#endif +} + +static void update_curr(struct cfs_rq *cfs_rq) +{ +	struct sched_entity *curr = cfs_rq->curr; +	u64 now = rq_of(cfs_rq)->clock_task; +	unsigned long delta_exec; + +	if (unlikely(!curr)) +		return; + +	/* +	 * Get the amount of time the current task was running +	 * since the last time we changed load (this cannot +	 * overflow on 32 bits): +	 */ +	delta_exec = (unsigned long)(now - curr->exec_start); +	if (!delta_exec) +		return; + +	__update_curr(cfs_rq, curr, delta_exec); +	curr->exec_start = now; + +	if (entity_is_task(curr)) { +		struct task_struct *curtask = task_of(curr); + +		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); +		cpuacct_charge(curtask, delta_exec); +		account_group_exec_runtime(curtask, delta_exec); +	} + +	account_cfs_rq_runtime(cfs_rq, delta_exec); +} + +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); +} + +/* + * Task is being enqueued - update stats: + */ +static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	/* +	 * Are we enqueueing a waiting task? 
(for current tasks +	 * a dequeue/enqueue event is a NOP) +	 */ +	if (se != cfs_rq->curr) +		update_stats_wait_start(cfs_rq, se); +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, +			rq_of(cfs_rq)->clock - se->statistics.wait_start)); +	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); +	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + +			rq_of(cfs_rq)->clock - se->statistics.wait_start); +#ifdef CONFIG_SCHEDSTATS +	if (entity_is_task(se)) { +		trace_sched_stat_wait(task_of(se), +			rq_of(cfs_rq)->clock - se->statistics.wait_start); +	} +#endif +	schedstat_set(se->statistics.wait_start, 0); +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	/* +	 * Mark the end of the wait period if dequeueing a +	 * waiting task: +	 */ +	if (se != cfs_rq->curr) +		update_stats_wait_end(cfs_rq, se); +} + +/* + * We are picking a new current task - update its stats: + */ +static inline void +update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	/* +	 * We are starting a new run period: +	 */ +	se->exec_start = rq_of(cfs_rq)->clock_task; +} + +/************************************************** + * Scheduling class queueing methods: + */ + +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +	cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	update_load_add(&cfs_rq->load, se->load.weight); +	if (!parent_entity(se)) +		update_load_add(&rq_of(cfs_rq)->load, se->load.weight); +	if (entity_is_task(se)) { +		add_cfs_task_weight(cfs_rq, se->load.weight); +		list_add(&se->group_node, &cfs_rq->tasks); +	} +	cfs_rq->nr_running++; +} + +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	update_load_sub(&cfs_rq->load, se->load.weight); +	if (!parent_entity(se)) +		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); +	if (entity_is_task(se)) { +		add_cfs_task_weight(cfs_rq, -se->load.weight); +		list_del_init(&se->group_node); +	} +	cfs_rq->nr_running--; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* we need this in update_cfs_load and load-balance functions below */ +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); +# ifdef CONFIG_SMP +static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, +					    int global_update) +{ +	struct task_group *tg = cfs_rq->tg; +	long load_avg; + +	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); +	load_avg -= cfs_rq->load_contribution; + +	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { +		atomic_add(load_avg, &tg->load_weight); +		cfs_rq->load_contribution += load_avg; +	} +} + +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +	u64 period = sysctl_sched_shares_window; +	u64 now, delta; +	unsigned long load = cfs_rq->load.weight; + +	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) +		return; + +	now = rq_of(cfs_rq)->clock_task; +	delta = now - cfs_rq->load_stamp; + +	/* truncate load history at 4 idle periods */ +	if (cfs_rq->load_stamp > cfs_rq->load_last && +	    now - cfs_rq->load_last > 4 * period) { +		cfs_rq->load_period = 0; +		cfs_rq->load_avg = 0; +		
delta = period - 1; +	} + +	cfs_rq->load_stamp = now; +	cfs_rq->load_unacc_exec_time = 0; +	cfs_rq->load_period += delta; +	if (load) { +		cfs_rq->load_last = now; +		cfs_rq->load_avg += delta * load; +	} + +	/* consider updating load contribution on each fold or truncate */ +	if (global_update || cfs_rq->load_period > period +	    || !cfs_rq->load_period) +		update_cfs_rq_load_contribution(cfs_rq, global_update); + +	while (cfs_rq->load_period > period) { +		/* +		 * Inline assembly required to prevent the compiler +		 * optimising this loop into a divmod call. +		 * See __iter_div_u64_rem() for another example of this. +		 */ +		asm("" : "+rm" (cfs_rq->load_period)); +		cfs_rq->load_period /= 2; +		cfs_rq->load_avg /= 2; +	} + +	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) +		list_del_leaf_cfs_rq(cfs_rq); +} + +static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +{ +	long tg_weight; + +	/* +	 * Use this CPU's actual weight instead of the last load_contribution +	 * to gain a more accurate current total weight. See +	 * update_cfs_rq_load_contribution(). +	 */ +	tg_weight = atomic_read(&tg->load_weight); +	tg_weight -= cfs_rq->load_contribution; +	tg_weight += cfs_rq->load.weight; + +	return tg_weight; +} + +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) +{ +	long tg_weight, load, shares; + +	tg_weight = calc_tg_weight(tg, cfs_rq); +	load = cfs_rq->load.weight; + +	shares = (tg->shares * load); +	if (tg_weight) +		shares /= tg_weight; + +	if (shares < MIN_SHARES) +		shares = MIN_SHARES; +	if (shares > tg->shares) +		shares = tg->shares; + +	return shares; +} + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { +		update_cfs_load(cfs_rq, 0); +		update_cfs_shares(cfs_rq); +	} +} +# else /* CONFIG_SMP */ +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +} + +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) +{ +	return tg->shares; +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +# endif /* CONFIG_SMP */ +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +			    unsigned long weight) +{ +	if (se->on_rq) { +		/* commit outstanding execution time */ +		if (cfs_rq->curr == se) +			update_curr(cfs_rq); +		account_entity_dequeue(cfs_rq, se); +	} + +	update_load_set(&se->load, weight); + +	if (se->on_rq) +		account_entity_enqueue(cfs_rq, se); +} + +static void update_cfs_shares(struct cfs_rq *cfs_rq) +{ +	struct task_group *tg; +	struct sched_entity *se; +	long shares; + +	tg = cfs_rq->tg; +	se = tg->se[cpu_of(rq_of(cfs_rq))]; +	if (!se || throttled_hierarchy(cfs_rq)) +		return; +#ifndef CONFIG_SMP +	if (likely(se->load.weight == tg->shares)) +		return; +#endif +	shares = calc_cfs_shares(cfs_rq, tg); + +	reweight_entity(cfs_rq_of(se), se, shares); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +} + +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +{ +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHEDSTATS +	struct task_struct *tsk = NULL; + +	if (entity_is_task(se)) +		tsk = task_of(se); + +	if (se->statistics.sleep_start) { +		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; + 
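+		/*
+		 * sleep_start was stamped with the clock of the rq the task
+		 * went to sleep on; after a wakeup on a different cpu our rq
+		 * clock may lag that value, so delta can come out negative
+		 * and is clamped to zero below.
+		 */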
+		if ((s64)delta < 0) +			delta = 0; + +		if (unlikely(delta > se->statistics.sleep_max)) +			se->statistics.sleep_max = delta; + +		se->statistics.sum_sleep_runtime += delta; + +		if (tsk) { +			account_scheduler_latency(tsk, delta >> 10, 1); +			trace_sched_stat_sleep(tsk, delta); +		} +	} +	if (se->statistics.block_start) { +		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; + +		if ((s64)delta < 0) +			delta = 0; + +		if (unlikely(delta > se->statistics.block_max)) +			se->statistics.block_max = delta; + +		se->statistics.sum_sleep_runtime += delta; + +		if (tsk) { +			if (tsk->in_iowait) { +				se->statistics.iowait_sum += delta; +				se->statistics.iowait_count++; +				trace_sched_stat_iowait(tsk, delta); +			} + +			trace_sched_stat_blocked(tsk, delta); + +			/* +			 * Blocking time is in units of nanosecs, so shift by +			 * 20 to get a milliseconds-range estimation of the +			 * amount of time that the task spent sleeping: +			 */ +			if (unlikely(prof_on == SLEEP_PROFILING)) { +				profile_hits(SLEEP_PROFILING, +						(void *)get_wchan(tsk), +						delta >> 20); +			} +			account_scheduler_latency(tsk, delta >> 10, 0); +		} +	} +#endif +} + +static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG +	s64 d = se->vruntime - cfs_rq->min_vruntime; + +	if (d < 0) +		d = -d; + +	if (d > 3*sysctl_sched_latency) +		schedstat_inc(cfs_rq, nr_spread_over); +#endif +} + +static void +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +{ +	u64 vruntime = cfs_rq->min_vruntime; + +	/* +	 * The 'current' period is already promised to the current tasks, +	 * however the extra weight of the new task will slow them down a +	 * little, place the new task so that it fits in the slot that +	 * stays open at the end. +	 */ +	if (initial && sched_feat(START_DEBIT)) +		vruntime += sched_vslice(cfs_rq, se); + +	/* sleeps up to a single latency don't count. */ +	if (!initial) { +		unsigned long thresh = sysctl_sched_latency; + +		/* +		 * Halve their sleep time's effect, to allow +		 * for a gentler effect of sleepers: +		 */ +		if (sched_feat(GENTLE_FAIR_SLEEPERS)) +			thresh >>= 1; + +		vruntime -= thresh; +	} + +	/* ensure we never gain time by being placed backwards. */ +	vruntime = max_vruntime(se->vruntime, vruntime); + +	se->vruntime = vruntime; +} + +static void check_enqueue_throttle(struct cfs_rq *cfs_rq); + +static void +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ +	/* +	 * Update the normalized vruntime before updating min_vruntime +	 * through callig update_curr(). +	 */ +	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) +		se->vruntime += cfs_rq->min_vruntime; + +	/* +	 * Update run-time statistics of the 'current'. 
+	 */ +	update_curr(cfs_rq); +	update_cfs_load(cfs_rq, 0); +	account_entity_enqueue(cfs_rq, se); +	update_cfs_shares(cfs_rq); + +	if (flags & ENQUEUE_WAKEUP) { +		place_entity(cfs_rq, se, 0); +		enqueue_sleeper(cfs_rq, se); +	} + +	update_stats_enqueue(cfs_rq, se); +	check_spread(cfs_rq, se); +	if (se != cfs_rq->curr) +		__enqueue_entity(cfs_rq, se); +	se->on_rq = 1; + +	if (cfs_rq->nr_running == 1) { +		list_add_leaf_cfs_rq(cfs_rq); +		check_enqueue_throttle(cfs_rq); +	} +} + +static void __clear_buddies_last(struct sched_entity *se) +{ +	for_each_sched_entity(se) { +		struct cfs_rq *cfs_rq = cfs_rq_of(se); +		if (cfs_rq->last == se) +			cfs_rq->last = NULL; +		else +			break; +	} +} + +static void __clear_buddies_next(struct sched_entity *se) +{ +	for_each_sched_entity(se) { +		struct cfs_rq *cfs_rq = cfs_rq_of(se); +		if (cfs_rq->next == se) +			cfs_rq->next = NULL; +		else +			break; +	} +} + +static void __clear_buddies_skip(struct sched_entity *se) +{ +	for_each_sched_entity(se) { +		struct cfs_rq *cfs_rq = cfs_rq_of(se); +		if (cfs_rq->skip == se) +			cfs_rq->skip = NULL; +		else +			break; +	} +} + +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	if (cfs_rq->last == se) +		__clear_buddies_last(se); + +	if (cfs_rq->next == se) +		__clear_buddies_next(se); + +	if (cfs_rq->skip == se) +		__clear_buddies_skip(se); +} + +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +static void +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ +	/* +	 * Update run-time statistics of the 'current'. +	 */ +	update_curr(cfs_rq); + +	update_stats_dequeue(cfs_rq, se); +	if (flags & DEQUEUE_SLEEP) { +#ifdef CONFIG_SCHEDSTATS +		if (entity_is_task(se)) { +			struct task_struct *tsk = task_of(se); + +			if (tsk->state & TASK_INTERRUPTIBLE) +				se->statistics.sleep_start = rq_of(cfs_rq)->clock; +			if (tsk->state & TASK_UNINTERRUPTIBLE) +				se->statistics.block_start = rq_of(cfs_rq)->clock; +		} +#endif +	} + +	clear_buddies(cfs_rq, se); + +	if (se != cfs_rq->curr) +		__dequeue_entity(cfs_rq, se); +	se->on_rq = 0; +	update_cfs_load(cfs_rq, 0); +	account_entity_dequeue(cfs_rq, se); + +	/* +	 * Normalize the entity after updating the min_vruntime because the +	 * update can refer to the ->curr item and we need to reflect this +	 * movement in our normalized position. +	 */ +	if (!(flags & DEQUEUE_SLEEP)) +		se->vruntime -= cfs_rq->min_vruntime; + +	/* return excess runtime on last dequeue */ +	return_cfs_rq_runtime(cfs_rq); + +	update_min_vruntime(cfs_rq); +	update_cfs_shares(cfs_rq); +} + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ +	unsigned long ideal_runtime, delta_exec; +	struct sched_entity *se; +	s64 delta; + +	ideal_runtime = sched_slice(cfs_rq, curr); +	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; +	if (delta_exec > ideal_runtime) { +		resched_task(rq_of(cfs_rq)->curr); +		/* +		 * The current task ran long enough, ensure it doesn't get +		 * re-elected due to buddy favours. +		 */ +		clear_buddies(cfs_rq, curr); +		return; +	} + +	/* +	 * Ensure that a task that missed wakeup preemption by a +	 * narrow margin doesn't have to wait for a full slice. +	 * This also mitigates buddy induced latencies under load. 
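+	 *
+	 * Concretely: once current has run at least sched_min_granularity
+	 * we compare its vruntime with the leftmost waiter below; if current
+	 * is more than one ideal slice ahead, it is preempted early rather
+	 * than being allowed to finish its full slice.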
+	 */ +	if (delta_exec < sysctl_sched_min_granularity) +		return; + +	se = __pick_first_entity(cfs_rq); +	delta = curr->vruntime - se->vruntime; + +	if (delta < 0) +		return; + +	if (delta > ideal_runtime) +		resched_task(rq_of(cfs_rq)->curr); +} + +static void +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +	/* 'current' is not kept within the tree. */ +	if (se->on_rq) { +		/* +		 * Any task has to be enqueued before it get to execute on +		 * a CPU. So account for the time it spent waiting on the +		 * runqueue. +		 */ +		update_stats_wait_end(cfs_rq, se); +		__dequeue_entity(cfs_rq, se); +	} + +	update_stats_curr_start(cfs_rq, se); +	cfs_rq->curr = se; +#ifdef CONFIG_SCHEDSTATS +	/* +	 * Track our maximum slice length, if the CPU's load is at +	 * least twice that of our own weight (i.e. dont track it +	 * when there are only lesser-weight tasks around): +	 */ +	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { +		se->statistics.slice_max = max(se->statistics.slice_max, +			se->sum_exec_runtime - se->prev_sum_exec_runtime); +	} +#endif +	se->prev_sum_exec_runtime = se->sum_exec_runtime; +} + +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + +/* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups + * 2) pick the "next" process, since someone really wants that to run + * 3) pick the "last" process, for cache locality + * 4) do not run the "skip" process, if something else is available + */ +static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +{ +	struct sched_entity *se = __pick_first_entity(cfs_rq); +	struct sched_entity *left = se; + +	/* +	 * Avoid running the skip buddy, if running something else can +	 * be done without getting too unfair. +	 */ +	if (cfs_rq->skip == se) { +		struct sched_entity *second = __pick_next_entity(se); +		if (second && wakeup_preempt_entity(second, left) < 1) +			se = second; +	} + +	/* +	 * Prefer last buddy, try to return the CPU to a preempted task. +	 */ +	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) +		se = cfs_rq->last; + +	/* +	 * Someone really wants this to run. If it's not unfair, run it. +	 */ +	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) +		se = cfs_rq->next; + +	clear_buddies(cfs_rq, se); + +	return se; +} + +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) +{ +	/* +	 * If still on the runqueue then deactivate_task() +	 * was not called and update_curr() has to be done: +	 */ +	if (prev->on_rq) +		update_curr(cfs_rq); + +	/* throttle cfs_rqs exceeding runtime */ +	check_cfs_rq_runtime(cfs_rq); + +	check_spread(cfs_rq, prev); +	if (prev->on_rq) { +		update_stats_wait_start(cfs_rq, prev); +		/* Put 'current' back into the tree. */ +		__enqueue_entity(cfs_rq, prev); +	} +	cfs_rq->curr = NULL; +} + +static void +entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +{ +	/* +	 * Update run-time statistics of the 'current'. +	 */ +	update_curr(cfs_rq); + +	/* +	 * Update share accounting for long-running entities. +	 */ +	update_entity_shares_tick(cfs_rq); + +#ifdef CONFIG_SCHED_HRTICK +	/* +	 * queued ticks are scheduled to match the slice, so don't bother +	 * validating it and just reschedule. 
+	 */ +	if (queued) { +		resched_task(rq_of(cfs_rq)->curr); +		return; +	} +	/* +	 * don't let the period tick interfere with the hrtick preemption +	 */ +	if (!sched_feat(DOUBLE_TICK) && +			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) +		return; +#endif + +	if (cfs_rq->nr_running > 1) +		check_preempt_tick(cfs_rq, curr); +} + + +/************************************************** + * CFS bandwidth control machinery + */ + +#ifdef CONFIG_CFS_BANDWIDTH + +#ifdef HAVE_JUMP_LABEL +static struct jump_label_key __cfs_bandwidth_used; + +static inline bool cfs_bandwidth_used(void) +{ +	return static_branch(&__cfs_bandwidth_used); +} + +void account_cfs_bandwidth_used(int enabled, int was_enabled) +{ +	/* only need to count groups transitioning between enabled/!enabled */ +	if (enabled && !was_enabled) +		jump_label_inc(&__cfs_bandwidth_used); +	else if (!enabled && was_enabled) +		jump_label_dec(&__cfs_bandwidth_used); +} +#else /* HAVE_JUMP_LABEL */ +static bool cfs_bandwidth_used(void) +{ +	return true; +} + +void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +#endif /* HAVE_JUMP_LABEL */ + +/* + * default period for cfs group bandwidth. + * default: 0.1s, units: nanoseconds + */ +static inline u64 default_cfs_period(void) +{ +	return 100000000ULL; +} + +static inline u64 sched_cfs_bandwidth_slice(void) +{ +	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; +} + +/* + * Replenish runtime according to assigned quota and update expiration time. + * We use sched_clock_cpu directly instead of rq->clock to avoid adding + * additional synchronization around rq->lock. + * + * requires cfs_b->lock + */ +void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) +{ +	u64 now; + +	if (cfs_b->quota == RUNTIME_INF) +		return; + +	now = sched_clock_cpu(smp_processor_id()); +	cfs_b->runtime = cfs_b->quota; +	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); +} + +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ +	return &tg->cfs_bandwidth; +} + +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ +	struct task_group *tg = cfs_rq->tg; +	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); +	u64 amount = 0, min_amount, expires; + +	/* note: this is a positive sum as runtime_remaining <= 0 */ +	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + +	raw_spin_lock(&cfs_b->lock); +	if (cfs_b->quota == RUNTIME_INF) +		amount = min_amount; +	else { +		/* +		 * If the bandwidth pool has become inactive, then at least one +		 * period must have elapsed since the last consumption. +		 * Refresh the global state and ensure bandwidth timer becomes +		 * active. +		 */ +		if (!cfs_b->timer_active) { +			__refill_cfs_bandwidth_runtime(cfs_b); +			__start_cfs_bandwidth(cfs_b); +		} + +		if (cfs_b->runtime > 0) { +			amount = min(cfs_b->runtime, min_amount); +			cfs_b->runtime -= amount; +			cfs_b->idle = 0; +		} +	} +	expires = cfs_b->runtime_expires; +	raw_spin_unlock(&cfs_b->lock); + +	cfs_rq->runtime_remaining += amount; +	/* +	 * we may have advanced our local expiration to account for allowed +	 * spread between our sched_clock and the one on which runtime was +	 * issued. +	 */ +	if ((s64)(expires - cfs_rq->runtime_expires) > 0) +		cfs_rq->runtime_expires = expires; + +	return cfs_rq->runtime_remaining > 0; +} + +/* + * Note: This depends on the synchronization provided by sched_clock and the + * fact that rq->clock snapshots this value. 
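+ *
+ * In short: if the local deadline has passed but the global deadline in
+ * cfs_b has not moved on, our clock is merely running ahead and the local
+ * deadline is nudged forward by a tick; if the global deadline has
+ * advanced, the remaining local runtime really has expired.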
+ */
+static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct rq *rq = rq_of(cfs_rq);
+
+	/* if the deadline is ahead of our clock, nothing to do */
+	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+		return;
+
+	if (cfs_rq->runtime_remaining < 0)
+		return;
+
+	/*
+	 * If the local deadline has passed we have to consider the
+	 * possibility that our sched_clock is 'fast' and the global deadline
+	 * has not truly expired.
+	 *
+	 * Fortunately we can determine whether this is the case by checking
+	 * whether the global deadline has advanced.
+	 */
+
+	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+		/* extend local deadline, drift is bounded above by 2 ticks */
+		cfs_rq->runtime_expires += TICK_NSEC;
+	} else {
+		/* global deadline is ahead, expiration has passed */
+		cfs_rq->runtime_remaining = 0;
+	}
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec)
+{
+	/* dock delta_exec before expiring quota (as it could span periods) */
+	cfs_rq->runtime_remaining -= delta_exec;
+	expire_cfs_rq_runtime(cfs_rq);
+
+	if (likely(cfs_rq->runtime_remaining > 0))
+		return;
+
+	/*
+	 * if we're unable to extend our runtime we resched so that the active
+	 * hierarchy can be throttled
+	 */
+	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
+		resched_task(rq_of(cfs_rq)->curr);
+}
+
+static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+						   unsigned long delta_exec)
+{
+	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
+		return;
+
+	__account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->throttled;
+}
+
+/* check whether cfs_rq, or any parent, is throttled */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->throttle_count;
+}
+
+/*
+ * Ensure that neither of the group entities corresponding to src_cpu or
+ * dest_cpu are members of a throttled hierarchy when performing group
+ * load-balance operations.
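+ *
+ * Callers in the load-balance paths treat a non-zero return as "leave
+ * this src/dest pair alone for the group", i.e. tasks are not moved out
+ * of or into a throttled hierarchy.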
+ */ +static inline int throttled_lb_pair(struct task_group *tg, +				    int src_cpu, int dest_cpu) +{ +	struct cfs_rq *src_cfs_rq, *dest_cfs_rq; + +	src_cfs_rq = tg->cfs_rq[src_cpu]; +	dest_cfs_rq = tg->cfs_rq[dest_cpu]; + +	return throttled_hierarchy(src_cfs_rq) || +	       throttled_hierarchy(dest_cfs_rq); +} + +/* updated child weight may affect parent so we have to do this bottom up */ +static int tg_unthrottle_up(struct task_group *tg, void *data) +{ +	struct rq *rq = data; +	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + +	cfs_rq->throttle_count--; +#ifdef CONFIG_SMP +	if (!cfs_rq->throttle_count) { +		u64 delta = rq->clock_task - cfs_rq->load_stamp; + +		/* leaving throttled state, advance shares averaging windows */ +		cfs_rq->load_stamp += delta; +		cfs_rq->load_last += delta; + +		/* update entity weight now that we are on_rq again */ +		update_cfs_shares(cfs_rq); +	} +#endif + +	return 0; +} + +static int tg_throttle_down(struct task_group *tg, void *data) +{ +	struct rq *rq = data; +	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + +	/* group is entering throttled state, record last load */ +	if (!cfs_rq->throttle_count) +		update_cfs_load(cfs_rq, 0); +	cfs_rq->throttle_count++; + +	return 0; +} + +static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +{ +	struct rq *rq = rq_of(cfs_rq); +	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); +	struct sched_entity *se; +	long task_delta, dequeue = 1; + +	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + +	/* account load preceding throttle */ +	rcu_read_lock(); +	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); +	rcu_read_unlock(); + +	task_delta = cfs_rq->h_nr_running; +	for_each_sched_entity(se) { +		struct cfs_rq *qcfs_rq = cfs_rq_of(se); +		/* throttled entity or throttle-on-deactivate */ +		if (!se->on_rq) +			break; + +		if (dequeue) +			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); +		qcfs_rq->h_nr_running -= task_delta; + +		if (qcfs_rq->load.weight) +			dequeue = 0; +	} + +	if (!se) +		rq->nr_running -= task_delta; + +	cfs_rq->throttled = 1; +	cfs_rq->throttled_timestamp = rq->clock; +	raw_spin_lock(&cfs_b->lock); +	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); +	raw_spin_unlock(&cfs_b->lock); +} + +void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) +{ +	struct rq *rq = rq_of(cfs_rq); +	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); +	struct sched_entity *se; +	int enqueue = 1; +	long task_delta; + +	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + +	cfs_rq->throttled = 0; +	raw_spin_lock(&cfs_b->lock); +	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; +	list_del_rcu(&cfs_rq->throttled_list); +	raw_spin_unlock(&cfs_b->lock); +	cfs_rq->throttled_timestamp = 0; + +	update_rq_clock(rq); +	/* update hierarchical throttle state */ +	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + +	if (!cfs_rq->load.weight) +		return; + +	task_delta = cfs_rq->h_nr_running; +	for_each_sched_entity(se) { +		if (se->on_rq) +			enqueue = 0; + +		cfs_rq = cfs_rq_of(se); +		if (enqueue) +			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); +		cfs_rq->h_nr_running += task_delta; + +		if (cfs_rq_throttled(cfs_rq)) +			break; +	} + +	if (!se) +		rq->nr_running += task_delta; + +	/* determine whether we need to wake up potentially idle cpu */ +	if (rq->curr == rq->idle && rq->cfs.nr_running) +		resched_task(rq->curr); +} + +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, +		u64 remaining, u64 expires) +{ +	struct cfs_rq *cfs_rq; +	u64 runtime = 
remaining; + +	rcu_read_lock(); +	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, +				throttled_list) { +		struct rq *rq = rq_of(cfs_rq); + +		raw_spin_lock(&rq->lock); +		if (!cfs_rq_throttled(cfs_rq)) +			goto next; + +		runtime = -cfs_rq->runtime_remaining + 1; +		if (runtime > remaining) +			runtime = remaining; +		remaining -= runtime; + +		cfs_rq->runtime_remaining += runtime; +		cfs_rq->runtime_expires = expires; + +		/* we check whether we're throttled above */ +		if (cfs_rq->runtime_remaining > 0) +			unthrottle_cfs_rq(cfs_rq); + +next: +		raw_spin_unlock(&rq->lock); + +		if (!remaining) +			break; +	} +	rcu_read_unlock(); + +	return remaining; +} + +/* + * Responsible for refilling a task_group's bandwidth and unthrottling its + * cfs_rqs as appropriate. If there has been no activity within the last + * period the timer is deactivated until scheduling resumes; cfs_b->idle is + * used to track this state. + */ +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +{ +	u64 runtime, runtime_expires; +	int idle = 1, throttled; + +	raw_spin_lock(&cfs_b->lock); +	/* no need to continue the timer with no bandwidth constraint */ +	if (cfs_b->quota == RUNTIME_INF) +		goto out_unlock; + +	throttled = !list_empty(&cfs_b->throttled_cfs_rq); +	/* idle depends on !throttled (for the case of a large deficit) */ +	idle = cfs_b->idle && !throttled; +	cfs_b->nr_periods += overrun; + +	/* if we're going inactive then everything else can be deferred */ +	if (idle) +		goto out_unlock; + +	__refill_cfs_bandwidth_runtime(cfs_b); + +	if (!throttled) { +		/* mark as potentially idle for the upcoming period */ +		cfs_b->idle = 1; +		goto out_unlock; +	} + +	/* account preceding periods in which throttling occurred */ +	cfs_b->nr_throttled += overrun; + +	/* +	 * There are throttled entities so we must first use the new bandwidth +	 * to unthrottle them before making it generally available.  This +	 * ensures that all existing debts will be paid before a new cfs_rq is +	 * allowed to run. +	 */ +	runtime = cfs_b->runtime; +	runtime_expires = cfs_b->runtime_expires; +	cfs_b->runtime = 0; + +	/* +	 * This check is repeated as we are holding onto the new bandwidth +	 * while we unthrottle.  This can potentially race with an unthrottled +	 * group trying to acquire new bandwidth from the global pool. +	 */ +	while (throttled && runtime > 0) { +		raw_spin_unlock(&cfs_b->lock); +		/* we can't nest cfs_b->lock while distributing bandwidth */ +		runtime = distribute_cfs_runtime(cfs_b, runtime, +						 runtime_expires); +		raw_spin_lock(&cfs_b->lock); + +		throttled = !list_empty(&cfs_b->throttled_cfs_rq); +	} + +	/* return (any) remaining runtime */ +	cfs_b->runtime = runtime; +	/* +	 * While we are ensured activity in the period following an +	 * unthrottle, this also covers the case in which the new bandwidth is +	 * insufficient to cover the existing bandwidth deficit.  (Forcing the +	 * timer to remain active while there are any throttled entities.) 
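+	 *
+	 * E.g. if the refill for this period is smaller than what throttled
+	 * cfs_rqs are still owed, some of them remain throttled; clearing
+	 * ->idle here keeps the period timer alive so the next refill can
+	 * keep paying down that deficit.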
+	 */ +	cfs_b->idle = 0; +out_unlock: +	if (idle) +		cfs_b->timer_active = 0; +	raw_spin_unlock(&cfs_b->lock); + +	return idle; +} + +/* a cfs_rq won't donate quota below this amount */ +static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; +/* minimum remaining period time to redistribute slack quota */ +static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; +/* how long we wait to gather additional slack before distributing */ +static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; + +/* are we near the end of the current quota period? */ +static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) +{ +	struct hrtimer *refresh_timer = &cfs_b->period_timer; +	u64 remaining; + +	/* if the call-back is running a quota refresh is already occurring */ +	if (hrtimer_callback_running(refresh_timer)) +		return 1; + +	/* is a quota refresh about to occur? */ +	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); +	if (remaining < min_expire) +		return 1; + +	return 0; +} + +static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) +{ +	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; + +	/* if there's a quota refresh soon don't bother with slack */ +	if (runtime_refresh_within(cfs_b, min_left)) +		return; + +	start_bandwidth_timer(&cfs_b->slack_timer, +				ns_to_ktime(cfs_bandwidth_slack_period)); +} + +/* we know any runtime found here is valid as update_curr() precedes return */ +static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ +	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); +	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; + +	if (slack_runtime <= 0) +		return; + +	raw_spin_lock(&cfs_b->lock); +	if (cfs_b->quota != RUNTIME_INF && +	    cfs_rq->runtime_expires == cfs_b->runtime_expires) { +		cfs_b->runtime += slack_runtime; + +		/* we are under rq->lock, defer unthrottling using a timer */ +		if (cfs_b->runtime > sched_cfs_bandwidth_slice() && +		    !list_empty(&cfs_b->throttled_cfs_rq)) +			start_cfs_slack_bandwidth(cfs_b); +	} +	raw_spin_unlock(&cfs_b->lock); + +	/* even if it's not valid for return we don't want to try again */ +	cfs_rq->runtime_remaining -= slack_runtime; +} + +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ +	if (!cfs_bandwidth_used()) +		return; + +	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) +		return; + +	__return_cfs_rq_runtime(cfs_rq); +} + +/* + * This is done with a timer (instead of inline with bandwidth return) since + * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. 
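+ *
+ * Rough timeline, using the constants above: a cfs_rq returning quota keeps
+ * min_cfs_rq_runtime (1ms) for itself; if the global pool then holds more
+ * than one slice and throttled groups exist, the slack timer is armed for
+ * cfs_bandwidth_slack_period (5ms), unless the period timer is due to fire
+ * within cfs_bandwidth_slack_period + min_bandwidth_expiration (7ms).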
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+	u64 expires;
+
+	/* confirm we're still not at a refresh boundary */
+	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+		runtime = cfs_b->runtime;
+		cfs_b->runtime = 0;
+	}
+	expires = cfs_b->runtime_expires;
+	raw_spin_unlock(&cfs_b->lock);
+
+	if (!runtime)
+		return;
+
+	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+	raw_spin_lock(&cfs_b->lock);
+	if (expires == cfs_b->runtime_expires)
+		cfs_b->runtime = runtime;
+	raw_spin_unlock(&cfs_b->lock);
+}
+
+/*
+ * When a group wakes up we want to make sure that its quota is not already
+ * expired/exceeded, otherwise it may be allowed to steal additional ticks of
+ * runtime as update_curr() throttling cannot trigger until it's on-rq.
+ */
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_bandwidth_used())
+		return;
+
+	/* an active group must be handled by the update_curr()->put() path */
+	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+		return;
+
+	/* ensure the group is not already throttled */
+	if (cfs_rq_throttled(cfs_rq))
+		return;
+
+	/* update runtime allocation */
+	account_cfs_rq_runtime(cfs_rq, 0);
+	if (cfs_rq->runtime_remaining <= 0)
+		throttle_cfs_rq(cfs_rq);
+}
+
+/* conditionally throttle active cfs_rq's from put_prev_entity() */
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_bandwidth_used())
+		return;
+
+	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+		return;
+
+	/*
+	 * it's possible for a throttled entity to be forced into a running
+	 * state (e.g. set_curr_task); in this case we're finished.
+	 */
+	if (cfs_rq_throttled(cfs_rq))
+		return;
+
+	throttle_cfs_rq(cfs_rq);
+}
+
+static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, slack_timer);
+	do_sched_cfs_slack_timer(cfs_b);
+
+	return HRTIMER_NORESTART;
+}
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, period_timer);
+	ktime_t now;
+	int overrun;
+	int idle = 0;
+
+	for (;;) {
+		now = hrtimer_cb_get_time(timer);
+		overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+		if (!overrun)
+			break;
+
+		idle = do_sched_cfs_period_timer(cfs_b, overrun);
+	}
+
+	return idle ?
HRTIMER_NORESTART : HRTIMER_RESTART; +} + +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ +	raw_spin_lock_init(&cfs_b->lock); +	cfs_b->runtime = 0; +	cfs_b->quota = RUNTIME_INF; +	cfs_b->period = ns_to_ktime(default_cfs_period()); + +	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); +	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	cfs_b->period_timer.function = sched_cfs_period_timer; +	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	cfs_b->slack_timer.function = sched_cfs_slack_timer; +} + +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ +	cfs_rq->runtime_enabled = 0; +	INIT_LIST_HEAD(&cfs_rq->throttled_list); +} + +/* requires cfs_b->lock, may release to reprogram timer */ +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ +	/* +	 * The timer may be active because we're trying to set a new bandwidth +	 * period or because we're racing with the tear-down path +	 * (timer_active==0 becomes visible before the hrtimer call-back +	 * terminates).  In either case we ensure that it's re-programmed +	 */ +	while (unlikely(hrtimer_active(&cfs_b->period_timer))) { +		raw_spin_unlock(&cfs_b->lock); +		/* ensure cfs_b->lock is available while we wait */ +		hrtimer_cancel(&cfs_b->period_timer); + +		raw_spin_lock(&cfs_b->lock); +		/* if someone else restarted the timer then we're done */ +		if (cfs_b->timer_active) +			return; +	} + +	cfs_b->timer_active = 1; +	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); +} + +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ +	hrtimer_cancel(&cfs_b->period_timer); +	hrtimer_cancel(&cfs_b->slack_timer); +} + +void unthrottle_offline_cfs_rqs(struct rq *rq) +{ +	struct cfs_rq *cfs_rq; + +	for_each_leaf_cfs_rq(rq, cfs_rq) { +		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + +		if (!cfs_rq->runtime_enabled) +			continue; + +		/* +		 * clock_task is not advancing so we just need to make sure +		 * there's some valid quota amount +		 */ +		cfs_rq->runtime_remaining = cfs_b->quota; +		if (cfs_rq_throttled(cfs_rq)) +			unthrottle_cfs_rq(cfs_rq); +	} +} + +#else /* CONFIG_CFS_BANDWIDTH */ +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, +				     unsigned long delta_exec) {} +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ +	return 0; +} + +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) +{ +	return 0; +} + +static inline int throttled_lb_pair(struct task_group *tg, +				    int src_cpu, int dest_cpu) +{ +	return 0; +} + +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +#endif + +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ +	return NULL; +} +static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +void unthrottle_offline_cfs_rqs(struct rq *rq) {} + +#endif /* CONFIG_CFS_BANDWIDTH */ + +/************************************************** + * CFS operations on tasks: + */ + +#ifdef CONFIG_SCHED_HRTICK +static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ +	struct sched_entity *se = &p->se; +	struct cfs_rq *cfs_rq = cfs_rq_of(se); + +	WARN_ON(task_rq(p) != rq); + +	if (cfs_rq->nr_running > 1) { +		u64 slice = sched_slice(cfs_rq, se); +		u64 ran = se->sum_exec_runtime - 
se->prev_sum_exec_runtime; +		s64 delta = slice - ran; + +		if (delta < 0) { +			if (rq->curr == p) +				resched_task(p); +			return; +		} + +		/* +		 * Don't schedule slices shorter than 10000ns, that just +		 * doesn't make sense. Rely on vruntime for fairness. +		 */ +		if (rq->curr != p) +			delta = max_t(s64, 10000LL, delta); + +		hrtick_start(rq, delta); +	} +} + +/* + * called from enqueue/dequeue and updates the hrtick when the + * current task is from our class and nr_running is low enough + * to matter. + */ +static void hrtick_update(struct rq *rq) +{ +	struct task_struct *curr = rq->curr; + +	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) +		return; + +	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) +		hrtick_start_fair(rq, curr); +} +#else /* !CONFIG_SCHED_HRTICK */ +static inline void +hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ +} + +static inline void hrtick_update(struct rq *rq) +{ +} +#endif + +/* + * The enqueue_task method is called before nr_running is + * increased. Here we update the fair scheduling stats and + * then put the task into the rbtree: + */ +static void +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ +	struct cfs_rq *cfs_rq; +	struct sched_entity *se = &p->se; + +	for_each_sched_entity(se) { +		if (se->on_rq) +			break; +		cfs_rq = cfs_rq_of(se); +		enqueue_entity(cfs_rq, se, flags); + +		/* +		 * end evaluation on encountering a throttled cfs_rq +		 * +		 * note: in the case of encountering a throttled cfs_rq we will +		 * post the final h_nr_running increment below. +		*/ +		if (cfs_rq_throttled(cfs_rq)) +			break; +		cfs_rq->h_nr_running++; + +		flags = ENQUEUE_WAKEUP; +	} + +	for_each_sched_entity(se) { +		cfs_rq = cfs_rq_of(se); +		cfs_rq->h_nr_running++; + +		if (cfs_rq_throttled(cfs_rq)) +			break; + +		update_cfs_load(cfs_rq, 0); +		update_cfs_shares(cfs_rq); +	} + +	if (!se) +		inc_nr_running(rq); +	hrtick_update(rq); +} + +static void set_next_buddy(struct sched_entity *se); + +/* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and + * update the fair scheduling stats: + */ +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ +	struct cfs_rq *cfs_rq; +	struct sched_entity *se = &p->se; +	int task_sleep = flags & DEQUEUE_SLEEP; + +	for_each_sched_entity(se) { +		cfs_rq = cfs_rq_of(se); +		dequeue_entity(cfs_rq, se, flags); + +		/* +		 * end evaluation on encountering a throttled cfs_rq +		 * +		 * note: in the case of encountering a throttled cfs_rq we will +		 * post the final h_nr_running decrement below. +		*/ +		if (cfs_rq_throttled(cfs_rq)) +			break; +		cfs_rq->h_nr_running--; + +		/* Don't dequeue parent if it has other entities besides us */ +		if (cfs_rq->load.weight) { +			/* +			 * Bias pick_next to pick a task from this cfs_rq, as +			 * p is sleeping when it is within its sched_slice. 
+			 */ +			if (task_sleep && parent_entity(se)) +				set_next_buddy(parent_entity(se)); + +			/* avoid re-evaluating load for this entity */ +			se = parent_entity(se); +			break; +		} +		flags |= DEQUEUE_SLEEP; +	} + +	for_each_sched_entity(se) { +		cfs_rq = cfs_rq_of(se); +		cfs_rq->h_nr_running--; + +		if (cfs_rq_throttled(cfs_rq)) +			break; + +		update_cfs_load(cfs_rq, 0); +		update_cfs_shares(cfs_rq); +	} + +	if (!se) +		dec_nr_running(rq); +	hrtick_update(rq); +} + +#ifdef CONFIG_SMP +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ +	return cpu_rq(cpu)->load.weight; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long total = weighted_cpuload(cpu); + +	if (type == 0 || !sched_feat(LB_BIAS)) +		return total; + +	return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long total = weighted_cpuload(cpu); + +	if (type == 0 || !sched_feat(LB_BIAS)) +		return total; + +	return max(rq->cpu_load[type-1], total); +} + +static unsigned long power_of(int cpu) +{ +	return cpu_rq(cpu)->cpu_power; +} + +static unsigned long cpu_avg_load_per_task(int cpu) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + +	if (nr_running) +		return rq->load.weight / nr_running; + +	return 0; +} + + +static void task_waking_fair(struct task_struct *p) +{ +	struct sched_entity *se = &p->se; +	struct cfs_rq *cfs_rq = cfs_rq_of(se); +	u64 min_vruntime; + +#ifndef CONFIG_64BIT +	u64 min_vruntime_copy; + +	do { +		min_vruntime_copy = cfs_rq->min_vruntime_copy; +		smp_rmb(); +		min_vruntime = cfs_rq->min_vruntime; +	} while (min_vruntime != min_vruntime_copy); +#else +	min_vruntime = cfs_rq->min_vruntime; +#endif + +	se->vruntime -= min_vruntime; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * effective_load() calculates the load change as seen from the root_task_group + * + * Adding load to a group doesn't make a group heavier, but can cause movement + * of group shares between cpus. Assuming the shares were perfectly aligned one + * can calculate the shift in shares. + * + * Calculate the effective load difference if @wl is added (subtracted) to @tg + * on this @cpu and results in a total addition (subtraction) of @wg to the + * total group weight. + * + * Given a runqueue weight distribution (rw_i) we can compute a shares + * distribution (s_i) using: + * + *   s_i = rw_i / \Sum rw_j						(1) + * + * Suppose we have 4 CPUs and our @tg is a direct child of the root group and + * has 7 equal weight tasks, distributed as below (rw_i), with the resulting + * shares distribution (s_i): + * + *   rw_i = {   2,   4,   1,   0 } + *   s_i  = { 2/7, 4/7, 1/7,   0 } + * + * As per wake_affine() we're interested in the load of two CPUs (the CPU the + * task used to run on and the CPU the waker is running on), we need to + * compute the effect of waking a task on either CPU and, in case of a sync + * wakeup, compute the effect of the current task going to sleep. 
+ * + * So for a change of @wl to the local @cpu with an overall group weight change + * of @wl we can compute the new shares distribution (s'_i) using: + * + *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2) + * + * Suppose we're interested in CPUs 0 and 1, and want to compute the load + * differences in waking a task to CPU 0. The additional task changes the + * weight and shares distributions like: + * + *   rw'_i = {   3,   4,   1,   0 } + *   s'_i  = { 3/8, 4/8, 1/8,   0 } + * + * We can then compute the difference in effective weight by using: + * + *   dw_i = S * (s'_i - s_i)						(3) + * + * Where 'S' is the group weight as seen by its parent. + * + * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) + * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - + * 4/7) times the weight of the group. + */ +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) +{ +	struct sched_entity *se = tg->se[cpu]; + +	if (!tg->parent)	/* the trivial, non-cgroup case */ +		return wl; + +	for_each_sched_entity(se) { +		long w, W; + +		tg = se->my_q->tg; + +		/* +		 * W = @wg + \Sum rw_j +		 */ +		W = wg + calc_tg_weight(tg, se->my_q); + +		/* +		 * w = rw_i + @wl +		 */ +		w = se->my_q->load.weight + wl; + +		/* +		 * wl = S * s'_i; see (2) +		 */ +		if (W > 0 && w < W) +			wl = (w * tg->shares) / W; +		else +			wl = tg->shares; + +		/* +		 * Per the above, wl is the new se->load.weight value; since +		 * those are clipped to [MIN_SHARES, ...) do so now. See +		 * calc_cfs_shares(). +		 */ +		if (wl < MIN_SHARES) +			wl = MIN_SHARES; + +		/* +		 * wl = dw_i = S * (s'_i - s_i); see (3) +		 */ +		wl -= se->load.weight; + +		/* +		 * Recursively apply this logic to all parent groups to compute +		 * the final effective load change on the root group. Since +		 * only the @tg group gets extra weight, all parent groups can +		 * only redistribute existing shares. @wl is the shift in shares +		 * resulting from this level per the above. +		 */ +		wg = 0; +	} + +	return wl; +} +#else + +static inline unsigned long effective_load(struct task_group *tg, int cpu, +		unsigned long wl, unsigned long wg) +{ +	return wl; +} + +#endif + +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +{ +	s64 this_load, load; +	int idx, this_cpu, prev_cpu; +	unsigned long tl_per_task; +	struct task_group *tg; +	unsigned long weight; +	int balanced; + +	idx	  = sd->wake_idx; +	this_cpu  = smp_processor_id(); +	prev_cpu  = task_cpu(p); +	load	  = source_load(prev_cpu, idx); +	this_load = target_load(this_cpu, idx); + +	/* +	 * If sync wakeup then subtract the (maximum possible) +	 * effect of the currently running task from the load +	 * of the current CPU: +	 */ +	if (sync) { +		tg = task_group(current); +		weight = current->se.load.weight; + +		this_load += effective_load(tg, this_cpu, -weight, -weight); +		load += effective_load(tg, prev_cpu, 0, -weight); +	} + +	tg = task_group(p); +	weight = p->se.load.weight; + +	/* +	 * In low-load situations, where prev_cpu is idle and this_cpu is idle +	 * due to the sync cause above having dropped this_load to 0, we'll +	 * always have an imbalance, but there's really nothing you can do +	 * about that, so that's good too. +	 * +	 * Otherwise check if either cpus are near enough in load to allow this +	 * task to be woken on this_cpu. 
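+	 *
+	 * Illustration of the check below, with a hypothetical imbalance_pct
+	 * of 125: the "balanced" condition becomes roughly
+	 *
+	 *   100 * power(prev_cpu) * (this_load + effective_load(tg, this_cpu))
+	 *     <= 112 * power(this_cpu) * (load + effective_load(tg, prev_cpu))
+	 *
+	 * i.e. the prev_cpu side is weighted up by about half the domain's
+	 * imbalance percentage, biasing the comparison toward allowing the
+	 * affine wakeup onto this_cpu.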
+	 */ +	if (this_load > 0) { +		s64 this_eff_load, prev_eff_load; + +		this_eff_load = 100; +		this_eff_load *= power_of(prev_cpu); +		this_eff_load *= this_load + +			effective_load(tg, this_cpu, weight, weight); + +		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; +		prev_eff_load *= power_of(this_cpu); +		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + +		balanced = this_eff_load <= prev_eff_load; +	} else +		balanced = true; + +	/* +	 * If the currently running task will sleep within +	 * a reasonable amount of time then attract this newly +	 * woken task: +	 */ +	if (sync && balanced) +		return 1; + +	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); +	tl_per_task = cpu_avg_load_per_task(this_cpu); + +	if (balanced || +	    (this_load <= load && +	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) { +		/* +		 * This domain has SD_WAKE_AFFINE and +		 * p is cache cold in this domain, and +		 * there is no bad imbalance. +		 */ +		schedstat_inc(sd, ttwu_move_affine); +		schedstat_inc(p, se.statistics.nr_wakeups_affine); + +		return 1; +	} +	return 0; +} + +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, +		  int this_cpu, int load_idx) +{ +	struct sched_group *idlest = NULL, *group = sd->groups; +	unsigned long min_load = ULONG_MAX, this_load = 0; +	int imbalance = 100 + (sd->imbalance_pct-100)/2; + +	do { +		unsigned long load, avg_load; +		int local_group; +		int i; + +		/* Skip over this group if it has no CPUs allowed */ +		if (!cpumask_intersects(sched_group_cpus(group), +					tsk_cpus_allowed(p))) +			continue; + +		local_group = cpumask_test_cpu(this_cpu, +					       sched_group_cpus(group)); + +		/* Tally up the load of all CPUs in the group */ +		avg_load = 0; + +		for_each_cpu(i, sched_group_cpus(group)) { +			/* Bias balancing toward cpus of our domain */ +			if (local_group) +				load = source_load(i, load_idx); +			else +				load = target_load(i, load_idx); + +			avg_load += load; +		} + +		/* Adjust by relative CPU power of the group */ +		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + +		if (local_group) { +			this_load = avg_load; +		} else if (avg_load < min_load) { +			min_load = avg_load; +			idlest = group; +		} +	} while (group = group->next, group != sd->groups); + +	if (!idlest || 100*this_load < imbalance*min_load) +		return NULL; +	return idlest; +} + +/* + * find_idlest_cpu - find the idlest cpu among the cpus in group. + */ +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +{ +	unsigned long load, min_load = ULONG_MAX; +	int idlest = -1; +	int i; + +	/* Traverse only the allowed CPUs */ +	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { +		load = weighted_cpuload(i); + +		if (load < min_load || (load == min_load && i == this_cpu)) { +			min_load = load; +			idlest = i; +		} +	} + +	return idlest; +} + +/* + * Try and locate an idle CPU in the sched_domain. + */ +static int select_idle_sibling(struct task_struct *p, int target) +{ +	int cpu = smp_processor_id(); +	int prev_cpu = task_cpu(p); +	struct sched_domain *sd; +	struct sched_group *sg; +	int i; + +	/* +	 * If the task is going to be woken-up on this cpu and if it is +	 * already idle, then it is the right target. 
+	 */
+	if (target == cpu && idle_cpu(cpu))
+		return cpu;
+
+	/*
+	 * If the task is going to be woken-up on the cpu where it previously
+	 * ran and if it is currently idle, then it is the right target.
+	 */
+	if (target == prev_cpu && idle_cpu(prev_cpu))
+		return prev_cpu;
+
+	/*
+	 * Otherwise, iterate the domains and find an eligible idle cpu.
+	 */
+	rcu_read_lock();
+
+	sd = rcu_dereference(per_cpu(sd_llc, target));
+	for_each_lower_domain(sd) {
+		sg = sd->groups;
+		do {
+			if (!cpumask_intersects(sched_group_cpus(sg),
+						tsk_cpus_allowed(p)))
+				goto next;
+
+			for_each_cpu(i, sched_group_cpus(sg)) {
+				if (!idle_cpu(i))
+					goto next;
+			}
+
+			target = cpumask_first_and(sched_group_cpus(sg),
+					tsk_cpus_allowed(p));
+			goto done;
+next:
+			sg = sg->next;
+		} while (sg != sd->groups);
+	}
+done:
+	rcu_read_unlock();
+
+	return target;
+}
+
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, i.e. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int
+select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+{
+	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int new_cpu = cpu;
+	int want_affine = 0;
+	int want_sd = 1;
+	int sync = wake_flags & WF_SYNC;
+
+	if (p->rt.nr_cpus_allowed == 1)
+		return prev_cpu;
+
+	if (sd_flag & SD_BALANCE_WAKE) {
+		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+			want_affine = 1;
+		new_cpu = prev_cpu;
+	}
+
+	rcu_read_lock();
+	for_each_domain(cpu, tmp) {
+		if (!(tmp->flags & SD_LOAD_BALANCE))
+			continue;
+
+		/*
+		 * If power savings logic is enabled for a domain, see if we
+		 * are not overloaded; if so, don't balance wider.
+		 */
+		if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+			unsigned long power = 0;
+			unsigned long nr_running = 0;
+			unsigned long capacity;
+			int i;
+
+			for_each_cpu(i, sched_domain_span(tmp)) {
+				power += power_of(i);
+				nr_running += cpu_rq(i)->cfs.nr_running;
+			}
+
+			capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
+
+			if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+				nr_running /= 2;
+
+			if (nr_running < capacity)
+				want_sd = 0;
+		}
+
+		/*
+		 * If both cpu and prev_cpu are part of this domain,
+		 * cpu is a valid SD_WAKE_AFFINE target.
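+		 * (wake_affine() below then arbitrates between cpu and
+		 * prev_cpu, and select_idle_sibling() looks for an idle CPU
+		 * near the winner.)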
+		 */ +		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && +		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { +			affine_sd = tmp; +			want_affine = 0; +		} + +		if (!want_sd && !want_affine) +			break; + +		if (!(tmp->flags & sd_flag)) +			continue; + +		if (want_sd) +			sd = tmp; +	} + +	if (affine_sd) { +		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) +			prev_cpu = cpu; + +		new_cpu = select_idle_sibling(p, prev_cpu); +		goto unlock; +	} + +	while (sd) { +		int load_idx = sd->forkexec_idx; +		struct sched_group *group; +		int weight; + +		if (!(sd->flags & sd_flag)) { +			sd = sd->child; +			continue; +		} + +		if (sd_flag & SD_BALANCE_WAKE) +			load_idx = sd->wake_idx; + +		group = find_idlest_group(sd, p, cpu, load_idx); +		if (!group) { +			sd = sd->child; +			continue; +		} + +		new_cpu = find_idlest_cpu(group, p, cpu); +		if (new_cpu == -1 || new_cpu == cpu) { +			/* Now try balancing at a lower domain level of cpu */ +			sd = sd->child; +			continue; +		} + +		/* Now try balancing at a lower domain level of new_cpu */ +		cpu = new_cpu; +		weight = sd->span_weight; +		sd = NULL; +		for_each_domain(cpu, tmp) { +			if (weight <= tmp->span_weight) +				break; +			if (tmp->flags & sd_flag) +				sd = tmp; +		} +		/* while loop will break here if sd == NULL */ +	} +unlock: +	rcu_read_unlock(); + +	return new_cpu; +} +#endif /* CONFIG_SMP */ + +static unsigned long +wakeup_gran(struct sched_entity *curr, struct sched_entity *se) +{ +	unsigned long gran = sysctl_sched_wakeup_granularity; + +	/* +	 * Since its curr running now, convert the gran from real-time +	 * to virtual-time in his units. +	 * +	 * By using 'se' instead of 'curr' we penalize light tasks, so +	 * they get preempted easier. That is, if 'se' < 'curr' then +	 * the resulting gran will be larger, therefore penalizing the +	 * lighter, if otoh 'se' > 'curr' then the resulting gran will +	 * be smaller, again penalizing the lighter task. +	 * +	 * This is especially important for buddies when the leftmost +	 * task is higher priority than the buddy. +	 */ +	return calc_delta_fair(gran, se); +} + +/* + * Should 'se' preempt 'curr'. 
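+ *
+ * In the picture below 'c' is curr's vruntime, g is one wakeup granularity
+ * converted to virtual time (see wakeup_gran() above) and s1-s3 are
+ * candidate positions for se's vruntime: we return -1 when se's vruntime is
+ * not smaller than curr's (s1), 0 when it is smaller by no more than g (s2),
+ * and 1, i.e. preempt, only when it is smaller by more than g (s3).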
+ *
+ *             |s1
+ *        |s2
+ *   |s3
+ *         g
+ *      |<--->|c
+ *
+ *  w(c, s1) = -1
+ *  w(c, s2) =  0
+ *  w(c, s3) =  1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+	s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+	if (vdiff <= 0)
+		return -1;
+
+	gran = wakeup_gran(curr, se);
+	if (vdiff > gran)
+		return 1;
+
+	return 0;
+}
+
+static void set_last_buddy(struct sched_entity *se)
+{
+	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+		return;
+
+	for_each_sched_entity(se)
+		cfs_rq_of(se)->last = se;
+}
+
+static void set_next_buddy(struct sched_entity *se)
+{
+	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+		return;
+
+	for_each_sched_entity(se)
+		cfs_rq_of(se)->next = se;
+}
+
+static void set_skip_buddy(struct sched_entity *se)
+{
+	for_each_sched_entity(se)
+		cfs_rq_of(se)->skip = se;
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+	struct task_struct *curr = rq->curr;
+	struct sched_entity *se = &curr->se, *pse = &p->se;
+	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	int scale = cfs_rq->nr_running >= sched_nr_latency;
+	int next_buddy_marked = 0;
+
+	if (unlikely(se == pse))
+		return;
+
+	/*
+	 * This is possible from callers such as pull_task(), in which we
+	 * unconditionally check_preempt_curr() after an enqueue (which may have
+	 * led to a throttle).  This both saves work and prevents false
+	 * next-buddy nomination below.
+	 */
+	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+		return;
+
+	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+		set_next_buddy(pse);
+		next_buddy_marked = 1;
+	}
+
+	/*
+	 * We can come here with TIF_NEED_RESCHED already set from new task
+	 * wake up path.
+	 *
+	 * Note: this also catches the edge-case of curr being in a throttled
+	 * group (e.g. via set_curr_task), since update_curr() (in the
+	 * enqueue of curr) will have resulted in resched being set.  This
+	 * prevents us from potentially nominating it as a false LAST_BUDDY
+	 * below.
+	 */
+	if (test_tsk_need_resched(curr))
+		return;
+
+	/* Idle tasks are by definition preempted by non-idle tasks. */
+	if (unlikely(curr->policy == SCHED_IDLE) &&
+	    likely(p->policy != SCHED_IDLE))
+		goto preempt;
+
+	/*
+	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
+	 * is driven by the tick):
+	 */
+	if (unlikely(p->policy != SCHED_NORMAL))
+		return;
+
+	find_matching_se(&se, &pse);
+	update_curr(cfs_rq_of(se));
+	BUG_ON(!pse);
+	if (wakeup_preempt_entity(se, pse) == 1) {
+		/*
+		 * Bias pick_next to pick the sched entity that is
+		 * triggering this preemption.
+		 */
+		if (!next_buddy_marked)
+			set_next_buddy(pse);
+		goto preempt;
+	}
+
+	return;
+
+preempt:
+	resched_task(curr);
+	/*
+	 * Only set the backward buddy when the current task is still
+	 * on the rq. This can happen when a wakeup gets interleaved
+	 * with schedule on the ->pre_schedule() or idle_balance()
+	 * point, either of which can drop the rq lock.
+	 *
+	 * Also, during early boot the idle thread is in the fair class,
+	 * for obvious reasons it's a bad idea to schedule back to it.
+	 */ +	if (unlikely(!se->on_rq || curr == rq->idle)) +		return; + +	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) +		set_last_buddy(se); +} + +static struct task_struct *pick_next_task_fair(struct rq *rq) +{ +	struct task_struct *p; +	struct cfs_rq *cfs_rq = &rq->cfs; +	struct sched_entity *se; + +	if (!cfs_rq->nr_running) +		return NULL; + +	do { +		se = pick_next_entity(cfs_rq); +		set_next_entity(cfs_rq, se); +		cfs_rq = group_cfs_rq(se); +	} while (cfs_rq); + +	p = task_of(se); +	if (hrtick_enabled(rq)) +		hrtick_start_fair(rq, p); + +	return p; +} + +/* + * Account for a descheduled task: + */ +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +{ +	struct sched_entity *se = &prev->se; +	struct cfs_rq *cfs_rq; + +	for_each_sched_entity(se) { +		cfs_rq = cfs_rq_of(se); +		put_prev_entity(cfs_rq, se); +	} +} + +/* + * sched_yield() is very simple + * + * The magic of dealing with the ->skip buddy is in pick_next_entity. + */ +static void yield_task_fair(struct rq *rq) +{ +	struct task_struct *curr = rq->curr; +	struct cfs_rq *cfs_rq = task_cfs_rq(curr); +	struct sched_entity *se = &curr->se; + +	/* +	 * Are we the only task in the tree? +	 */ +	if (unlikely(rq->nr_running == 1)) +		return; + +	clear_buddies(cfs_rq, se); + +	if (curr->policy != SCHED_BATCH) { +		update_rq_clock(rq); +		/* +		 * Update run-time statistics of the 'current'. +		 */ +		update_curr(cfs_rq); +		/* +		 * Tell update_rq_clock() that we've just updated, +		 * so we don't do microscopic update in schedule() +		 * and double the fastpath cost. +		 */ +		 rq->skip_clock_update = 1; +	} + +	set_skip_buddy(se); +} + +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +{ +	struct sched_entity *se = &p->se; + +	/* throttled hierarchies are not runnable */ +	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) +		return false; + +	/* Tell the scheduler that we'd really like pse to run next. */ +	set_next_buddy(se); + +	yield_task_fair(rq); + +	return true; +} + +#ifdef CONFIG_SMP +/************************************************** + * Fair scheduling class load-balancing methods: + */ + +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static void pull_task(struct rq *src_rq, struct task_struct *p, +		      struct rq *this_rq, int this_cpu) +{ +	deactivate_task(src_rq, p, 0); +	set_task_cpu(p, this_cpu); +	activate_task(this_rq, p, 0); +	check_preempt_curr(this_rq, p, 0); +} + +/* + * Is this task likely cache-hot: + */ +static int +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +{ +	s64 delta; + +	if (p->sched_class != &fair_sched_class) +		return 0; + +	if (unlikely(p->policy == SCHED_IDLE)) +		return 0; + +	/* +	 * Buddy candidates are cache hot: +	 */ +	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && +			(&p->se == cfs_rq_of(&p->se)->next || +			 &p->se == cfs_rq_of(&p->se)->last)) +		return 1; + +	if (sysctl_sched_migration_cost == -1) +		return 1; +	if (sysctl_sched_migration_cost == 0) +		return 0; + +	delta = now - p->se.exec_start; + +	return delta < (s64)sysctl_sched_migration_cost; +} + +#define LBF_ALL_PINNED	0x01 +#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */ +#define LBF_HAD_BREAK	0x04 +#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT	0x10 + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
+ */ +static +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, +		     struct sched_domain *sd, enum cpu_idle_type idle, +		     int *lb_flags) +{ +	int tsk_cache_hot = 0; +	/* +	 * We do not migrate tasks that are: +	 * 1) running (obviously), or +	 * 2) cannot be migrated to this CPU due to cpus_allowed, or +	 * 3) are cache-hot on their current CPU. +	 */ +	if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { +		schedstat_inc(p, se.statistics.nr_failed_migrations_affine); +		return 0; +	} +	*lb_flags &= ~LBF_ALL_PINNED; + +	if (task_running(rq, p)) { +		schedstat_inc(p, se.statistics.nr_failed_migrations_running); +		return 0; +	} + +	/* +	 * Aggressive migration if: +	 * 1) task is cache cold, or +	 * 2) too many balance attempts have failed. +	 */ + +	tsk_cache_hot = task_hot(p, rq->clock_task, sd); +	if (!tsk_cache_hot || +		sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS +		if (tsk_cache_hot) { +			schedstat_inc(sd, lb_hot_gained[idle]); +			schedstat_inc(p, se.statistics.nr_forced_migrations); +		} +#endif +		return 1; +	} + +	if (tsk_cache_hot) { +		schedstat_inc(p, se.statistics.nr_failed_migrations_hot); +		return 0; +	} +	return 1; +} + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int +move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, +	      struct sched_domain *sd, enum cpu_idle_type idle) +{ +	struct task_struct *p, *n; +	struct cfs_rq *cfs_rq; +	int pinned = 0; + +	for_each_leaf_cfs_rq(busiest, cfs_rq) { +		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { +			if (throttled_lb_pair(task_group(p), +					      busiest->cpu, this_cpu)) +				break; + +			if (!can_migrate_task(p, busiest, this_cpu, +						sd, idle, &pinned)) +				continue; + +			pull_task(busiest, p, this_rq, this_cpu); +			/* +			 * Right now, this is only the second place pull_task() +			 * is called, so we can safely collect pull_task() +			 * stats here rather than inside pull_task(). +			 */ +			schedstat_inc(sd, lb_gained[idle]); +			return 1; +		} +	} + +	return 0; +} + +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, +	      unsigned long max_load_move, struct sched_domain *sd, +	      enum cpu_idle_type idle, int *lb_flags, +	      struct cfs_rq *busiest_cfs_rq) +{ +	int loops = 0, pulled = 0; +	long rem_load_move = max_load_move; +	struct task_struct *p, *n; + +	if (max_load_move == 0) +		goto out; + +	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { +		if (loops++ > sysctl_sched_nr_migrate) { +			*lb_flags |= LBF_NEED_BREAK; +			break; +		} + +		if ((p->se.load.weight >> 1) > rem_load_move || +		    !can_migrate_task(p, busiest, this_cpu, sd, idle, +				      lb_flags)) +			continue; + +		pull_task(busiest, p, this_rq, this_cpu); +		pulled++; +		rem_load_move -= p->se.load.weight; + +#ifdef CONFIG_PREEMPT +		/* +		 * NEWIDLE balancing is a source of latency, so preemptible +		 * kernels will stop after the first task is pulled to minimize +		 * the critical section. +		 */ +		if (idle == CPU_NEWLY_IDLE) { +			*lb_flags |= LBF_ABORT; +			break; +		} +#endif + +		/* +		 * We only want to steal up to the prescribed amount of +		 * weighted load. 
+		 */ +		if (rem_load_move <= 0) +			break; +	} +out: +	/* +	 * Right now, this is one of only two places pull_task() is called, +	 * so we can safely collect pull_task() stats here rather than +	 * inside pull_task(). +	 */ +	schedstat_add(sd, lb_gained[idle], pulled); + +	return max_load_move - rem_load_move; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * update tg->load_weight by folding this cpu's load_avg + */ +static int update_shares_cpu(struct task_group *tg, int cpu) +{ +	struct cfs_rq *cfs_rq; +	unsigned long flags; +	struct rq *rq; + +	if (!tg->se[cpu]) +		return 0; + +	rq = cpu_rq(cpu); +	cfs_rq = tg->cfs_rq[cpu]; + +	raw_spin_lock_irqsave(&rq->lock, flags); + +	update_rq_clock(rq); +	update_cfs_load(cfs_rq, 1); + +	/* +	 * We need to update shares after updating tg->load_weight in +	 * order to adjust the weight of groups with long running tasks. +	 */ +	update_cfs_shares(cfs_rq); + +	raw_spin_unlock_irqrestore(&rq->lock, flags); + +	return 0; +} + +static void update_shares(int cpu) +{ +	struct cfs_rq *cfs_rq; +	struct rq *rq = cpu_rq(cpu); + +	rcu_read_lock(); +	/* +	 * Iterates the task_group tree in a bottom up fashion, see +	 * list_add_leaf_cfs_rq() for details. +	 */ +	for_each_leaf_cfs_rq(rq, cfs_rq) { +		/* throttled entities do not contribute to load */ +		if (throttled_hierarchy(cfs_rq)) +			continue; + +		update_shares_cpu(cfs_rq->tg, cpu); +	} +	rcu_read_unlock(); +} + +/* + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. + */ +static int tg_load_down(struct task_group *tg, void *data) +{ +	unsigned long load; +	long cpu = (long)data; + +	if (!tg->parent) { +		load = cpu_rq(cpu)->load.weight; +	} else { +		load = tg->parent->cfs_rq[cpu]->h_load; +		load *= tg->se[cpu]->load.weight; +		load /= tg->parent->cfs_rq[cpu]->load.weight + 1; +	} + +	tg->cfs_rq[cpu]->h_load = load; + +	return 0; +} + +static void update_h_load(long cpu) +{ +	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); +} + +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, +		  unsigned long max_load_move, +		  struct sched_domain *sd, enum cpu_idle_type idle, +		  int *lb_flags) +{ +	long rem_load_move = max_load_move; +	struct cfs_rq *busiest_cfs_rq; + +	rcu_read_lock(); +	update_h_load(cpu_of(busiest)); + +	for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { +		unsigned long busiest_h_load = busiest_cfs_rq->h_load; +		unsigned long busiest_weight = busiest_cfs_rq->load.weight; +		u64 rem_load, moved_load; + +		if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) +			break; + +		/* +		 * empty group or part of a throttled hierarchy +		 */ +		if (!busiest_cfs_rq->task_weight || +		    throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) +			continue; + +		rem_load = (u64)rem_load_move * busiest_weight; +		rem_load = div_u64(rem_load, busiest_h_load + 1); + +		moved_load = balance_tasks(this_rq, this_cpu, busiest, +				rem_load, sd, idle, lb_flags, +				busiest_cfs_rq); + +		if (!moved_load) +			continue; + +		moved_load *= busiest_h_load; +		moved_load = div_u64(moved_load, busiest_weight + 1); + +		rem_load_move -= moved_load; +		if (rem_load_move < 0) +			break; +	} +	rcu_read_unlock(); + +	return max_load_move - rem_load_move; +} +#else +static inline void update_shares(int cpu) +{ +} + +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, +		  unsigned long max_load_move, +		  
struct sched_domain *sd, enum cpu_idle_type idle, +		  int *lb_flags) +{ +	return balance_tasks(this_rq, this_cpu, busiest, +			max_load_move, sd, idle, lb_flags, +			&busiest->cfs); +} +#endif + +/* + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, +		      unsigned long max_load_move, +		      struct sched_domain *sd, enum cpu_idle_type idle, +		      int *lb_flags) +{ +	unsigned long total_load_moved = 0, load_moved; + +	do { +		load_moved = load_balance_fair(this_rq, this_cpu, busiest, +				max_load_move - total_load_moved, +				sd, idle, lb_flags); + +		total_load_moved += load_moved; + +		if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) +			break; + +#ifdef CONFIG_PREEMPT +		/* +		 * NEWIDLE balancing is a source of latency, so preemptible +		 * kernels will stop after the first task is pulled to minimize +		 * the critical section. +		 */ +		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { +			*lb_flags |= LBF_ABORT; +			break; +		} +#endif +	} while (load_moved && max_load_move > total_load_moved); + +	return total_load_moved > 0; +} + +/********** Helpers for find_busiest_group ************************/ +/* + * sd_lb_stats - Structure to store the statistics of a sched_domain + * 		during load balancing. + */ +struct sd_lb_stats { +	struct sched_group *busiest; /* Busiest group in this sd */ +	struct sched_group *this;  /* Local group in this sd */ +	unsigned long total_load;  /* Total load of all groups in sd */ +	unsigned long total_pwr;   /*	Total power of all groups in sd */ +	unsigned long avg_load;	   /* Average load across all groups in sd */ + +	/** Statistics of this group */ +	unsigned long this_load; +	unsigned long this_load_per_task; +	unsigned long this_nr_running; +	unsigned long this_has_capacity; +	unsigned int  this_idle_cpus; + +	/* Statistics of the busiest group */ +	unsigned int  busiest_idle_cpus; +	unsigned long max_load; +	unsigned long busiest_load_per_task; +	unsigned long busiest_nr_running; +	unsigned long busiest_group_capacity; +	unsigned long busiest_has_capacity; +	unsigned int  busiest_group_weight; + +	int group_imb; /* Is there imbalance in this sd */ +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +	int power_savings_balance; /* Is powersave balance needed for this sd */ +	struct sched_group *group_min; /* Least loaded group in sd */ +	struct sched_group *group_leader; /* Group which relieves group_min */ +	unsigned long min_load_per_task; /* load_per_task in group_min */ +	unsigned long leader_nr_running; /* Nr running of group_leader */ +	unsigned long min_nr_running; /* Nr running of group_min */ +#endif +}; + +/* + * sg_lb_stats - stats of a sched_group required for load_balancing + */ +struct sg_lb_stats { +	unsigned long avg_load; /*Avg load across the CPUs of the group */ +	unsigned long group_load; /* Total load over the CPUs of the group */ +	unsigned long sum_nr_running; /* Nr tasks running in the group */ +	unsigned long sum_weighted_load; /* Weighted load of group's tasks */ +	unsigned long group_capacity; +	unsigned long idle_cpus; +	unsigned long group_weight; +	int group_imb; /* Is there an imbalance in the group ? */ +	int group_has_capacity; /* Is there extra capacity in the group? 
*/ +}; + +/** + * get_sd_load_idx - Obtain the load index for a given sched domain. + * @sd: The sched_domain whose load_idx is to be obtained. + * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + */ +static inline int get_sd_load_idx(struct sched_domain *sd, +					enum cpu_idle_type idle) +{ +	int load_idx; + +	switch (idle) { +	case CPU_NOT_IDLE: +		load_idx = sd->busy_idx; +		break; + +	case CPU_NEWLY_IDLE: +		load_idx = sd->newidle_idx; +		break; +	default: +		load_idx = sd->idle_idx; +		break; +	} + +	return load_idx; +} + + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * init_sd_power_savings_stats - Initialize power savings statistics for + * the given sched_domain, during load balancing. + * + * @sd: Sched domain whose power-savings statistics are to be initialized. + * @sds: Variable containing the statistics for sd. + * @idle: Idle status of the CPU at which we're performing load-balancing. + */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, +	struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ +	/* +	 * Busy processors will not participate in power savings +	 * balance. +	 */ +	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) +		sds->power_savings_balance = 0; +	else { +		sds->power_savings_balance = 1; +		sds->min_nr_running = ULONG_MAX; +		sds->leader_nr_running = 0; +	} +} + +/** + * update_sd_power_savings_stats - Update the power saving stats for a + * sched_domain while performing load balancing. + * + * @group: sched_group belonging to the sched_domain under consideration. + * @sds: Variable containing the statistics of the sched_domain + * @local_group: Does group contain the CPU for which we're performing + * 		load balancing ? + * @sgs: Variable containing the statistics of the group. + */ +static inline void update_sd_power_savings_stats(struct sched_group *group, +	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + +	if (!sds->power_savings_balance) +		return; + +	/* +	 * If the local group is idle or completely loaded +	 * no need to do power savings balance at this domain +	 */ +	if (local_group && (sds->this_nr_running >= sgs->group_capacity || +				!sds->this_nr_running)) +		sds->power_savings_balance = 0; + +	/* +	 * If a group is already running at full capacity or idle, +	 * don't include that group in power savings calculations +	 */ +	if (!sds->power_savings_balance || +		sgs->sum_nr_running >= sgs->group_capacity || +		!sgs->sum_nr_running) +		return; + +	/* +	 * Calculate the group which has the least non-idle load. 
+	 * This is the group from where we need to pick up the load +	 * for saving power +	 */ +	if ((sgs->sum_nr_running < sds->min_nr_running) || +	    (sgs->sum_nr_running == sds->min_nr_running && +	     group_first_cpu(group) > group_first_cpu(sds->group_min))) { +		sds->group_min = group; +		sds->min_nr_running = sgs->sum_nr_running; +		sds->min_load_per_task = sgs->sum_weighted_load / +						sgs->sum_nr_running; +	} + +	/* +	 * Calculate the group which is almost near its +	 * capacity but still has some space to pick up some load +	 * from other group and save more power +	 */ +	if (sgs->sum_nr_running + 1 > sgs->group_capacity) +		return; + +	if (sgs->sum_nr_running > sds->leader_nr_running || +	    (sgs->sum_nr_running == sds->leader_nr_running && +	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) { +		sds->group_leader = group; +		sds->leader_nr_running = sgs->sum_nr_running; +	} +} + +/** + * check_power_save_busiest_group - see if there is potential for some power-savings balance + * @sds: Variable containing the statistics of the sched_domain + *	under consideration. + * @this_cpu: Cpu at which we're currently performing load-balancing. + * @imbalance: Variable to store the imbalance. + * + * Description: + * Check if we have potential to perform some power-savings balance. + * If yes, set the busiest group to be the least loaded group in the + * sched_domain, so that it's CPUs can be put to idle. + * + * Returns 1 if there is potential to perform power-savings balance. + * Else returns 0. + */ +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, +					int this_cpu, unsigned long *imbalance) +{ +	if (!sds->power_savings_balance) +		return 0; + +	if (sds->this != sds->group_leader || +			sds->group_leader == sds->group_min) +		return 0; + +	*imbalance = sds->min_load_per_task; +	sds->busiest = sds->group_min; + +	return 1; + +} +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, +	struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ +	return; +} + +static inline void update_sd_power_savings_stats(struct sched_group *group, +	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ +	return; +} + +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, +					int this_cpu, unsigned long *imbalance) +{ +	return 0; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ +	return SCHED_POWER_SCALE; +} + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ +	return default_scale_freq_power(sd, cpu); +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +{ +	unsigned long weight = sd->span_weight; +	unsigned long smt_gain = sd->smt_gain; + +	smt_gain /= weight; + +	return smt_gain; +} + +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ +	return default_scale_smt_power(sd, cpu); +} + +unsigned long scale_rt_power(int cpu) +{ +	struct rq *rq = cpu_rq(cpu); +	u64 total, available; + +	total = sched_avg_period() + (rq->clock - rq->age_stamp); + +	if (unlikely(total < rq->rt_avg)) { +		/* Ensures that power won't end up being negative */ +		available = 0; +	} else { +		available = total - rq->rt_avg; +	} + +	if (unlikely((s64)total < SCHED_POWER_SCALE)) +		total = SCHED_POWER_SCALE; + +	total >>= SCHED_POWER_SHIFT; + +	return div_u64(available, total); +} + +static void update_cpu_power(struct 
sched_domain *sd, int cpu) +{ +	unsigned long weight = sd->span_weight; +	unsigned long power = SCHED_POWER_SCALE; +	struct sched_group *sdg = sd->groups; + +	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { +		if (sched_feat(ARCH_POWER)) +			power *= arch_scale_smt_power(sd, cpu); +		else +			power *= default_scale_smt_power(sd, cpu); + +		power >>= SCHED_POWER_SHIFT; +	} + +	sdg->sgp->power_orig = power; + +	if (sched_feat(ARCH_POWER)) +		power *= arch_scale_freq_power(sd, cpu); +	else +		power *= default_scale_freq_power(sd, cpu); + +	power >>= SCHED_POWER_SHIFT; + +	power *= scale_rt_power(cpu); +	power >>= SCHED_POWER_SHIFT; + +	if (!power) +		power = 1; + +	cpu_rq(cpu)->cpu_power = power; +	sdg->sgp->power = power; +} + +void update_group_power(struct sched_domain *sd, int cpu) +{ +	struct sched_domain *child = sd->child; +	struct sched_group *group, *sdg = sd->groups; +	unsigned long power; + +	if (!child) { +		update_cpu_power(sd, cpu); +		return; +	} + +	power = 0; + +	group = child->groups; +	do { +		power += group->sgp->power; +		group = group->next; +	} while (group != child->groups); + +	sdg->sgp->power = power; +} + +/* + * Try and fix up capacity for tiny siblings, this is needed when + * things like SD_ASYM_PACKING need f_b_g to select another sibling + * which on its own isn't powerful enough. + * + * See update_sd_pick_busiest() and check_asym_packing(). + */ +static inline int +fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +{ +	/* +	 * Only siblings can have significantly less than SCHED_POWER_SCALE +	 */ +	if (!(sd->flags & SD_SHARE_CPUPOWER)) +		return 0; + +	/* +	 * If ~90% of the cpu_power is still there, we're good. +	 */ +	if (group->sgp->power * 32 > group->sgp->power_orig * 29) +		return 1; + +	return 0; +} + +/** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @sd: The sched_domain whose statistics are to be updated. + * @group: sched_group whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @load_idx: Load index of sched_domain of this_cpu for load calc. + * @local_group: Does group contain this_cpu. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sgs: variable to hold the statistics for this group. 
+ */ +static inline void update_sg_lb_stats(struct sched_domain *sd, +			struct sched_group *group, int this_cpu, +			enum cpu_idle_type idle, int load_idx, +			int local_group, const struct cpumask *cpus, +			int *balance, struct sg_lb_stats *sgs) +{ +	unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; +	int i; +	unsigned int balance_cpu = -1, first_idle_cpu = 0; +	unsigned long avg_load_per_task = 0; + +	if (local_group) +		balance_cpu = group_first_cpu(group); + +	/* Tally up the load of all CPUs in the group */ +	max_cpu_load = 0; +	min_cpu_load = ~0UL; +	max_nr_running = 0; + +	for_each_cpu_and(i, sched_group_cpus(group), cpus) { +		struct rq *rq = cpu_rq(i); + +		/* Bias balancing toward cpus of our domain */ +		if (local_group) { +			if (idle_cpu(i) && !first_idle_cpu) { +				first_idle_cpu = 1; +				balance_cpu = i; +			} + +			load = target_load(i, load_idx); +		} else { +			load = source_load(i, load_idx); +			if (load > max_cpu_load) { +				max_cpu_load = load; +				max_nr_running = rq->nr_running; +			} +			if (min_cpu_load > load) +				min_cpu_load = load; +		} + +		sgs->group_load += load; +		sgs->sum_nr_running += rq->nr_running; +		sgs->sum_weighted_load += weighted_cpuload(i); +		if (idle_cpu(i)) +			sgs->idle_cpus++; +	} + +	/* +	 * First idle cpu or the first cpu(busiest) in this sched group +	 * is eligible for doing load balancing at this and above +	 * domains. In the newly idle case, we will allow all the cpu's +	 * to do the newly idle load balance. +	 */ +	if (idle != CPU_NEWLY_IDLE && local_group) { +		if (balance_cpu != this_cpu) { +			*balance = 0; +			return; +		} +		update_group_power(sd, this_cpu); +	} + +	/* Adjust by relative CPU power of the group */ +	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; + +	/* +	 * Consider the group unbalanced when the imbalance is larger +	 * than the average weight of a task. +	 * +	 * APZ: with cgroup the avg task weight can vary wildly and +	 *      might not be a suitable number - should we keep a +	 *      normalized nr_running number somewhere that negates +	 *      the hierarchy? +	 */ +	if (sgs->sum_nr_running) +		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + +	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) +		sgs->group_imb = 1; + +	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, +						SCHED_POWER_SCALE); +	if (!sgs->group_capacity) +		sgs->group_capacity = fix_small_capacity(sd, group); +	sgs->group_weight = group->group_weight; + +	if (sgs->group_capacity > sgs->sum_nr_running) +		sgs->group_has_capacity = 1; +} + +/** + * update_sd_pick_busiest - return 1 on busiest group + * @sd: sched_domain whose statistics are to be checked + * @sds: sched_domain statistics + * @sg: sched_group candidate to be checked for being the busiest + * @sgs: sched_group statistics + * @this_cpu: the current cpu + * + * Determine if @sg is a busier group than the previously selected + * busiest group. + */ +static bool update_sd_pick_busiest(struct sched_domain *sd, +				   struct sd_lb_stats *sds, +				   struct sched_group *sg, +				   struct sg_lb_stats *sgs, +				   int this_cpu) +{ +	if (sgs->avg_load <= sds->max_load) +		return false; + +	if (sgs->sum_nr_running > sgs->group_capacity) +		return true; + +	if (sgs->group_imb) +		return true; + +	/* +	 * ASYM_PACKING needs to move all the work to the lowest +	 * numbered CPUs in the group, therefore mark all groups +	 * higher than ourself as busy. 
+	 */
+	if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+	    this_cpu < group_first_cpu(sg)) {
+		if (!sds->busiest)
+			return true;
+
+		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+			enum cpu_idle_type idle, const struct cpumask *cpus,
+			int *balance, struct sd_lb_stats *sds)
+{
+	struct sched_domain *child = sd->child;
+	struct sched_group *sg = sd->groups;
+	struct sg_lb_stats sgs;
+	int load_idx, prefer_sibling = 0;
+
+	if (child && child->flags & SD_PREFER_SIBLING)
+		prefer_sibling = 1;
+
+	init_sd_power_savings_stats(sd, sds, idle);
+	load_idx = get_sd_load_idx(sd, idle);
+
+	do {
+		int local_group;
+
+		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
+		memset(&sgs, 0, sizeof(sgs));
+		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
+				local_group, cpus, balance, &sgs);
+
+		if (local_group && !(*balance))
+			return;
+
+		sds->total_load += sgs.group_load;
+		sds->total_pwr += sg->sgp->power;
+
+		/*
+		 * In case the child domain prefers tasks go to siblings
+		 * first, lower the sg capacity to one so that we'll try
+		 * and move all the excess tasks away. We lower the capacity
+		 * of a group only if the local group has the capacity to fit
+		 * these excess tasks, i.e. nr_running < group_capacity. The
+		 * extra check prevents the case where you always pull from the
+		 * heaviest group when it is already under-utilized (possible
+		 * when a large-weight task outweighs the tasks on the system).
+		 */
+		if (prefer_sibling && !local_group && sds->this_has_capacity)
+			sgs.group_capacity = min(sgs.group_capacity, 1UL);
+
+		if (local_group) {
+			sds->this_load = sgs.avg_load;
+			sds->this = sg;
+			sds->this_nr_running = sgs.sum_nr_running;
+			sds->this_load_per_task = sgs.sum_weighted_load;
+			sds->this_has_capacity = sgs.group_has_capacity;
+			sds->this_idle_cpus = sgs.idle_cpus;
+		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
+			sds->max_load = sgs.avg_load;
+			sds->busiest = sg;
+			sds->busiest_nr_running = sgs.sum_nr_running;
+			sds->busiest_idle_cpus = sgs.idle_cpus;
+			sds->busiest_group_capacity = sgs.group_capacity;
+			sds->busiest_load_per_task = sgs.sum_weighted_load;
+			sds->busiest_has_capacity = sgs.group_has_capacity;
+			sds->busiest_group_weight = sgs.group_weight;
+			sds->group_imb = sgs.group_imb;
+		}
+
+		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+		sg = sg->next;
+	} while (sg != sd->groups);
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ *			sched domain.
+ *
+ * This is primarily intended to be used at the sibling level.  Some
+ * cores like POWER7 prefer to use lower numbered SMT threads.  In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle.  When in lower SMT modes, the threads will
+ * perform better since they share less core resources.  Hence when we
+ * have idle threads, we want them to be the higher ones.
+ * + * This packing function is run on idle threads.  It checks to see if + * the busiest CPU in this domain (core in the P7 case) has a higher + * CPU number than the packing function is being run on.  Here we are + * assuming lower CPU number will be equivalent to lower a SMT thread + * number. + * + * Returns 1 when packing is required and a task should be moved to + * this CPU.  The amount of the imbalance is returned in *imbalance. + * + * @sd: The sched_domain whose packing is to be checked. + * @sds: Statistics of the sched_domain which is to be packed + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: returns amount of imbalanced due to packing. + */ +static int check_asym_packing(struct sched_domain *sd, +			      struct sd_lb_stats *sds, +			      int this_cpu, unsigned long *imbalance) +{ +	int busiest_cpu; + +	if (!(sd->flags & SD_ASYM_PACKING)) +		return 0; + +	if (!sds->busiest) +		return 0; + +	busiest_cpu = group_first_cpu(sds->busiest); +	if (this_cpu > busiest_cpu) +		return 0; + +	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, +				       SCHED_POWER_SCALE); +	return 1; +} + +/** + * fix_small_imbalance - Calculate the minor imbalance that exists + *			amongst the groups of a sched_domain, during + *			load balancing. + * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: Variable to store the imbalance. + */ +static inline void fix_small_imbalance(struct sd_lb_stats *sds, +				int this_cpu, unsigned long *imbalance) +{ +	unsigned long tmp, pwr_now = 0, pwr_move = 0; +	unsigned int imbn = 2; +	unsigned long scaled_busy_load_per_task; + +	if (sds->this_nr_running) { +		sds->this_load_per_task /= sds->this_nr_running; +		if (sds->busiest_load_per_task > +				sds->this_load_per_task) +			imbn = 1; +	} else +		sds->this_load_per_task = +			cpu_avg_load_per_task(this_cpu); + +	scaled_busy_load_per_task = sds->busiest_load_per_task +					 * SCHED_POWER_SCALE; +	scaled_busy_load_per_task /= sds->busiest->sgp->power; + +	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= +			(scaled_busy_load_per_task * imbn)) { +		*imbalance = sds->busiest_load_per_task; +		return; +	} + +	/* +	 * OK, we don't have enough imbalance to justify moving tasks, +	 * however we may be able to increase total CPU power used by +	 * moving them. 
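+	 * The code below compares the CPU power usefully busy now
+	 * (pwr_now) against what it would be after moving one task of
+	 * busiest_load_per_task weight (pwr_move); an imbalance is only
+	 * reported if pwr_move > pwr_now.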
+	 */ + +	pwr_now += sds->busiest->sgp->power * +			min(sds->busiest_load_per_task, sds->max_load); +	pwr_now += sds->this->sgp->power * +			min(sds->this_load_per_task, sds->this_load); +	pwr_now /= SCHED_POWER_SCALE; + +	/* Amount of load we'd subtract */ +	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / +		sds->busiest->sgp->power; +	if (sds->max_load > tmp) +		pwr_move += sds->busiest->sgp->power * +			min(sds->busiest_load_per_task, sds->max_load - tmp); + +	/* Amount of load we'd add */ +	if (sds->max_load * sds->busiest->sgp->power < +		sds->busiest_load_per_task * SCHED_POWER_SCALE) +		tmp = (sds->max_load * sds->busiest->sgp->power) / +			sds->this->sgp->power; +	else +		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / +			sds->this->sgp->power; +	pwr_move += sds->this->sgp->power * +			min(sds->this_load_per_task, sds->this_load + tmp); +	pwr_move /= SCHED_POWER_SCALE; + +	/* Move if we gain throughput */ +	if (pwr_move > pwr_now) +		*imbalance = sds->busiest_load_per_task; +} + +/** + * calculate_imbalance - Calculate the amount of imbalance present within the + *			 groups of a given sched_domain during load balance. + * @sds: statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: Cpu for which currently load balance is being performed. + * @imbalance: The variable to store the imbalance. + */ +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, +		unsigned long *imbalance) +{ +	unsigned long max_pull, load_above_capacity = ~0UL; + +	sds->busiest_load_per_task /= sds->busiest_nr_running; +	if (sds->group_imb) { +		sds->busiest_load_per_task = +			min(sds->busiest_load_per_task, sds->avg_load); +	} + +	/* +	 * In the presence of smp nice balancing, certain scenarios can have +	 * max load less than avg load(as we skip the groups at or below +	 * its cpu_power, while calculating max_load..) +	 */ +	if (sds->max_load < sds->avg_load) { +		*imbalance = 0; +		return fix_small_imbalance(sds, this_cpu, imbalance); +	} + +	if (!sds->group_imb) { +		/* +		 * Don't want to pull so many tasks that a group would go idle. +		 */ +		load_above_capacity = (sds->busiest_nr_running - +						sds->busiest_group_capacity); + +		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); + +		load_above_capacity /= sds->busiest->sgp->power; +	} + +	/* +	 * We're trying to get all the cpus to the average_load, so we don't +	 * want to push ourselves above the average load, nor do we wish to +	 * reduce the max loaded cpu below the average load. At the same time, +	 * we also don't want to reduce the group load below the group capacity +	 * (so that we can implement power-savings policies etc). Thus we look +	 * for the minimum possible imbalance. +	 * Be careful of negative numbers as they'll appear as very large values +	 * with unsigned longs. 
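+	 * For example (all in SCHED_POWER_SCALE units): with max_load = 2048,
+	 * avg_load = 1280 and load_above_capacity = 512, max_pull below
+	 * becomes min(768, 512) = 512.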
+	 */ +	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); + +	/* How much load to actually move to equalise the imbalance */ +	*imbalance = min(max_pull * sds->busiest->sgp->power, +		(sds->avg_load - sds->this_load) * sds->this->sgp->power) +			/ SCHED_POWER_SCALE; + +	/* +	 * if *imbalance is less than the average load per runnable task +	 * there is no guarantee that any tasks will be moved so we'll have +	 * a think about bumping its value to force at least one task to be +	 * moved +	 */ +	if (*imbalance < sds->busiest_load_per_task) +		return fix_small_imbalance(sds, this_cpu, imbalance); + +} + +/******* find_busiest_group() helpers end here *********************/ + +/** + * find_busiest_group - Returns the busiest group within the sched_domain + * if there is an imbalance. If there isn't an imbalance, and + * the user has opted for power-savings, it returns a group whose + * CPUs can be put to idle by rebalancing those tasks elsewhere, if + * such a group exists. + * + * Also calculates the amount of weighted load which should be moved + * to restore balance. + * + * @sd: The sched_domain whose busiest group is to be returned. + * @this_cpu: The cpu for which load balancing is currently being performed. + * @imbalance: Variable which stores amount of weighted load which should + *		be moved to restore balance/put a group to idle. + * @idle: The idle status of this_cpu. + * @cpus: The set of CPUs under consideration for load-balancing. + * @balance: Pointer to a variable indicating if this_cpu + *	is the appropriate cpu to perform load balancing at this_level. + * + * Returns:	- the busiest group if imbalance exists. + *		- If no imbalance and user has opted for power-savings balance, + *		   return the least loaded group whose CPUs can be + *		   put to idle by rebalancing its tasks onto our group. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, +		   unsigned long *imbalance, enum cpu_idle_type idle, +		   const struct cpumask *cpus, int *balance) +{ +	struct sd_lb_stats sds; + +	memset(&sds, 0, sizeof(sds)); + +	/* +	 * Compute the various statistics relavent for load balancing at +	 * this level. +	 */ +	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); + +	/* +	 * this_cpu is not the appropriate cpu to perform load balancing at +	 * this level. +	 */ +	if (!(*balance)) +		goto ret; + +	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && +	    check_asym_packing(sd, &sds, this_cpu, imbalance)) +		return sds.busiest; + +	/* There is no busy sibling group to pull tasks from */ +	if (!sds.busiest || sds.busiest_nr_running == 0) +		goto out_balanced; + +	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + +	/* +	 * If the busiest group is imbalanced the below checks don't +	 * work because they assumes all things are equal, which typically +	 * isn't true due to cpus_allowed constraints and the like. +	 */ +	if (sds.group_imb) +		goto force_balance; + +	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ +	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && +			!sds.busiest_has_capacity) +		goto force_balance; + +	/* +	 * If the local group is more busy than the selected busiest group +	 * don't try and pull any tasks. +	 */ +	if (sds.this_load >= sds.max_load) +		goto out_balanced; + +	/* +	 * Don't pull any tasks if this group is already above the domain +	 * average load. 
+	 */ +	if (sds.this_load >= sds.avg_load) +		goto out_balanced; + +	if (idle == CPU_IDLE) { +		/* +		 * This cpu is idle. If the busiest group load doesn't +		 * have more tasks than the number of available cpu's and +		 * there is no imbalance between this and busiest group +		 * wrt to idle cpu's, it is balanced. +		 */ +		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && +		    sds.busiest_nr_running <= sds.busiest_group_weight) +			goto out_balanced; +	} else { +		/* +		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use +		 * imbalance_pct to be conservative. +		 */ +		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) +			goto out_balanced; +	} + +force_balance: +	/* Looks like there is an imbalance. Compute it */ +	calculate_imbalance(&sds, this_cpu, imbalance); +	return sds.busiest; + +out_balanced: +	/* +	 * There is no obvious imbalance. But check if we can do some balancing +	 * to save power. +	 */ +	if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) +		return sds.busiest; +ret: +	*imbalance = 0; +	return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. + */ +static struct rq * +find_busiest_queue(struct sched_domain *sd, struct sched_group *group, +		   enum cpu_idle_type idle, unsigned long imbalance, +		   const struct cpumask *cpus) +{ +	struct rq *busiest = NULL, *rq; +	unsigned long max_load = 0; +	int i; + +	for_each_cpu(i, sched_group_cpus(group)) { +		unsigned long power = power_of(i); +		unsigned long capacity = DIV_ROUND_CLOSEST(power, +							   SCHED_POWER_SCALE); +		unsigned long wl; + +		if (!capacity) +			capacity = fix_small_capacity(sd, group); + +		if (!cpumask_test_cpu(i, cpus)) +			continue; + +		rq = cpu_rq(i); +		wl = weighted_cpuload(i); + +		/* +		 * When comparing with imbalance, use weighted_cpuload() +		 * which is not scaled with the cpu power. +		 */ +		if (capacity && rq->nr_running == 1 && wl > imbalance) +			continue; + +		/* +		 * For the load comparisons with the other cpu's, consider +		 * the weighted_cpuload() scaled with the cpu power, so that +		 * the load can be moved away from the cpu that is potentially +		 * running at a lower capacity. +		 */ +		wl = (wl * SCHED_POWER_SCALE) / power; + +		if (wl > max_load) { +			max_load = wl; +			busiest = rq; +		} +	} + +	return busiest; +} + +/* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL	512 + +/* Working cpumask for load_balance and load_balance_newidle. */ +DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); + +static int need_active_balance(struct sched_domain *sd, int idle, +			       int busiest_cpu, int this_cpu) +{ +	if (idle == CPU_NEWLY_IDLE) { + +		/* +		 * ASYM_PACKING needs to force migrate tasks from busy but +		 * higher numbered CPUs in order to pack all tasks in the +		 * lowest numbered CPUs. +		 */ +		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) +			return 1; + +		/* +		 * The only task running in a non-idle cpu can be moved to this +		 * cpu in an attempt to completely freeup the other CPU +		 * package. +		 * +		 * The package power saving logic comes from +		 * find_busiest_group(). If there are no imbalance, then +		 * f_b_g() will return NULL. However when sched_mc={1,2} then +		 * f_b_g() will select a group from which a running task may be +		 * pulled to this cpu in order to make the other package idle. 
+		 * If there is no opportunity to make a package idle and if +		 * there are no imbalance, then f_b_g() will return NULL and no +		 * action will be taken in load_balance_newidle(). +		 * +		 * Under normal task pull operation due to imbalance, there +		 * will be more than one task in the source run queue and +		 * move_tasks() will succeed.  ld_moved will be true and this +		 * active balance code will not be triggered. +		 */ +		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) +			return 0; +	} + +	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); +} + +static int active_load_balance_cpu_stop(void *data); + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + */ +static int load_balance(int this_cpu, struct rq *this_rq, +			struct sched_domain *sd, enum cpu_idle_type idle, +			int *balance) +{ +	int ld_moved, lb_flags = 0, active_balance = 0; +	struct sched_group *group; +	unsigned long imbalance; +	struct rq *busiest; +	unsigned long flags; +	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + +	cpumask_copy(cpus, cpu_active_mask); + +	schedstat_inc(sd, lb_count[idle]); + +redo: +	group = find_busiest_group(sd, this_cpu, &imbalance, idle, +				   cpus, balance); + +	if (*balance == 0) +		goto out_balanced; + +	if (!group) { +		schedstat_inc(sd, lb_nobusyg[idle]); +		goto out_balanced; +	} + +	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); +	if (!busiest) { +		schedstat_inc(sd, lb_nobusyq[idle]); +		goto out_balanced; +	} + +	BUG_ON(busiest == this_rq); + +	schedstat_add(sd, lb_imbalance[idle], imbalance); + +	ld_moved = 0; +	if (busiest->nr_running > 1) { +		/* +		 * Attempt to move tasks. If find_busiest_group has found +		 * an imbalance but busiest->nr_running <= 1, the group is +		 * still unbalanced. ld_moved simply stays zero, so it is +		 * correctly treated as an imbalance. +		 */ +		lb_flags |= LBF_ALL_PINNED; +		local_irq_save(flags); +		double_rq_lock(this_rq, busiest); +		ld_moved = move_tasks(this_rq, this_cpu, busiest, +				      imbalance, sd, idle, &lb_flags); +		double_rq_unlock(this_rq, busiest); +		local_irq_restore(flags); + +		/* +		 * some other cpu did the load balance for us. +		 */ +		if (ld_moved && this_cpu != smp_processor_id()) +			resched_cpu(this_cpu); + +		if (lb_flags & LBF_ABORT) +			goto out_balanced; + +		if (lb_flags & LBF_NEED_BREAK) { +			lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; +			if (lb_flags & LBF_ABORT) +				goto out_balanced; +			goto redo; +		} + +		/* All tasks on this runqueue were pinned by CPU affinity */ +		if (unlikely(lb_flags & LBF_ALL_PINNED)) { +			cpumask_clear_cpu(cpu_of(busiest), cpus); +			if (!cpumask_empty(cpus)) +				goto redo; +			goto out_balanced; +		} +	} + +	if (!ld_moved) { +		schedstat_inc(sd, lb_failed[idle]); +		/* +		 * Increment the failure counter only on periodic balance. +		 * We do not want newidle balance, which can be very +		 * frequent, pollute the failure counter causing +		 * excessive cache_hot migrations and active balances. 
+		 */ +		if (idle != CPU_NEWLY_IDLE) +			sd->nr_balance_failed++; + +		if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { +			raw_spin_lock_irqsave(&busiest->lock, flags); + +			/* don't kick the active_load_balance_cpu_stop, +			 * if the curr task on busiest cpu can't be +			 * moved to this_cpu +			 */ +			if (!cpumask_test_cpu(this_cpu, +					tsk_cpus_allowed(busiest->curr))) { +				raw_spin_unlock_irqrestore(&busiest->lock, +							    flags); +				lb_flags |= LBF_ALL_PINNED; +				goto out_one_pinned; +			} + +			/* +			 * ->active_balance synchronizes accesses to +			 * ->active_balance_work.  Once set, it's cleared +			 * only after active load balance is finished. +			 */ +			if (!busiest->active_balance) { +				busiest->active_balance = 1; +				busiest->push_cpu = this_cpu; +				active_balance = 1; +			} +			raw_spin_unlock_irqrestore(&busiest->lock, flags); + +			if (active_balance) +				stop_one_cpu_nowait(cpu_of(busiest), +					active_load_balance_cpu_stop, busiest, +					&busiest->active_balance_work); + +			/* +			 * We've kicked active balancing, reset the failure +			 * counter. +			 */ +			sd->nr_balance_failed = sd->cache_nice_tries+1; +		} +	} else +		sd->nr_balance_failed = 0; + +	if (likely(!active_balance)) { +		/* We were unbalanced, so reset the balancing interval */ +		sd->balance_interval = sd->min_interval; +	} else { +		/* +		 * If we've begun active balancing, start to back off. This +		 * case may not be covered by the all_pinned logic if there +		 * is only 1 task on the busy runqueue (because we don't call +		 * move_tasks). +		 */ +		if (sd->balance_interval < sd->max_interval) +			sd->balance_interval *= 2; +	} + +	goto out; + +out_balanced: +	schedstat_inc(sd, lb_balanced[idle]); + +	sd->nr_balance_failed = 0; + +out_one_pinned: +	/* tune up the balancing interval */ +	if (((lb_flags & LBF_ALL_PINNED) && +			sd->balance_interval < MAX_PINNED_INTERVAL) || +			(sd->balance_interval < sd->max_interval)) +		sd->balance_interval *= 2; + +	ld_moved = 0; +out: +	return ld_moved; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +void idle_balance(int this_cpu, struct rq *this_rq) +{ +	struct sched_domain *sd; +	int pulled_task = 0; +	unsigned long next_balance = jiffies + HZ; + +	this_rq->idle_stamp = this_rq->clock; + +	if (this_rq->avg_idle < sysctl_sched_migration_cost) +		return; + +	/* +	 * Drop the rq->lock, but keep IRQ/preempt disabled. +	 */ +	raw_spin_unlock(&this_rq->lock); + +	update_shares(this_cpu); +	rcu_read_lock(); +	for_each_domain(this_cpu, sd) { +		unsigned long interval; +		int balance = 1; + +		if (!(sd->flags & SD_LOAD_BALANCE)) +			continue; + +		if (sd->flags & SD_BALANCE_NEWIDLE) { +			/* If we've pulled tasks over stop searching: */ +			pulled_task = load_balance(this_cpu, this_rq, +						   sd, CPU_NEWLY_IDLE, &balance); +		} + +		interval = msecs_to_jiffies(sd->balance_interval); +		if (time_after(next_balance, sd->last_balance + interval)) +			next_balance = sd->last_balance + interval; +		if (pulled_task) { +			this_rq->idle_stamp = 0; +			break; +		} +	} +	rcu_read_unlock(); + +	raw_spin_lock(&this_rq->lock); + +	if (pulled_task || time_after(jiffies, this_rq->next_balance)) { +		/* +		 * We are going idle. next_balance may be set based on +		 * a busy processor. So reset next_balance. +		 */ +		this_rq->next_balance = next_balance; +	} +} + +/* + * active_load_balance_cpu_stop is run by cpu stopper. 
It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. + */ +static int active_load_balance_cpu_stop(void *data) +{ +	struct rq *busiest_rq = data; +	int busiest_cpu = cpu_of(busiest_rq); +	int target_cpu = busiest_rq->push_cpu; +	struct rq *target_rq = cpu_rq(target_cpu); +	struct sched_domain *sd; + +	raw_spin_lock_irq(&busiest_rq->lock); + +	/* make sure the requested cpu hasn't gone down in the meantime */ +	if (unlikely(busiest_cpu != smp_processor_id() || +		     !busiest_rq->active_balance)) +		goto out_unlock; + +	/* Is there any task to move? */ +	if (busiest_rq->nr_running <= 1) +		goto out_unlock; + +	/* +	 * This condition is "impossible", if it occurs +	 * we need to fix it. Originally reported by +	 * Bjorn Helgaas on a 128-cpu setup. +	 */ +	BUG_ON(busiest_rq == target_rq); + +	/* move a task from busiest_rq to target_rq */ +	double_lock_balance(busiest_rq, target_rq); + +	/* Search for an sd spanning us and the target CPU. */ +	rcu_read_lock(); +	for_each_domain(target_cpu, sd) { +		if ((sd->flags & SD_LOAD_BALANCE) && +		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) +				break; +	} + +	if (likely(sd)) { +		schedstat_inc(sd, alb_count); + +		if (move_one_task(target_rq, target_cpu, busiest_rq, +				  sd, CPU_IDLE)) +			schedstat_inc(sd, alb_pushed); +		else +			schedstat_inc(sd, alb_failed); +	} +	rcu_read_unlock(); +	double_unlock_balance(busiest_rq, target_rq); +out_unlock: +	busiest_rq->active_balance = 0; +	raw_spin_unlock_irq(&busiest_rq->lock); +	return 0; +} + +#ifdef CONFIG_NO_HZ +/* + * idle load balancing details + * - When one of the busy CPUs notice that there may be an idle rebalancing + *   needed, they will kick the idle load balancer, which then does idle + *   load balancing for all the idle CPUs. + */ +static struct { +	cpumask_var_t idle_cpus_mask; +	atomic_t nr_cpus; +	unsigned long next_balance;     /* in jiffy units */ +} nohz ____cacheline_aligned; + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu:	The cpu whose lowest level of sched domain is to + *		be returned. + * @flag:	The flag to check for the lowest sched_domain + *		for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ +	struct sched_domain *sd; + +	for_each_domain(cpu, sd) +		if (sd->flags & flag) +			break; + +	return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu:	The cpu whose domains we're iterating over. + * @sd:		variable holding the value of the power_savings_sd + *		for cpu. + * @flag:	The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ +	for (sd = lowest_flag_domain(cpu, flag); \ +		(sd && (sd->flags & flag)); sd = sd->parent) + +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu:	The cpu which is nominating a new idle_load_balancer. + * + * Returns:	Returns the id of the idle load balancer if it exists, + *		Else, returns >= nr_cpu_ids. 
+ * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ +	int ilb = cpumask_first(nohz.idle_cpus_mask); +	struct sched_group *ilbg; +	struct sched_domain *sd; + +	/* +	 * Have idle load balancer selection from semi-idle packages only +	 * when power-aware load balancing is enabled +	 */ +	if (!(sched_smt_power_savings || sched_mc_power_savings)) +		goto out_done; + +	/* +	 * Optimize for the case when we have no idle CPUs or only one +	 * idle CPU. Don't walk the sched_domain hierarchy in such cases +	 */ +	if (cpumask_weight(nohz.idle_cpus_mask) < 2) +		goto out_done; + +	rcu_read_lock(); +	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { +		ilbg = sd->groups; + +		do { +			if (ilbg->group_weight != +				atomic_read(&ilbg->sgp->nr_busy_cpus)) { +				ilb = cpumask_first_and(nohz.idle_cpus_mask, +							sched_group_cpus(ilbg)); +				goto unlock; +			} + +			ilbg = ilbg->next; + +		} while (ilbg != sd->groups); +	} +unlock: +	rcu_read_unlock(); + +out_done: +	if (ilb < nr_cpu_ids && idle_cpu(ilb)) +		return ilb; + +	return nr_cpu_ids; +} +#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ +	return nr_cpu_ids; +} +#endif + +/* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void nohz_balancer_kick(int cpu) +{ +	int ilb_cpu; + +	nohz.next_balance++; + +	ilb_cpu = find_new_ilb(cpu); + +	if (ilb_cpu >= nr_cpu_ids) +		return; + +	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) +		return; +	/* +	 * Use smp_send_reschedule() instead of resched_cpu(). +	 * This way we generate a sched IPI on the target cpu which +	 * is idle. And the softirq performing nohz idle load balance +	 * will be run before returning from the IPI. +	 */ +	smp_send_reschedule(ilb_cpu); +	return; +} + +static inline void set_cpu_sd_state_busy(void) +{ +	struct sched_domain *sd; +	int cpu = smp_processor_id(); + +	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) +		return; +	clear_bit(NOHZ_IDLE, nohz_flags(cpu)); + +	rcu_read_lock(); +	for_each_domain(cpu, sd) +		atomic_inc(&sd->groups->sgp->nr_busy_cpus); +	rcu_read_unlock(); +} + +void set_cpu_sd_state_idle(void) +{ +	struct sched_domain *sd; +	int cpu = smp_processor_id(); + +	if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) +		return; +	set_bit(NOHZ_IDLE, nohz_flags(cpu)); + +	rcu_read_lock(); +	for_each_domain(cpu, sd) +		atomic_dec(&sd->groups->sgp->nr_busy_cpus); +	rcu_read_unlock(); +} + +/* + * This routine will record that this cpu is going idle with tick stopped. + * This info will be used in performing idle load balancing in the future. + */ +void select_nohz_load_balancer(int stop_tick) +{ +	int cpu = smp_processor_id(); + +	if (stop_tick) { +		if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) +			return; + +		cpumask_set_cpu(cpu, nohz.idle_cpus_mask); +		atomic_inc(&nohz.nr_cpus); +		set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); +	} +	return; +} +#endif + +static DEFINE_SPINLOCK(balancing); + +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + +/* + * Scale the max load_balance interval with the number of CPUs in the system. 
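+ * With 16 online CPUs, for example, the cap becomes HZ*16/10 jiffies,
+ * i.e. 1.6 seconds.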
+ * This trades load-balance latency on larger machines for less cross talk. + */ +void update_max_interval(void) +{ +	max_load_balance_interval = HZ*num_online_cpus()/10; +} + +/* + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ +static void rebalance_domains(int cpu, enum cpu_idle_type idle) +{ +	int balance = 1; +	struct rq *rq = cpu_rq(cpu); +	unsigned long interval; +	struct sched_domain *sd; +	/* Earliest time when we have to do rebalance again */ +	unsigned long next_balance = jiffies + 60*HZ; +	int update_next_balance = 0; +	int need_serialize; + +	update_shares(cpu); + +	rcu_read_lock(); +	for_each_domain(cpu, sd) { +		if (!(sd->flags & SD_LOAD_BALANCE)) +			continue; + +		interval = sd->balance_interval; +		if (idle != CPU_IDLE) +			interval *= sd->busy_factor; + +		/* scale ms to jiffies */ +		interval = msecs_to_jiffies(interval); +		interval = clamp(interval, 1UL, max_load_balance_interval); + +		need_serialize = sd->flags & SD_SERIALIZE; + +		if (need_serialize) { +			if (!spin_trylock(&balancing)) +				goto out; +		} + +		if (time_after_eq(jiffies, sd->last_balance + interval)) { +			if (load_balance(cpu, rq, sd, idle, &balance)) { +				/* +				 * We've pulled tasks over so either we're no +				 * longer idle. +				 */ +				idle = CPU_NOT_IDLE; +			} +			sd->last_balance = jiffies; +		} +		if (need_serialize) +			spin_unlock(&balancing); +out: +		if (time_after(next_balance, sd->last_balance + interval)) { +			next_balance = sd->last_balance + interval; +			update_next_balance = 1; +		} + +		/* +		 * Stop the load balance at this level. There is another +		 * CPU in our sched group which is doing load balancing more +		 * actively. +		 */ +		if (!balance) +			break; +	} +	rcu_read_unlock(); + +	/* +	 * next_balance will be updated only when there is a need. +	 * When the cpu is attached to null domain for ex, it will not be +	 * updated. +	 */ +	if (likely(update_next_balance)) +		rq->next_balance = next_balance; +} + +#ifdef CONFIG_NO_HZ +/* + * In CONFIG_NO_HZ case, the idle balance kickee will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +{ +	struct rq *this_rq = cpu_rq(this_cpu); +	struct rq *rq; +	int balance_cpu; + +	if (idle != CPU_IDLE || +	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) +		goto end; + +	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { +		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) +			continue; + +		/* +		 * If this cpu gets work to do, stop the load balancing +		 * work being done for other cpus. Next load +		 * balancing owner will pick it up. +		 */ +		if (need_resched()) +			break; + +		raw_spin_lock_irq(&this_rq->lock); +		update_rq_clock(this_rq); +		update_cpu_load(this_rq); +		raw_spin_unlock_irq(&this_rq->lock); + +		rebalance_domains(balance_cpu, CPU_IDLE); + +		rq = cpu_rq(balance_cpu); +		if (time_after(this_rq->next_balance, rq->next_balance)) +			this_rq->next_balance = rq->next_balance; +	} +	nohz.next_balance = this_rq->next_balance; +end: +	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); +} + +/* + * Current heuristic for kicking the idle load balancer in the presence + * of an idle cpu is the system. + *   - This rq has more than one task. 
+ *   - At any scheduler domain level, this cpu's scheduler group has multiple + *     busy cpu's exceeding the group's power. + *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler + *     domain span are idle. + */ +static inline int nohz_kick_needed(struct rq *rq, int cpu) +{ +	unsigned long now = jiffies; +	struct sched_domain *sd; + +	if (unlikely(idle_cpu(cpu))) +		return 0; + +       /* +	* We may be recently in ticked or tickless idle mode. At the first +	* busy tick after returning from idle, we will update the busy stats. +	*/ +	set_cpu_sd_state_busy(); +	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { +		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); +		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); +		atomic_dec(&nohz.nr_cpus); +	} + +	/* +	 * None are in tickless mode and hence no need for NOHZ idle load +	 * balancing. +	 */ +	if (likely(!atomic_read(&nohz.nr_cpus))) +		return 0; + +	if (time_before(now, nohz.next_balance)) +		return 0; + +	if (rq->nr_running >= 2) +		goto need_kick; + +	rcu_read_lock(); +	for_each_domain(cpu, sd) { +		struct sched_group *sg = sd->groups; +		struct sched_group_power *sgp = sg->sgp; +		int nr_busy = atomic_read(&sgp->nr_busy_cpus); + +		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) +			goto need_kick_unlock; + +		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight +		    && (cpumask_first_and(nohz.idle_cpus_mask, +					  sched_domain_span(sd)) < cpu)) +			goto need_kick_unlock; + +		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) +			break; +	} +	rcu_read_unlock(); +	return 0; + +need_kick_unlock: +	rcu_read_unlock(); +need_kick: +	return 1; +} +#else +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +#endif + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ +static void run_rebalance_domains(struct softirq_action *h) +{ +	int this_cpu = smp_processor_id(); +	struct rq *this_rq = cpu_rq(this_cpu); +	enum cpu_idle_type idle = this_rq->idle_balance ? +						CPU_IDLE : CPU_NOT_IDLE; + +	rebalance_domains(this_cpu, idle); + +	/* +	 * If this cpu has a pending nohz_balance_kick, then do the +	 * balancing on behalf of the other idle cpus whose ticks are +	 * stopped. +	 */ +	nohz_idle_balance(this_cpu, idle); +} + +static inline int on_null_domain(int cpu) +{ +	return !rcu_dereference_sched(cpu_rq(cpu)->sd); +} + +/* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 
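+ * Under CONFIG_NO_HZ it is also the point where an idle CPU may be
+ * kicked to run the nohz idle balance on behalf of the tickless CPUs.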
+ */ +void trigger_load_balance(struct rq *rq, int cpu) +{ +	/* Don't need to rebalance while attached to NULL domain */ +	if (time_after_eq(jiffies, rq->next_balance) && +	    likely(!on_null_domain(cpu))) +		raise_softirq(SCHED_SOFTIRQ); +#ifdef CONFIG_NO_HZ +	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) +		nohz_balancer_kick(cpu); +#endif +} + +static void rq_online_fair(struct rq *rq) +{ +	update_sysctl(); +} + +static void rq_offline_fair(struct rq *rq) +{ +	update_sysctl(); +} + +#endif /* CONFIG_SMP */ + +/* + * scheduler tick hitting a task of our scheduling class: + */ +static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) +{ +	struct cfs_rq *cfs_rq; +	struct sched_entity *se = &curr->se; + +	for_each_sched_entity(se) { +		cfs_rq = cfs_rq_of(se); +		entity_tick(cfs_rq, se, queued); +	} +} + +/* + * called on fork with the child task as argument from the parent's context + *  - child not yet on the tasklist + *  - preemption disabled + */ +static void task_fork_fair(struct task_struct *p) +{ +	struct cfs_rq *cfs_rq; +	struct sched_entity *se = &p->se, *curr; +	int this_cpu = smp_processor_id(); +	struct rq *rq = this_rq(); +	unsigned long flags; + +	raw_spin_lock_irqsave(&rq->lock, flags); + +	update_rq_clock(rq); + +	cfs_rq = task_cfs_rq(current); +	curr = cfs_rq->curr; + +	if (unlikely(task_cpu(p) != this_cpu)) { +		rcu_read_lock(); +		__set_task_cpu(p, this_cpu); +		rcu_read_unlock(); +	} + +	update_curr(cfs_rq); + +	if (curr) +		se->vruntime = curr->vruntime; +	place_entity(cfs_rq, se, 1); + +	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { +		/* +		 * Upon rescheduling, sched_class::put_prev_task() will place +		 * 'current' within the tree based on its new key value. +		 */ +		swap(curr->vruntime, se->vruntime); +		resched_task(rq->curr); +	} + +	se->vruntime -= cfs_rq->min_vruntime; + +	raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +/* + * Priority of the task has changed. Check to see if we preempt + * the current task. + */ +static void +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +{ +	if (!p->se.on_rq) +		return; + +	/* +	 * Reschedule if we are currently running on this runqueue and +	 * our priority decreased, or if we are not currently running on +	 * this runqueue and our priority is higher than the current's +	 */ +	if (rq->curr == p) { +		if (p->prio > oldprio) +			resched_task(rq->curr); +	} else +		check_preempt_curr(rq, p, 0); +} + +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ +	struct sched_entity *se = &p->se; +	struct cfs_rq *cfs_rq = cfs_rq_of(se); + +	/* +	 * Ensure the task's vruntime is normalized, so that when its +	 * switched back to the fair class the enqueue_entity(.flags=0) will +	 * do the right thing. +	 * +	 * If it was on_rq, then the dequeue_entity(.flags=0) will already +	 * have normalized the vruntime, if it was !on_rq, then only when +	 * the task is sleeping will it still have non-normalized vruntime. +	 */ +	if (!se->on_rq && p->state != TASK_RUNNING) { +		/* +		 * Fix up our vruntime so that the current sleep doesn't +		 * cause 'unlimited' sleep bonus. +		 */ +		place_entity(cfs_rq, se, 0); +		se->vruntime -= cfs_rq->min_vruntime; +	} +} + +/* + * We switched to the sched_fair class. 
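+ * This happens, for instance, when sched_setscheduler() moves a task
+ * back to a fair policy, or when a priority-inheritance boost to an
+ * rt priority ends.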
+ */ +static void switched_to_fair(struct rq *rq, struct task_struct *p) +{ +	if (!p->se.on_rq) +		return; + +	/* +	 * We were most likely switched from sched_rt, so +	 * kick off the schedule if running, otherwise just see +	 * if we can still preempt the current task. +	 */ +	if (rq->curr == p) +		resched_task(rq->curr); +	else +		check_preempt_curr(rq, p, 0); +} + +/* Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_curr_task_fair(struct rq *rq) +{ +	struct sched_entity *se = &rq->curr->se; + +	for_each_sched_entity(se) { +		struct cfs_rq *cfs_rq = cfs_rq_of(se); + +		set_next_entity(cfs_rq, se); +		/* ensure bandwidth has been allocated on our new cfs_rq */ +		account_cfs_rq_runtime(cfs_rq, 0); +	} +} + +void init_cfs_rq(struct cfs_rq *cfs_rq) +{ +	cfs_rq->tasks_timeline = RB_ROOT; +	INIT_LIST_HEAD(&cfs_rq->tasks); +	cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +#ifndef CONFIG_64BIT +	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void task_move_group_fair(struct task_struct *p, int on_rq) +{ +	/* +	 * If the task was not on the rq at the time of this cgroup movement +	 * it must have been asleep, sleeping tasks keep their ->vruntime +	 * absolute on their old rq until wakeup (needed for the fair sleeper +	 * bonus in place_entity()). +	 * +	 * If it was on the rq, we've just 'preempted' it, which does convert +	 * ->vruntime to a relative base. +	 * +	 * Make sure both cases convert their relative position when migrating +	 * to another cgroup's rq. This does somewhat interfere with the +	 * fair sleeper stuff for the first placement, but who cares. +	 */ +	/* +	 * When !on_rq, vruntime of the task has usually NOT been normalized. +	 * But there are some cases where it has already been normalized: +	 * +	 * - Moving a forked child which is waiting for being woken up by +	 *   wake_up_new_task(). +	 * - Moving a task which has been woken up by try_to_wake_up() and +	 *   waiting for actually being woken up by sched_ttwu_pending(). +	 * +	 * To prevent boost or penalty in the new cfs_rq caused by delta +	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 
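+	 * Both cases are detected below via a zero sum_exec_runtime
+	 * (freshly forked child) or p->state == TASK_WAKING (woken but
+	 * not yet enqueued), and are treated as if the task were on_rq.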
+	 */ +	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) +		on_rq = 1; + +	if (!on_rq) +		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; +	set_task_rq(p, task_cpu(p)); +	if (!on_rq) +		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; +} + +void free_fair_sched_group(struct task_group *tg) +{ +	int i; + +	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + +	for_each_possible_cpu(i) { +		if (tg->cfs_rq) +			kfree(tg->cfs_rq[i]); +		if (tg->se) +			kfree(tg->se[i]); +	} + +	kfree(tg->cfs_rq); +	kfree(tg->se); +} + +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ +	struct cfs_rq *cfs_rq; +	struct sched_entity *se; +	int i; + +	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); +	if (!tg->cfs_rq) +		goto err; +	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); +	if (!tg->se) +		goto err; + +	tg->shares = NICE_0_LOAD; + +	init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + +	for_each_possible_cpu(i) { +		cfs_rq = kzalloc_node(sizeof(struct cfs_rq), +				      GFP_KERNEL, cpu_to_node(i)); +		if (!cfs_rq) +			goto err; + +		se = kzalloc_node(sizeof(struct sched_entity), +				  GFP_KERNEL, cpu_to_node(i)); +		if (!se) +			goto err_free_rq; + +		init_cfs_rq(cfs_rq); +		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); +	} + +	return 1; + +err_free_rq: +	kfree(cfs_rq); +err: +	return 0; +} + +void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long flags; + +	/* +	* Only empty task groups can be destroyed; so we can speculatively +	* check on_list without danger of it being re-added. +	*/ +	if (!tg->cfs_rq[cpu]->on_list) +		return; + +	raw_spin_lock_irqsave(&rq->lock, flags); +	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); +	raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, +			struct sched_entity *se, int cpu, +			struct sched_entity *parent) +{ +	struct rq *rq = cpu_rq(cpu); + +	cfs_rq->tg = tg; +	cfs_rq->rq = rq; +#ifdef CONFIG_SMP +	/* allow initial update_cfs_load() to truncate */ +	cfs_rq->load_stamp = 1; +#endif +	init_cfs_rq_runtime(cfs_rq); + +	tg->cfs_rq[cpu] = cfs_rq; +	tg->se[cpu] = se; + +	/* se could be NULL for root_task_group */ +	if (!se) +		return; + +	if (!parent) +		se->cfs_rq = &rq->cfs; +	else +		se->cfs_rq = parent->my_q; + +	se->my_q = cfs_rq; +	update_load_set(&se->load, 0); +	se->parent = parent; +} + +static DEFINE_MUTEX(shares_mutex); + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ +	int i; +	unsigned long flags; + +	/* +	 * We can't change the weight of the root cgroup. 
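+	 * (root_task_group has no per-cpu sched entities, hence the
+	 *  tg->se[0] check below.)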
+	 */ +	if (!tg->se[0]) +		return -EINVAL; + +	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); + +	mutex_lock(&shares_mutex); +	if (tg->shares == shares) +		goto done; + +	tg->shares = shares; +	for_each_possible_cpu(i) { +		struct rq *rq = cpu_rq(i); +		struct sched_entity *se; + +		se = tg->se[i]; +		/* Propagate contribution to hierarchy */ +		raw_spin_lock_irqsave(&rq->lock, flags); +		for_each_sched_entity(se) +			update_cfs_shares(group_cfs_rq(se)); +		raw_spin_unlock_irqrestore(&rq->lock, flags); +	} + +done: +	mutex_unlock(&shares_mutex); +	return 0; +} +#else /* CONFIG_FAIR_GROUP_SCHED */ + +void free_fair_sched_group(struct task_group *tg) { } + +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ +	return 1; +} + +void unregister_fair_sched_group(struct task_group *tg, int cpu) { } + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + +static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) +{ +	struct sched_entity *se = &task->se; +	unsigned int rr_interval = 0; + +	/* +	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise +	 * idle runqueue: +	 */ +	if (rq->cfs.load.weight) +		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + +	return rr_interval; +} + +/* + * All the scheduling class methods: + */ +const struct sched_class fair_sched_class = { +	.next			= &idle_sched_class, +	.enqueue_task		= enqueue_task_fair, +	.dequeue_task		= dequeue_task_fair, +	.yield_task		= yield_task_fair, +	.yield_to_task		= yield_to_task_fair, + +	.check_preempt_curr	= check_preempt_wakeup, + +	.pick_next_task		= pick_next_task_fair, +	.put_prev_task		= put_prev_task_fair, + +#ifdef CONFIG_SMP +	.select_task_rq		= select_task_rq_fair, + +	.rq_online		= rq_online_fair, +	.rq_offline		= rq_offline_fair, + +	.task_waking		= task_waking_fair, +#endif + +	.set_curr_task          = set_curr_task_fair, +	.task_tick		= task_tick_fair, +	.task_fork		= task_fork_fair, + +	.prio_changed		= prio_changed_fair, +	.switched_from		= switched_from_fair, +	.switched_to		= switched_to_fair, + +	.get_rr_interval	= get_rr_interval_fair, + +#ifdef CONFIG_FAIR_GROUP_SCHED +	.task_move_group	= task_move_group_fair, +#endif +}; + +#ifdef CONFIG_SCHED_DEBUG +void print_cfs_stats(struct seq_file *m, int cpu) +{ +	struct cfs_rq *cfs_rq; + +	rcu_read_lock(); +	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) +		print_cfs_rq(m, cpu, cfs_rq); +	rcu_read_unlock(); +} +#endif + +__init void init_sched_fair_class(void) +{ +#ifdef CONFIG_SMP +	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); + +#ifdef CONFIG_NO_HZ +	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); +#endif +#endif /* SMP */ + +} diff --git a/kernel/sched/features.h b/kernel/sched/features.h new file mode 100644 index 00000000000..e61fd73913d --- /dev/null +++ b/kernel/sched/features.h @@ -0,0 +1,70 @@ +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ +SCHED_FEAT(START_DEBIT, true) + +/* + * Based on load and program behaviour, see if it makes sense to place + * a newly woken task on the same cpu as the task that woke it -- + * improve cache locality. Typically used with SYNC wakeups as + * generated by pipes and the like, see also SYNC_WAKEUPS. 
+ */ +SCHED_FEAT(AFFINE_WAKEUPS, true) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, false) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, true) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */ +SCHED_FEAT(CACHE_HOT_BUDDY, true) + +/* + * Use arch dependent cpu power functions + */ +SCHED_FEAT(ARCH_POWER, false) + +SCHED_FEAT(HRTICK, false) +SCHED_FEAT(DOUBLE_TICK, false) +SCHED_FEAT(LB_BIAS, true) + +/* + * Spin-wait on mutex acquisition when the mutex owner is running on + * another cpu -- assumes that when the owner is running, it will soon + * release the lock. Decreases scheduling overhead. + */ +SCHED_FEAT(OWNER_SPIN, true) + +/* + * Decrement CPU power based on time not spent running tasks + */ +SCHED_FEAT(NONTASK_POWER, true) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, true) + +SCHED_FEAT(FORCE_SD_OVERLAP, false) +SCHED_FEAT(RT_RUNTIME_SHARE, true) diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c new file mode 100644 index 00000000000..91b4c957f28 --- /dev/null +++ b/kernel/sched/idle_task.c @@ -0,0 +1,99 @@ +#include "sched.h" + +/* + * idle-task scheduling class. + * + * (NOTE: these are not related to SCHED_IDLE tasks which are + *  handled in sched_fair.c) + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) +{ +	return task_cpu(p); /* IDLE tasks as never migrated */ +} +#endif /* CONFIG_SMP */ +/* + * Idle tasks are unconditionally rescheduled: + */ +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) +{ +	resched_task(rq->idle); +} + +static struct task_struct *pick_next_task_idle(struct rq *rq) +{ +	schedstat_inc(rq, sched_goidle); +	calc_load_account_idle(rq); +	return rq->idle; +} + +/* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it: + */ +static void +dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) +{ +	raw_spin_unlock_irq(&rq->lock); +	printk(KERN_ERR "bad: scheduling from the idle thread!\n"); +	dump_stack(); +	raw_spin_lock_irq(&rq->lock); +} + +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +{ +} + +static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_idle(struct rq *rq) +{ +} + +static void switched_to_idle(struct rq *rq, struct task_struct *p) +{ +	BUG(); +} + +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +{ +	BUG(); +} + +static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) +{ +	return 0; +} + +/* + * Simple, special scheduling class for the per-CPU idle tasks: + */ +const struct sched_class idle_sched_class = { +	/* .next is NULL */ +	/* no enqueue/yield_task for idle tasks */ + +	/* dequeue is not valid, we print a debug message there: */ +	.dequeue_task		= dequeue_task_idle, + +	.check_preempt_curr	= check_preempt_curr_idle, + +	.pick_next_task		= pick_next_task_idle, +	.put_prev_task		= put_prev_task_idle, + +#ifdef CONFIG_SMP +	
.select_task_rq		= select_task_rq_idle, +#endif + +	.set_curr_task          = set_curr_task_idle, +	.task_tick		= task_tick_idle, + +	.get_rr_interval	= get_rr_interval_idle, + +	.prio_changed		= prio_changed_idle, +	.switched_to		= switched_to_idle, +}; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c new file mode 100644 index 00000000000..3640ebbb466 --- /dev/null +++ b/kernel/sched/rt.c @@ -0,0 +1,2048 @@ +/* + * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR + * policies) + */ + +#include "sched.h" + +#include <linux/slab.h> + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +struct rt_bandwidth def_rt_bandwidth; + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ +	struct rt_bandwidth *rt_b = +		container_of(timer, struct rt_bandwidth, rt_period_timer); +	ktime_t now; +	int overrun; +	int idle = 0; + +	for (;;) { +		now = hrtimer_cb_get_time(timer); +		overrun = hrtimer_forward(timer, now, rt_b->rt_period); + +		if (!overrun) +			break; + +		idle = do_sched_rt_period_timer(rt_b, overrun); +	} + +	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ +	rt_b->rt_period = ns_to_ktime(period); +	rt_b->rt_runtime = runtime; + +	raw_spin_lock_init(&rt_b->rt_runtime_lock); + +	hrtimer_init(&rt_b->rt_period_timer, +			CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	rt_b->rt_period_timer.function = sched_rt_period_timer; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ +	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) +		return; + +	if (hrtimer_active(&rt_b->rt_period_timer)) +		return; + +	raw_spin_lock(&rt_b->rt_runtime_lock); +	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); +	raw_spin_unlock(&rt_b->rt_runtime_lock); +} + +void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ +	struct rt_prio_array *array; +	int i; + +	array = &rt_rq->active; +	for (i = 0; i < MAX_RT_PRIO; i++) { +		INIT_LIST_HEAD(array->queue + i); +		__clear_bit(i, array->bitmap); +	} +	/* delimiter for bitsearch: */ +	__set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP +	rt_rq->highest_prio.curr = MAX_RT_PRIO; +	rt_rq->highest_prio.next = MAX_RT_PRIO; +	rt_rq->rt_nr_migratory = 0; +	rt_rq->overloaded = 0; +	plist_head_init(&rt_rq->pushable_tasks); +#endif + +	rt_rq->rt_time = 0; +	rt_rq->rt_throttled = 0; +	rt_rq->rt_runtime = 0; +	raw_spin_lock_init(&rt_rq->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ +	hrtimer_cancel(&rt_b->rt_period_timer); +} + +#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) + +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_SCHED_DEBUG +	WARN_ON_ONCE(!rt_entity_is_task(rt_se)); +#endif +	return container_of(rt_se, struct task_struct, rt); +} + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ +	return rt_rq->rq; +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ +	return rt_se->rt_rq; +} + +void free_rt_sched_group(struct task_group *tg) +{ +	int i; + +	if (tg->rt_se) +		destroy_rt_bandwidth(&tg->rt_bandwidth); + +	for_each_possible_cpu(i) { +		if (tg->rt_rq) +			kfree(tg->rt_rq[i]); +		if (tg->rt_se) +			kfree(tg->rt_se[i]); +	} + +	kfree(tg->rt_rq); +	kfree(tg->rt_se); +} + +void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, +		struct sched_rt_entity *rt_se, int cpu, +		struct sched_rt_entity *parent) +{ +	struct 
rq *rq = cpu_rq(cpu); + +	rt_rq->highest_prio.curr = MAX_RT_PRIO; +	rt_rq->rt_nr_boosted = 0; +	rt_rq->rq = rq; +	rt_rq->tg = tg; + +	tg->rt_rq[cpu] = rt_rq; +	tg->rt_se[cpu] = rt_se; + +	if (!rt_se) +		return; + +	if (!parent) +		rt_se->rt_rq = &rq->rt; +	else +		rt_se->rt_rq = parent->my_q; + +	rt_se->my_q = rt_rq; +	rt_se->parent = parent; +	INIT_LIST_HEAD(&rt_se->run_list); +} + +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ +	struct rt_rq *rt_rq; +	struct sched_rt_entity *rt_se; +	int i; + +	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); +	if (!tg->rt_rq) +		goto err; +	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); +	if (!tg->rt_se) +		goto err; + +	init_rt_bandwidth(&tg->rt_bandwidth, +			ktime_to_ns(def_rt_bandwidth.rt_period), 0); + +	for_each_possible_cpu(i) { +		rt_rq = kzalloc_node(sizeof(struct rt_rq), +				     GFP_KERNEL, cpu_to_node(i)); +		if (!rt_rq) +			goto err; + +		rt_se = kzalloc_node(sizeof(struct sched_rt_entity), +				     GFP_KERNEL, cpu_to_node(i)); +		if (!rt_se) +			goto err_free_rq; + +		init_rt_rq(rt_rq, cpu_rq(i)); +		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; +		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); +	} + +	return 1; + +err_free_rq: +	kfree(rt_rq); +err: +	return 0; +} + +#else /* CONFIG_RT_GROUP_SCHED */ + +#define rt_entity_is_task(rt_se) (1) + +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ +	return container_of(rt_se, struct task_struct, rt); +} + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ +	return container_of(rt_rq, struct rq, rt); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ +	struct task_struct *p = rt_task_of(rt_se); +	struct rq *rq = task_rq(p); + +	return &rq->rt; +} + +void free_rt_sched_group(struct task_group *tg) { } + +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ +	return 1; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_SMP + +static inline int rt_overloaded(struct rq *rq) +{ +	return atomic_read(&rq->rd->rto_count); +} + +static inline void rt_set_overload(struct rq *rq) +{ +	if (!rq->online) +		return; + +	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); +	/* +	 * Make sure the mask is visible before we set +	 * the overload count. That is checked to determine +	 * if we should look at the mask. It would be a shame +	 * if we looked at the mask, but the mask was not +	 * updated yet. 
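+	 * Hence the wmb() below, ordering the rto_mask update before
+	 * the rto_count increment.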
+	 */ +	wmb(); +	atomic_inc(&rq->rd->rto_count); +} + +static inline void rt_clear_overload(struct rq *rq) +{ +	if (!rq->online) +		return; + +	/* the order here really doesn't matter */ +	atomic_dec(&rq->rd->rto_count); +	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); +} + +static void update_rt_migration(struct rt_rq *rt_rq) +{ +	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { +		if (!rt_rq->overloaded) { +			rt_set_overload(rq_of_rt_rq(rt_rq)); +			rt_rq->overloaded = 1; +		} +	} else if (rt_rq->overloaded) { +		rt_clear_overload(rq_of_rt_rq(rt_rq)); +		rt_rq->overloaded = 0; +	} +} + +static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	if (!rt_entity_is_task(rt_se)) +		return; + +	rt_rq = &rq_of_rt_rq(rt_rq)->rt; + +	rt_rq->rt_nr_total++; +	if (rt_se->nr_cpus_allowed > 1) +		rt_rq->rt_nr_migratory++; + +	update_rt_migration(rt_rq); +} + +static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	if (!rt_entity_is_task(rt_se)) +		return; + +	rt_rq = &rq_of_rt_rq(rt_rq)->rt; + +	rt_rq->rt_nr_total--; +	if (rt_se->nr_cpus_allowed > 1) +		rt_rq->rt_nr_migratory--; + +	update_rt_migration(rt_rq); +} + +static inline int has_pushable_tasks(struct rq *rq) +{ +	return !plist_head_empty(&rq->rt.pushable_tasks); +} + +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ +	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); +	plist_node_init(&p->pushable_tasks, p->prio); +	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); + +	/* Update the highest prio pushable task */ +	if (p->prio < rq->rt.highest_prio.next) +		rq->rt.highest_prio.next = p->prio; +} + +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ +	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); + +	/* Update the new highest prio pushable task */ +	if (has_pushable_tasks(rq)) { +		p = plist_first_entry(&rq->rt.pushable_tasks, +				      struct task_struct, pushable_tasks); +		rq->rt.highest_prio.next = p->prio; +	} else +		rq->rt.highest_prio.next = MAX_RT_PRIO; +} + +#else + +static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +} + +static inline +void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +} + +#endif /* CONFIG_SMP */ + +static inline int on_rt_rq(struct sched_rt_entity *rt_se) +{ +	return !list_empty(&rt_se->run_list); +} + +#ifdef CONFIG_RT_GROUP_SCHED + +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +{ +	if (!rt_rq->tg) +		return RUNTIME_INF; + +	return rt_rq->rt_runtime; +} + +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ +	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); +} + +typedef struct task_group *rt_rq_iter_t; + +static inline struct task_group *next_task_group(struct task_group *tg) +{ +	do { +		tg = list_entry_rcu(tg->list.next, +			typeof(struct task_group), list); +	} while (&tg->list != &task_groups && task_group_is_autogroup(tg)); + +	if (&tg->list == &task_groups) +		tg = NULL; + +	return tg; +} + +#define for_each_rt_rq(rt_rq, iter, rq)					\ +	for (iter = container_of(&task_groups, typeof(*iter), list);	\ +		(iter = next_task_group(iter)) &&			\ +		(rt_rq = iter->rt_rq[cpu_of(rq)]);) + +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ +	list_add_rcu(&rt_rq->leaf_rt_rq_list, +			
&rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ +	list_del_rcu(&rt_rq->leaf_rt_rq_list); +} + +#define for_each_leaf_rt_rq(rt_rq, rq) \ +	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) + +#define for_each_sched_rt_entity(rt_se) \ +	for (; rt_se; rt_se = rt_se->parent) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ +	return rt_se->my_q; +} + +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se); + +static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +{ +	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; +	struct sched_rt_entity *rt_se; + +	int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + +	rt_se = rt_rq->tg->rt_se[cpu]; + +	if (rt_rq->rt_nr_running) { +		if (rt_se && !on_rt_rq(rt_se)) +			enqueue_rt_entity(rt_se, false); +		if (rt_rq->highest_prio.curr < curr->prio) +			resched_task(curr); +	} +} + +static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +{ +	struct sched_rt_entity *rt_se; +	int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + +	rt_se = rt_rq->tg->rt_se[cpu]; + +	if (rt_se && on_rt_rq(rt_se)) +		dequeue_rt_entity(rt_se); +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ +	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} + +static int rt_se_boosted(struct sched_rt_entity *rt_se) +{ +	struct rt_rq *rt_rq = group_rt_rq(rt_se); +	struct task_struct *p; + +	if (rt_rq) +		return !!rt_rq->rt_nr_boosted; + +	p = rt_task_of(rt_se); +	return p->prio != p->normal_prio; +} + +#ifdef CONFIG_SMP +static inline const struct cpumask *sched_rt_period_mask(void) +{ +	return cpu_rq(smp_processor_id())->rd->span; +} +#else +static inline const struct cpumask *sched_rt_period_mask(void) +{ +	return cpu_online_mask; +} +#endif + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ +	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; +} + +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ +	return &rt_rq->tg->rt_bandwidth; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ + +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +{ +	return rt_rq->rt_runtime; +} + +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ +	return ktime_to_ns(def_rt_bandwidth.rt_period); +} + +typedef struct rt_rq *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ +	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + +#define for_each_leaf_rt_rq(rt_rq, rq) \ +	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +#define for_each_sched_rt_entity(rt_se) \ +	for (; rt_se; rt_se = NULL) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ +	return NULL; +} + +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +{ +	if (rt_rq->rt_nr_running) +		resched_task(rq_of_rt_rq(rt_rq)->curr); +} + +static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +{ +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ +	return rt_rq->rt_throttled; +} + +static inline const struct cpumask *sched_rt_period_mask(void) +{ +	return cpu_online_mask; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ +	return &cpu_rq(cpu)->rt; +} + +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ +	return &def_rt_bandwidth; +} + 
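+/*
+ * Without RT group scheduling, every rq's rt_rq is throttled against the
+ * single global def_rt_bandwidth declared near the top of this file.
+ */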
+#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_SMP +/* + * We ran out of runtime, see if we can borrow some from our neighbours. + */ +static int do_balance_runtime(struct rt_rq *rt_rq) +{ +	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); +	struct root_domain *rd = cpu_rq(smp_processor_id())->rd; +	int i, weight, more = 0; +	u64 rt_period; + +	weight = cpumask_weight(rd->span); + +	raw_spin_lock(&rt_b->rt_runtime_lock); +	rt_period = ktime_to_ns(rt_b->rt_period); +	for_each_cpu(i, rd->span) { +		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); +		s64 diff; + +		if (iter == rt_rq) +			continue; + +		raw_spin_lock(&iter->rt_runtime_lock); +		/* +		 * Either all rqs have inf runtime and there's nothing to steal +		 * or __disable_runtime() below sets a specific rq to inf to +		 * indicate its been disabled and disalow stealing. +		 */ +		if (iter->rt_runtime == RUNTIME_INF) +			goto next; + +		/* +		 * From runqueues with spare time, take 1/n part of their +		 * spare time, but no more than our period. +		 */ +		diff = iter->rt_runtime - iter->rt_time; +		if (diff > 0) { +			diff = div_u64((u64)diff, weight); +			if (rt_rq->rt_runtime + diff > rt_period) +				diff = rt_period - rt_rq->rt_runtime; +			iter->rt_runtime -= diff; +			rt_rq->rt_runtime += diff; +			more = 1; +			if (rt_rq->rt_runtime == rt_period) { +				raw_spin_unlock(&iter->rt_runtime_lock); +				break; +			} +		} +next: +		raw_spin_unlock(&iter->rt_runtime_lock); +	} +	raw_spin_unlock(&rt_b->rt_runtime_lock); + +	return more; +} + +/* + * Ensure this RQ takes back all the runtime it lend to its neighbours. + */ +static void __disable_runtime(struct rq *rq) +{ +	struct root_domain *rd = rq->rd; +	rt_rq_iter_t iter; +	struct rt_rq *rt_rq; + +	if (unlikely(!scheduler_running)) +		return; + +	for_each_rt_rq(rt_rq, iter, rq) { +		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); +		s64 want; +		int i; + +		raw_spin_lock(&rt_b->rt_runtime_lock); +		raw_spin_lock(&rt_rq->rt_runtime_lock); +		/* +		 * Either we're all inf and nobody needs to borrow, or we're +		 * already disabled and thus have nothing to do, or we have +		 * exactly the right amount of runtime to take out. +		 */ +		if (rt_rq->rt_runtime == RUNTIME_INF || +				rt_rq->rt_runtime == rt_b->rt_runtime) +			goto balanced; +		raw_spin_unlock(&rt_rq->rt_runtime_lock); + +		/* +		 * Calculate the difference between what we started out with +		 * and what we current have, that's the amount of runtime +		 * we lend and now have to reclaim. +		 */ +		want = rt_b->rt_runtime - rt_rq->rt_runtime; + +		/* +		 * Greedy reclaim, take back as much as we can. +		 */ +		for_each_cpu(i, rd->span) { +			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); +			s64 diff; + +			/* +			 * Can't reclaim from ourselves or disabled runqueues. +			 */ +			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) +				continue; + +			raw_spin_lock(&iter->rt_runtime_lock); +			if (want > 0) { +				diff = min_t(s64, iter->rt_runtime, want); +				iter->rt_runtime -= diff; +				want -= diff; +			} else { +				iter->rt_runtime -= want; +				want -= want; +			} +			raw_spin_unlock(&iter->rt_runtime_lock); + +			if (!want) +				break; +		} + +		raw_spin_lock(&rt_rq->rt_runtime_lock); +		/* +		 * We cannot be left wanting - that would mean some runtime +		 * leaked out of the system. +		 */ +		BUG_ON(want); +balanced: +		/* +		 * Disable all the borrow logic by pretending we have inf +		 * runtime - in which case borrowing doesn't make sense. 
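As a rough illustration of the 1/n borrowing step in do_balance_runtime() above, here is a standalone userspace sketch with the locking stripped out; borrow_from_neighbour() and the example numbers are assumptions for illustration, not kernel API:

#include <stdio.h>
#include <stdint.h>

/* Borrow up to 1/weight of a neighbour's spare runtime, but never let the
 * borrower exceed its full period (mirrors the diff/weight/cap steps above). */
static uint64_t borrow_from_neighbour(uint64_t *self_runtime,
				      uint64_t neigh_runtime,
				      uint64_t neigh_used,
				      uint64_t period,
				      unsigned int weight)
{
	int64_t spare = (int64_t)(neigh_runtime - neigh_used);
	uint64_t diff;

	if (spare <= 0)
		return 0;

	diff = (uint64_t)spare / weight;	/* take 1/n of the spare time */
	if (*self_runtime + diff > period)	/* ...but no more than our period */
		diff = period - *self_runtime;

	*self_runtime += diff;
	return diff;
}

int main(void)
{
	/* Assumed example numbers: 95ms/100ms runtime/period, 4 CPUs in the span. */
	uint64_t my_runtime = 95ULL * 1000 * 1000;
	uint64_t period     = 100ULL * 1000 * 1000;
	uint64_t got = borrow_from_neighbour(&my_runtime,
					     95ULL * 1000 * 1000, /* neighbour's quota */
					     20ULL * 1000 * 1000, /* neighbour's usage  */
					     period, 4);

	printf("borrowed %llu ns, runtime now %llu ns\n",
	       (unsigned long long)got, (unsigned long long)my_runtime);
	return 0;
}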
+		 */ +		rt_rq->rt_runtime = RUNTIME_INF; +		raw_spin_unlock(&rt_rq->rt_runtime_lock); +		raw_spin_unlock(&rt_b->rt_runtime_lock); +	} +} + +static void disable_runtime(struct rq *rq) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&rq->lock, flags); +	__disable_runtime(rq); +	raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +static void __enable_runtime(struct rq *rq) +{ +	rt_rq_iter_t iter; +	struct rt_rq *rt_rq; + +	if (unlikely(!scheduler_running)) +		return; + +	/* +	 * Reset each runqueue's bandwidth settings +	 */ +	for_each_rt_rq(rt_rq, iter, rq) { +		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + +		raw_spin_lock(&rt_b->rt_runtime_lock); +		raw_spin_lock(&rt_rq->rt_runtime_lock); +		rt_rq->rt_runtime = rt_b->rt_runtime; +		rt_rq->rt_time = 0; +		rt_rq->rt_throttled = 0; +		raw_spin_unlock(&rt_rq->rt_runtime_lock); +		raw_spin_unlock(&rt_b->rt_runtime_lock); +	} +} + +static void enable_runtime(struct rq *rq) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&rq->lock, flags); +	__enable_runtime(rq); +	raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ +	int cpu = (int)(long)hcpu; + +	switch (action) { +	case CPU_DOWN_PREPARE: +	case CPU_DOWN_PREPARE_FROZEN: +		disable_runtime(cpu_rq(cpu)); +		return NOTIFY_OK; + +	case CPU_DOWN_FAILED: +	case CPU_DOWN_FAILED_FROZEN: +	case CPU_ONLINE: +	case CPU_ONLINE_FROZEN: +		enable_runtime(cpu_rq(cpu)); +		return NOTIFY_OK; + +	default: +		return NOTIFY_DONE; +	} +} + +static int balance_runtime(struct rt_rq *rt_rq) +{ +	int more = 0; + +	if (!sched_feat(RT_RUNTIME_SHARE)) +		return more; + +	if (rt_rq->rt_time > rt_rq->rt_runtime) { +		raw_spin_unlock(&rt_rq->rt_runtime_lock); +		more = do_balance_runtime(rt_rq); +		raw_spin_lock(&rt_rq->rt_runtime_lock); +	} + +	return more; +} +#else /* !CONFIG_SMP */ +static inline int balance_runtime(struct rt_rq *rt_rq) +{ +	return 0; +} +#endif /* CONFIG_SMP */ + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) +{ +	int i, idle = 1; +	const struct cpumask *span; + +	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) +		return 1; + +	span = sched_rt_period_mask(); +	for_each_cpu(i, span) { +		int enqueue = 0; +		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); +		struct rq *rq = rq_of_rt_rq(rt_rq); + +		raw_spin_lock(&rq->lock); +		if (rt_rq->rt_time) { +			u64 runtime; + +			raw_spin_lock(&rt_rq->rt_runtime_lock); +			if (rt_rq->rt_throttled) +				balance_runtime(rt_rq); +			runtime = rt_rq->rt_runtime; +			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); +			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { +				rt_rq->rt_throttled = 0; +				enqueue = 1; + +				/* +				 * Force a clock update if the CPU was idle, +				 * lest wakeup -> unthrottle time accumulate. 
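The replenishment step in do_sched_rt_period_timer() above can be read as: each elapsed period forgives up to one runtime quantum of accumulated rt_time, and a throttled queue is released once it is back under budget. A minimal sketch of just that arithmetic (replenish() and the numbers are illustrative assumptions):

#include <stdio.h>
#include <stdint.h>

#define MIN_U64(a, b) ((a) < (b) ? (a) : (b))

/* One timer expiry: forgive up to overrun*runtime of accumulated rt_time,
 * and report whether a throttled queue may run again. */
static int replenish(uint64_t *rt_time, uint64_t runtime,
		     int *throttled, int overrun)
{
	*rt_time -= MIN_U64(*rt_time, (uint64_t)overrun * runtime);

	if (*throttled && *rt_time < runtime) {
		*throttled = 0;
		return 1;		/* caller would re-enqueue the rt_rq */
	}
	return 0;
}

int main(void)
{
	/* Assumed: 95ms budget, queue over budget by 30ms and throttled. */
	uint64_t rt_time = 125ULL * 1000 * 1000;
	uint64_t runtime =  95ULL * 1000 * 1000;
	int throttled = 1;

	if (replenish(&rt_time, runtime, &throttled, 1))
		printf("unthrottled, rt_time now %llu ns\n",
		       (unsigned long long)rt_time);
	return 0;
}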
+				 */ +				if (rt_rq->rt_nr_running && rq->curr == rq->idle) +					rq->skip_clock_update = -1; +			} +			if (rt_rq->rt_time || rt_rq->rt_nr_running) +				idle = 0; +			raw_spin_unlock(&rt_rq->rt_runtime_lock); +		} else if (rt_rq->rt_nr_running) { +			idle = 0; +			if (!rt_rq_throttled(rt_rq)) +				enqueue = 1; +		} + +		if (enqueue) +			sched_rt_rq_enqueue(rt_rq); +		raw_spin_unlock(&rq->lock); +	} + +	return idle; +} + +static inline int rt_se_prio(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_RT_GROUP_SCHED +	struct rt_rq *rt_rq = group_rt_rq(rt_se); + +	if (rt_rq) +		return rt_rq->highest_prio.curr; +#endif + +	return rt_task_of(rt_se)->prio; +} + +static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) +{ +	u64 runtime = sched_rt_runtime(rt_rq); + +	if (rt_rq->rt_throttled) +		return rt_rq_throttled(rt_rq); + +	if (runtime >= sched_rt_period(rt_rq)) +		return 0; + +	balance_runtime(rt_rq); +	runtime = sched_rt_runtime(rt_rq); +	if (runtime == RUNTIME_INF) +		return 0; + +	if (rt_rq->rt_time > runtime) { +		rt_rq->rt_throttled = 1; +		printk_once(KERN_WARNING "sched: RT throttling activated\n"); +		if (rt_rq_throttled(rt_rq)) { +			sched_rt_rq_dequeue(rt_rq); +			return 1; +		} +	} + +	return 0; +} + +/* + * Update the current task's runtime statistics. Skip current tasks that + * are not in our scheduling class. + */ +static void update_curr_rt(struct rq *rq) +{ +	struct task_struct *curr = rq->curr; +	struct sched_rt_entity *rt_se = &curr->rt; +	struct rt_rq *rt_rq = rt_rq_of_se(rt_se); +	u64 delta_exec; + +	if (curr->sched_class != &rt_sched_class) +		return; + +	delta_exec = rq->clock_task - curr->se.exec_start; +	if (unlikely((s64)delta_exec < 0)) +		delta_exec = 0; + +	schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); + +	curr->se.sum_exec_runtime += delta_exec; +	account_group_exec_runtime(curr, delta_exec); + +	curr->se.exec_start = rq->clock_task; +	cpuacct_charge(curr, delta_exec); + +	sched_rt_avg_update(rq, delta_exec); + +	if (!rt_bandwidth_enabled()) +		return; + +	for_each_sched_rt_entity(rt_se) { +		rt_rq = rt_rq_of_se(rt_se); + +		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { +			raw_spin_lock(&rt_rq->rt_runtime_lock); +			rt_rq->rt_time += delta_exec; +			if (sched_rt_runtime_exceeded(rt_rq)) +				resched_task(curr); +			raw_spin_unlock(&rt_rq->rt_runtime_lock); +		} +	} +} + +#if defined CONFIG_SMP + +static void +inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) +{ +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	if (rq->online && prio < prev_prio) +		cpupri_set(&rq->rd->cpupri, rq->cpu, prio); +} + +static void +dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) +{ +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	if (rq->online && rt_rq->highest_prio.curr != prev_prio) +		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); +} + +#else /* CONFIG_SMP */ + +static inline +void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} +static inline +void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} + +#endif /* CONFIG_SMP */ + +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +static void +inc_rt_prio(struct rt_rq *rt_rq, int prio) +{ +	int prev_prio = rt_rq->highest_prio.curr; + +	if (prio < prev_prio) +		rt_rq->highest_prio.curr = prio; + +	inc_rt_prio_smp(rt_rq, prio, prev_prio); +} + +static void +dec_rt_prio(struct rt_rq *rt_rq, int prio) +{ +	int prev_prio = rt_rq->highest_prio.curr; + +	if (rt_rq->rt_nr_running) { + +		WARN_ON(prio < prev_prio); + +		/* +		 
* This may have been our highest task, and therefore +		 * we may have some recomputation to do +		 */ +		if (prio == prev_prio) { +			struct rt_prio_array *array = &rt_rq->active; + +			rt_rq->highest_prio.curr = +				sched_find_first_bit(array->bitmap); +		} + +	} else +		rt_rq->highest_prio.curr = MAX_RT_PRIO; + +	dec_rt_prio_smp(rt_rq, prio, prev_prio); +} + +#else + +static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} +static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} + +#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED + +static void +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	if (rt_se_boosted(rt_se)) +		rt_rq->rt_nr_boosted++; + +	if (rt_rq->tg) +		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); +} + +static void +dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	if (rt_se_boosted(rt_se)) +		rt_rq->rt_nr_boosted--; + +	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); +} + +#else /* CONFIG_RT_GROUP_SCHED */ + +static void +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	start_rt_bandwidth(&def_rt_bandwidth); +} + +static inline +void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} + +#endif /* CONFIG_RT_GROUP_SCHED */ + +static inline +void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	int prio = rt_se_prio(rt_se); + +	WARN_ON(!rt_prio(prio)); +	rt_rq->rt_nr_running++; + +	inc_rt_prio(rt_rq, prio); +	inc_rt_migration(rt_se, rt_rq); +	inc_rt_group(rt_se, rt_rq); +} + +static inline +void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +	WARN_ON(!rt_prio(rt_se_prio(rt_se))); +	WARN_ON(!rt_rq->rt_nr_running); +	rt_rq->rt_nr_running--; + +	dec_rt_prio(rt_rq, rt_se_prio(rt_se)); +	dec_rt_migration(rt_se, rt_rq); +	dec_rt_group(rt_se, rt_rq); +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +{ +	struct rt_rq *rt_rq = rt_rq_of_se(rt_se); +	struct rt_prio_array *array = &rt_rq->active; +	struct rt_rq *group_rq = group_rt_rq(rt_se); +	struct list_head *queue = array->queue + rt_se_prio(rt_se); + +	/* +	 * Don't enqueue the group if its throttled, or when empty. +	 * The latter is a consequence of the former when a child group +	 * get throttled and the current group doesn't have any other +	 * active members. +	 */ +	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) +		return; + +	if (!rt_rq->rt_nr_running) +		list_add_leaf_rt_rq(rt_rq); + +	if (head) +		list_add(&rt_se->run_list, queue); +	else +		list_add_tail(&rt_se->run_list, queue); +	__set_bit(rt_se_prio(rt_se), array->bitmap); + +	inc_rt_tasks(rt_se, rt_rq); +} + +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +{ +	struct rt_rq *rt_rq = rt_rq_of_se(rt_se); +	struct rt_prio_array *array = &rt_rq->active; + +	list_del_init(&rt_se->run_list); +	if (list_empty(array->queue + rt_se_prio(rt_se))) +		__clear_bit(rt_se_prio(rt_se), array->bitmap); + +	dec_rt_tasks(rt_se, rt_rq); +	if (!rt_rq->rt_nr_running) +		list_del_leaf_rt_rq(rt_rq); +} + +/* + * Because the prio of an upper entry depends on the lower + * entries, we must remove entries top - down. 
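__enqueue_rt_entity() and __dequeue_rt_entity() above operate on struct rt_prio_array: one FIFO list per RT priority plus a bitmap of the non-empty priorities, so the next runnable priority is found with a single find-first-bit. A reduced userspace model (the toy_ names are illustrative, and a per-priority counter stands in for the real struct list_head):

#include <stdio.h>
#include <string.h>

#define TOY_MAX_RT_PRIO 100

/* Reduced model of struct rt_prio_array: a flag per priority plus, here,
 * just a queued-task count instead of a struct list_head per priority. */
struct toy_prio_array {
	unsigned char bitmap[TOY_MAX_RT_PRIO];
	unsigned int  nr_queued[TOY_MAX_RT_PRIO];
};

static void toy_enqueue(struct toy_prio_array *a, int prio)
{
	a->nr_queued[prio]++;
	a->bitmap[prio] = 1;			/* __set_bit() in the real code */
}

static void toy_dequeue(struct toy_prio_array *a, int prio)
{
	if (--a->nr_queued[prio] == 0)
		a->bitmap[prio] = 0;		/* __clear_bit() when the list empties */
}

/* Counterpart of sched_find_first_bit(): the lowest numbered set bit wins,
 * i.e. the highest RT priority that still has queued tasks. */
static int toy_pick_next_prio(struct toy_prio_array *a)
{
	for (int i = 0; i < TOY_MAX_RT_PRIO; i++)
		if (a->bitmap[i])
			return i;
	return TOY_MAX_RT_PRIO;
}

int main(void)
{
	struct toy_prio_array a;
	memset(&a, 0, sizeof(a));

	toy_enqueue(&a, 10);
	toy_enqueue(&a, 5);
	toy_enqueue(&a, 5);
	printf("next prio: %d\n", toy_pick_next_prio(&a));	/* 5 */
	toy_dequeue(&a, 5);
	toy_dequeue(&a, 5);
	printf("next prio: %d\n", toy_pick_next_prio(&a));	/* 10 */
	return 0;
}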
+ */ +static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +{ +	struct sched_rt_entity *back = NULL; + +	for_each_sched_rt_entity(rt_se) { +		rt_se->back = back; +		back = rt_se; +	} + +	for (rt_se = back; rt_se; rt_se = rt_se->back) { +		if (on_rt_rq(rt_se)) +			__dequeue_rt_entity(rt_se); +	} +} + +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +{ +	dequeue_rt_stack(rt_se); +	for_each_sched_rt_entity(rt_se) +		__enqueue_rt_entity(rt_se, head); +} + +static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +{ +	dequeue_rt_stack(rt_se); + +	for_each_sched_rt_entity(rt_se) { +		struct rt_rq *rt_rq = group_rt_rq(rt_se); + +		if (rt_rq && rt_rq->rt_nr_running) +			__enqueue_rt_entity(rt_se, false); +	} +} + +/* + * Adding/removing a task to/from a priority array: + */ +static void +enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) +{ +	struct sched_rt_entity *rt_se = &p->rt; + +	if (flags & ENQUEUE_WAKEUP) +		rt_se->timeout = 0; + +	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + +	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) +		enqueue_pushable_task(rq, p); + +	inc_nr_running(rq); +} + +static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) +{ +	struct sched_rt_entity *rt_se = &p->rt; + +	update_curr_rt(rq); +	dequeue_rt_entity(rt_se); + +	dequeue_pushable_task(rq, p); + +	dec_nr_running(rq); +} + +/* + * Put task to the head or the end of the run list without the overhead of + * dequeue followed by enqueue. + */ +static void +requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) +{ +	if (on_rt_rq(rt_se)) { +		struct rt_prio_array *array = &rt_rq->active; +		struct list_head *queue = array->queue + rt_se_prio(rt_se); + +		if (head) +			list_move(&rt_se->run_list, queue); +		else +			list_move_tail(&rt_se->run_list, queue); +	} +} + +static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) +{ +	struct sched_rt_entity *rt_se = &p->rt; +	struct rt_rq *rt_rq; + +	for_each_sched_rt_entity(rt_se) { +		rt_rq = rt_rq_of_se(rt_se); +		requeue_rt_entity(rt_rq, rt_se, head); +	} +} + +static void yield_task_rt(struct rq *rq) +{ +	requeue_task_rt(rq, rq->curr, 0); +} + +#ifdef CONFIG_SMP +static int find_lowest_rq(struct task_struct *task); + +static int +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +{ +	struct task_struct *curr; +	struct rq *rq; +	int cpu; + +	cpu = task_cpu(p); + +	if (p->rt.nr_cpus_allowed == 1) +		goto out; + +	/* For anything but wake ups, just return the task_cpu */ +	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) +		goto out; + +	rq = cpu_rq(cpu); + +	rcu_read_lock(); +	curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + +	/* +	 * If the current task on @p's runqueue is an RT task, then +	 * try to see if we can wake this RT task up on another +	 * runqueue. Otherwise simply start this RT task +	 * on its current runqueue. +	 * +	 * We want to avoid overloading runqueues. If the woken +	 * task is a higher priority, then it will stay on this CPU +	 * and the lower prio task should be moved to another CPU. +	 * Even though this will probably make the lower prio task +	 * lose its cache, we do not want to bounce a higher task +	 * around just because it gave up its CPU, perhaps for a +	 * lock? +	 * +	 * For equal prio tasks, we just let the scheduler sort it out. 
+	 * +	 * Otherwise, just let it ride on the affined RQ and the +	 * post-schedule router will push the preempted task away +	 * +	 * This test is optimistic, if we get it wrong the load-balancer +	 * will have to sort it out. +	 */ +	if (curr && unlikely(rt_task(curr)) && +	    (curr->rt.nr_cpus_allowed < 2 || +	     curr->prio <= p->prio) && +	    (p->rt.nr_cpus_allowed > 1)) { +		int target = find_lowest_rq(p); + +		if (target != -1) +			cpu = target; +	} +	rcu_read_unlock(); + +out: +	return cpu; +} + +static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) +{ +	if (rq->curr->rt.nr_cpus_allowed == 1) +		return; + +	if (p->rt.nr_cpus_allowed != 1 +	    && cpupri_find(&rq->rd->cpupri, p, NULL)) +		return; + +	if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) +		return; + +	/* +	 * There appears to be other cpus that can accept +	 * current and none to run 'p', so lets reschedule +	 * to try and push current away: +	 */ +	requeue_task_rt(rq, p, 1); +	resched_task(rq->curr); +} + +#endif /* CONFIG_SMP */ + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) +{ +	if (p->prio < rq->curr->prio) { +		resched_task(rq->curr); +		return; +	} + +#ifdef CONFIG_SMP +	/* +	 * If: +	 * +	 * - the newly woken task is of equal priority to the current task +	 * - the newly woken task is non-migratable while current is migratable +	 * - current will be preempted on the next reschedule +	 * +	 * we should check to see if current can readily move to a different +	 * cpu.  If so, we will reschedule to allow the push logic to try +	 * to move current somewhere else, making room for our non-migratable +	 * task. +	 */ +	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) +		check_preempt_equal_prio(rq, p); +#endif +} + +static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, +						   struct rt_rq *rt_rq) +{ +	struct rt_prio_array *array = &rt_rq->active; +	struct sched_rt_entity *next = NULL; +	struct list_head *queue; +	int idx; + +	idx = sched_find_first_bit(array->bitmap); +	BUG_ON(idx >= MAX_RT_PRIO); + +	queue = array->queue + idx; +	next = list_entry(queue->next, struct sched_rt_entity, run_list); + +	return next; +} + +static struct task_struct *_pick_next_task_rt(struct rq *rq) +{ +	struct sched_rt_entity *rt_se; +	struct task_struct *p; +	struct rt_rq *rt_rq; + +	rt_rq = &rq->rt; + +	if (!rt_rq->rt_nr_running) +		return NULL; + +	if (rt_rq_throttled(rt_rq)) +		return NULL; + +	do { +		rt_se = pick_next_rt_entity(rq, rt_rq); +		BUG_ON(!rt_se); +		rt_rq = group_rt_rq(rt_se); +	} while (rt_rq); + +	p = rt_task_of(rt_se); +	p->se.exec_start = rq->clock_task; + +	return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ +	struct task_struct *p = _pick_next_task_rt(rq); + +	/* The running task is never eligible for pushing */ +	if (p) +		dequeue_pushable_task(rq, p); + +#ifdef CONFIG_SMP +	/* +	 * We detect this state here so that we can avoid taking the RQ +	 * lock again later if there is no need to push +	 */ +	rq->post_schedule = has_pushable_tasks(rq); +#endif + +	return p; +} + +static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +{ +	update_curr_rt(rq); + +	/* +	 * The previous task needs to be made eligible for pushing +	 * if it is still active +	 */ +	if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) +		enqueue_pushable_task(rq, p); +} + +#ifdef CONFIG_SMP + +/* Only try algorithms three times */ +#define 
RT_MAX_TRIES 3 + +static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) +{ +	if (!task_running(rq, p) && +	    (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && +	    (p->rt.nr_cpus_allowed > 1)) +		return 1; +	return 0; +} + +/* Return the second highest RT task, NULL otherwise */ +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) +{ +	struct task_struct *next = NULL; +	struct sched_rt_entity *rt_se; +	struct rt_prio_array *array; +	struct rt_rq *rt_rq; +	int idx; + +	for_each_leaf_rt_rq(rt_rq, rq) { +		array = &rt_rq->active; +		idx = sched_find_first_bit(array->bitmap); +next_idx: +		if (idx >= MAX_RT_PRIO) +			continue; +		if (next && next->prio < idx) +			continue; +		list_for_each_entry(rt_se, array->queue + idx, run_list) { +			struct task_struct *p; + +			if (!rt_entity_is_task(rt_se)) +				continue; + +			p = rt_task_of(rt_se); +			if (pick_rt_task(rq, p, cpu)) { +				next = p; +				break; +			} +		} +		if (!next) { +			idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); +			goto next_idx; +		} +	} + +	return next; +} + +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); + +static int find_lowest_rq(struct task_struct *task) +{ +	struct sched_domain *sd; +	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); +	int this_cpu = smp_processor_id(); +	int cpu      = task_cpu(task); + +	/* Make sure the mask is initialized first */ +	if (unlikely(!lowest_mask)) +		return -1; + +	if (task->rt.nr_cpus_allowed == 1) +		return -1; /* No other targets possible */ + +	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) +		return -1; /* No targets found */ + +	/* +	 * At this point we have built a mask of cpus representing the +	 * lowest priority tasks in the system.  Now we want to elect +	 * the best one based on our affinity and topology. +	 * +	 * We prioritize the last cpu that the task executed on since +	 * it is most likely cache-hot in that location. +	 */ +	if (cpumask_test_cpu(cpu, lowest_mask)) +		return cpu; + +	/* +	 * Otherwise, we consult the sched_domains span maps to figure +	 * out which cpu is logically closest to our hot cache data. +	 */ +	if (!cpumask_test_cpu(this_cpu, lowest_mask)) +		this_cpu = -1; /* Skip this_cpu opt if not among lowest */ + +	rcu_read_lock(); +	for_each_domain(cpu, sd) { +		if (sd->flags & SD_WAKE_AFFINE) { +			int best_cpu; + +			/* +			 * "this_cpu" is cheaper to preempt than a +			 * remote processor. +			 */ +			if (this_cpu != -1 && +			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { +				rcu_read_unlock(); +				return this_cpu; +			} + +			best_cpu = cpumask_first_and(lowest_mask, +						     sched_domain_span(sd)); +			if (best_cpu < nr_cpu_ids) { +				rcu_read_unlock(); +				return best_cpu; +			} +		} +	} +	rcu_read_unlock(); + +	/* +	 * And finally, if there were no matches within the domains +	 * just give the caller *something* to work with from the compatible +	 * locations. 
+	 */ +	if (this_cpu != -1) +		return this_cpu; + +	cpu = cpumask_any(lowest_mask); +	if (cpu < nr_cpu_ids) +		return cpu; +	return -1; +} + +/* Will lock the rq it finds */ +static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) +{ +	struct rq *lowest_rq = NULL; +	int tries; +	int cpu; + +	for (tries = 0; tries < RT_MAX_TRIES; tries++) { +		cpu = find_lowest_rq(task); + +		if ((cpu == -1) || (cpu == rq->cpu)) +			break; + +		lowest_rq = cpu_rq(cpu); + +		/* if the prio of this runqueue changed, try again */ +		if (double_lock_balance(rq, lowest_rq)) { +			/* +			 * We had to unlock the run queue. In +			 * the mean time, task could have +			 * migrated already or had its affinity changed. +			 * Also make sure that it wasn't scheduled on its rq. +			 */ +			if (unlikely(task_rq(task) != rq || +				     !cpumask_test_cpu(lowest_rq->cpu, +						       tsk_cpus_allowed(task)) || +				     task_running(rq, task) || +				     !task->on_rq)) { + +				raw_spin_unlock(&lowest_rq->lock); +				lowest_rq = NULL; +				break; +			} +		} + +		/* If this rq is still suitable use it. */ +		if (lowest_rq->rt.highest_prio.curr > task->prio) +			break; + +		/* try again */ +		double_unlock_balance(rq, lowest_rq); +		lowest_rq = NULL; +	} + +	return lowest_rq; +} + +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ +	struct task_struct *p; + +	if (!has_pushable_tasks(rq)) +		return NULL; + +	p = plist_first_entry(&rq->rt.pushable_tasks, +			      struct task_struct, pushable_tasks); + +	BUG_ON(rq->cpu != task_cpu(p)); +	BUG_ON(task_current(rq, p)); +	BUG_ON(p->rt.nr_cpus_allowed <= 1); + +	BUG_ON(!p->on_rq); +	BUG_ON(!rt_task(p)); + +	return p; +} + +/* + * If the current CPU has more than one RT task, see if the non + * running task can migrate over to a CPU that is running a task + * of lesser priority. + */ +static int push_rt_task(struct rq *rq) +{ +	struct task_struct *next_task; +	struct rq *lowest_rq; +	int ret = 0; + +	if (!rq->rt.overloaded) +		return 0; + +	next_task = pick_next_pushable_task(rq); +	if (!next_task) +		return 0; + +retry: +	if (unlikely(next_task == rq->curr)) { +		WARN_ON(1); +		return 0; +	} + +	/* +	 * It's possible that the next_task slipped in of +	 * higher priority than current. If that's the case +	 * just reschedule current. +	 */ +	if (unlikely(next_task->prio < rq->curr->prio)) { +		resched_task(rq->curr); +		return 0; +	} + +	/* We might release rq lock */ +	get_task_struct(next_task); + +	/* find_lock_lowest_rq locks the rq if found */ +	lowest_rq = find_lock_lowest_rq(next_task, rq); +	if (!lowest_rq) { +		struct task_struct *task; +		/* +		 * find_lock_lowest_rq releases rq->lock +		 * so it is possible that next_task has migrated. +		 * +		 * We need to make sure that the task is still on the same +		 * run-queue and is also still the next task eligible for +		 * pushing. +		 */ +		task = pick_next_pushable_task(rq); +		if (task_cpu(next_task) == rq->cpu && task == next_task) { +			/* +			 * The task hasn't migrated, and is still the next +			 * eligible task, but we failed to find a run-queue +			 * to push it to.  Do not retry in this case, since +			 * other cpus will pull from us when ready. +			 */ +			goto out; +		} + +		if (!task) +			/* No more tasks, just exit */ +			goto out; + +		/* +		 * Something has shifted, try again. 
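The selection order in find_lowest_rq() above is essentially a preference ladder over the cpupri result: the task's last CPU, then the waking CPU if it is eligible and topologically close, then any eligible CPU sharing a domain, then anything in the mask at all. A simplified model using plain bool arrays in place of cpumasks (toy_pick_lowest() and TOY_NR_CPUS are illustrative assumptions):

#include <stdio.h>
#include <stdbool.h>

#define TOY_NR_CPUS 8

/* Reduced model of find_lowest_rq()'s preference ladder; the real code
 * walks every sched_domain level rather than taking one span array. */
static int toy_pick_lowest(const bool lowest[TOY_NR_CPUS],
			   const bool domain_span[TOY_NR_CPUS],
			   int task_cpu, int this_cpu)
{
	/* 1) The CPU the task last ran on: most likely cache hot. */
	if (lowest[task_cpu])
		return task_cpu;

	/* 2) The waking CPU, if it is both eligible and in the same span. */
	if (this_cpu >= 0 && lowest[this_cpu] && domain_span[this_cpu])
		return this_cpu;

	/* 3) Any eligible CPU that shares a domain with the task's CPU. */
	for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
		if (lowest[cpu] && domain_span[cpu])
			return cpu;

	/* 4) Otherwise, any eligible CPU at all. */
	for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
		if (lowest[cpu])
			return cpu;

	return -1;
}

int main(void)
{
	bool lowest[TOY_NR_CPUS]      = { [3] = true, [6] = true };
	bool domain_span[TOY_NR_CPUS] = { [0] = true, [1] = true, [2] = true, [3] = true };

	/* Task last ran on CPU 1, woken from CPU 0: CPU 3 is chosen (step 3). */
	printf("chosen cpu: %d\n", toy_pick_lowest(lowest, domain_span, 1, 0));
	return 0;
}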
+		 */ +		put_task_struct(next_task); +		next_task = task; +		goto retry; +	} + +	deactivate_task(rq, next_task, 0); +	set_task_cpu(next_task, lowest_rq->cpu); +	activate_task(lowest_rq, next_task, 0); +	ret = 1; + +	resched_task(lowest_rq->curr); + +	double_unlock_balance(rq, lowest_rq); + +out: +	put_task_struct(next_task); + +	return ret; +} + +static void push_rt_tasks(struct rq *rq) +{ +	/* push_rt_task will return true if it moved an RT */ +	while (push_rt_task(rq)) +		; +} + +static int pull_rt_task(struct rq *this_rq) +{ +	int this_cpu = this_rq->cpu, ret = 0, cpu; +	struct task_struct *p; +	struct rq *src_rq; + +	if (likely(!rt_overloaded(this_rq))) +		return 0; + +	for_each_cpu(cpu, this_rq->rd->rto_mask) { +		if (this_cpu == cpu) +			continue; + +		src_rq = cpu_rq(cpu); + +		/* +		 * Don't bother taking the src_rq->lock if the next highest +		 * task is known to be lower-priority than our current task. +		 * This may look racy, but if this value is about to go +		 * logically higher, the src_rq will push this task away. +		 * And if its going logically lower, we do not care +		 */ +		if (src_rq->rt.highest_prio.next >= +		    this_rq->rt.highest_prio.curr) +			continue; + +		/* +		 * We can potentially drop this_rq's lock in +		 * double_lock_balance, and another CPU could +		 * alter this_rq +		 */ +		double_lock_balance(this_rq, src_rq); + +		/* +		 * Are there still pullable RT tasks? +		 */ +		if (src_rq->rt.rt_nr_running <= 1) +			goto skip; + +		p = pick_next_highest_task_rt(src_rq, this_cpu); + +		/* +		 * Do we have an RT task that preempts +		 * the to-be-scheduled task? +		 */ +		if (p && (p->prio < this_rq->rt.highest_prio.curr)) { +			WARN_ON(p == src_rq->curr); +			WARN_ON(!p->on_rq); + +			/* +			 * There's a chance that p is higher in priority +			 * than what's currently running on its cpu. +			 * This is just that p is wakeing up and hasn't +			 * had a chance to schedule. We only pull +			 * p if it is lower in priority than the +			 * current task on the run queue +			 */ +			if (p->prio < src_rq->curr->prio) +				goto skip; + +			ret = 1; + +			deactivate_task(src_rq, p, 0); +			set_task_cpu(p, this_cpu); +			activate_task(this_rq, p, 0); +			/* +			 * We continue with the search, just in +			 * case there's an even higher prio task +			 * in another runqueue. (low likelihood +			 * but possible) +			 */ +		} +skip: +		double_unlock_balance(this_rq, src_rq); +	} + +	return ret; +} + +static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) +{ +	/* Try to pull RT tasks here if we lower this rq's prio */ +	if (rq->rt.highest_prio.curr > prev->prio) +		pull_rt_task(rq); +} + +static void post_schedule_rt(struct rq *rq) +{ +	push_rt_tasks(rq); +} + +/* + * If we are not running and we are not going to reschedule soon, we should + * try to push tasks away now + */ +static void task_woken_rt(struct rq *rq, struct task_struct *p) +{ +	if (!task_running(rq, p) && +	    !test_tsk_need_resched(rq->curr) && +	    has_pushable_tasks(rq) && +	    p->rt.nr_cpus_allowed > 1 && +	    rt_task(rq->curr) && +	    (rq->curr->rt.nr_cpus_allowed < 2 || +	     rq->curr->prio <= p->prio)) +		push_rt_tasks(rq); +} + +static void set_cpus_allowed_rt(struct task_struct *p, +				const struct cpumask *new_mask) +{ +	int weight = cpumask_weight(new_mask); + +	BUG_ON(!rt_task(p)); + +	/* +	 * Update the migration status of the RQ if we have an RT task +	 * which is running AND changing its weight value. 
+	 */ +	if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { +		struct rq *rq = task_rq(p); + +		if (!task_current(rq, p)) { +			/* +			 * Make sure we dequeue this task from the pushable list +			 * before going further.  It will either remain off of +			 * the list because we are no longer pushable, or it +			 * will be requeued. +			 */ +			if (p->rt.nr_cpus_allowed > 1) +				dequeue_pushable_task(rq, p); + +			/* +			 * Requeue if our weight is changing and still > 1 +			 */ +			if (weight > 1) +				enqueue_pushable_task(rq, p); + +		} + +		if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { +			rq->rt.rt_nr_migratory++; +		} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { +			BUG_ON(!rq->rt.rt_nr_migratory); +			rq->rt.rt_nr_migratory--; +		} + +		update_rt_migration(&rq->rt); +	} +} + +/* Assumes rq->lock is held */ +static void rq_online_rt(struct rq *rq) +{ +	if (rq->rt.overloaded) +		rt_set_overload(rq); + +	__enable_runtime(rq); + +	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); +} + +/* Assumes rq->lock is held */ +static void rq_offline_rt(struct rq *rq) +{ +	if (rq->rt.overloaded) +		rt_clear_overload(rq); + +	__disable_runtime(rq); + +	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); +} + +/* + * When switch from the rt queue, we bring ourselves to a position + * that we might want to pull RT tasks from other runqueues. + */ +static void switched_from_rt(struct rq *rq, struct task_struct *p) +{ +	/* +	 * If there are other RT tasks then we will reschedule +	 * and the scheduling of the other RT tasks will handle +	 * the balancing. But if we are the last RT task +	 * we may need to handle the pulling of RT tasks +	 * now. +	 */ +	if (p->on_rq && !rq->rt.rt_nr_running) +		pull_rt_task(rq); +} + +void init_sched_rt_class(void) +{ +	unsigned int i; + +	for_each_possible_cpu(i) { +		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), +					GFP_KERNEL, cpu_to_node(i)); +	} +} +#endif /* CONFIG_SMP */ + +/* + * When switching a task to RT, we may overload the runqueue + * with RT tasks. In this case we try to push them off to + * other runqueues. + */ +static void switched_to_rt(struct rq *rq, struct task_struct *p) +{ +	int check_resched = 1; + +	/* +	 * If we are already running, then there's nothing +	 * that needs to be done. But if we are not running +	 * we may need to preempt the current running task. +	 * If that current running task is also an RT task +	 * then see if we can move to another run queue. +	 */ +	if (p->on_rq && rq->curr != p) { +#ifdef CONFIG_SMP +		if (rq->rt.overloaded && push_rt_task(rq) && +		    /* Don't resched if we changed runqueues */ +		    rq != task_rq(p)) +			check_resched = 0; +#endif /* CONFIG_SMP */ +		if (check_resched && p->prio < rq->curr->prio) +			resched_task(rq->curr); +	} +} + +/* + * Priority of the task has changed. This may cause + * us to initiate a push or pull. + */ +static void +prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) +{ +	if (!p->on_rq) +		return; + +	if (rq->curr == p) { +#ifdef CONFIG_SMP +		/* +		 * If our priority decreases while running, we +		 * may need to pull tasks to this runqueue. +		 */ +		if (oldprio < p->prio) +			pull_rt_task(rq); +		/* +		 * If there's a higher priority task waiting to run +		 * then reschedule. Note, the above pull_rt_task +		 * can release the rq lock and p could migrate. +		 * Only reschedule if p is still on the same runqueue. 
+		 */ +		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) +			resched_task(p); +#else +		/* For UP simply resched on drop of prio */ +		if (oldprio < p->prio) +			resched_task(p); +#endif /* CONFIG_SMP */ +	} else { +		/* +		 * This task is not running, but if it is +		 * greater than the current running task +		 * then reschedule. +		 */ +		if (p->prio < rq->curr->prio) +			resched_task(rq->curr); +	} +} + +static void watchdog(struct rq *rq, struct task_struct *p) +{ +	unsigned long soft, hard; + +	/* max may change after cur was read, this will be fixed next tick */ +	soft = task_rlimit(p, RLIMIT_RTTIME); +	hard = task_rlimit_max(p, RLIMIT_RTTIME); + +	if (soft != RLIM_INFINITY) { +		unsigned long next; + +		p->rt.timeout++; +		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); +		if (p->rt.timeout > next) +			p->cputime_expires.sched_exp = p->se.sum_exec_runtime; +	} +} + +static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) +{ +	update_curr_rt(rq); + +	watchdog(rq, p); + +	/* +	 * RR tasks need a special form of timeslice management. +	 * FIFO tasks have no timeslices. +	 */ +	if (p->policy != SCHED_RR) +		return; + +	if (--p->rt.time_slice) +		return; + +	p->rt.time_slice = DEF_TIMESLICE; + +	/* +	 * Requeue to the end of queue if we are not the only element +	 * on the queue: +	 */ +	if (p->rt.run_list.prev != p->rt.run_list.next) { +		requeue_task_rt(rq, p, 0); +		set_tsk_need_resched(p); +	} +} + +static void set_curr_task_rt(struct rq *rq) +{ +	struct task_struct *p = rq->curr; + +	p->se.exec_start = rq->clock_task; + +	/* The running task is never eligible for pushing */ +	dequeue_pushable_task(rq, p); +} + +static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) +{ +	/* +	 * Time slice is 0 for SCHED_FIFO tasks +	 */ +	if (task->policy == SCHED_RR) +		return DEF_TIMESLICE; +	else +		return 0; +} + +const struct sched_class rt_sched_class = { +	.next			= &fair_sched_class, +	.enqueue_task		= enqueue_task_rt, +	.dequeue_task		= dequeue_task_rt, +	.yield_task		= yield_task_rt, + +	.check_preempt_curr	= check_preempt_curr_rt, + +	.pick_next_task		= pick_next_task_rt, +	.put_prev_task		= put_prev_task_rt, + +#ifdef CONFIG_SMP +	.select_task_rq		= select_task_rq_rt, + +	.set_cpus_allowed       = set_cpus_allowed_rt, +	.rq_online              = rq_online_rt, +	.rq_offline             = rq_offline_rt, +	.pre_schedule		= pre_schedule_rt, +	.post_schedule		= post_schedule_rt, +	.task_woken		= task_woken_rt, +	.switched_from		= switched_from_rt, +#endif + +	.set_curr_task          = set_curr_task_rt, +	.task_tick		= task_tick_rt, + +	.get_rr_interval	= get_rr_interval_rt, + +	.prio_changed		= prio_changed_rt, +	.switched_to		= switched_to_rt, +}; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); + +void print_rt_stats(struct seq_file *m, int cpu) +{ +	rt_rq_iter_t iter; +	struct rt_rq *rt_rq; + +	rcu_read_lock(); +	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) +		print_rt_rq(m, cpu, rt_rq); +	rcu_read_unlock(); +} +#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h new file mode 100644 index 00000000000..98c0c2623db --- /dev/null +++ b/kernel/sched/sched.h @@ -0,0 +1,1166 @@ + +#include <linux/sched.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/stop_machine.h> + +#include "cpupri.h" + +extern __read_mostly int scheduler_running; + +/* + * Convert user-nice values [ -20 ... 0 ... 
19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p)		((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO)) + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +#define NICE_0_LOAD		SCHED_LOAD_SCALE +#define NICE_0_SHIFT		SCHED_LOAD_SHIFT + +/* + * These are the 'tuning knobs' of the scheduler: + * + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define DEF_TIMESLICE		(100 * HZ / 1000) + +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF	((u64)~0ULL) + +static inline int rt_policy(int policy) +{ +	if (policy == SCHED_FIFO || policy == SCHED_RR) +		return 1; +	return 0; +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ +	return rt_policy(p->policy); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { +	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ +	struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { +	/* nests inside the rq lock: */ +	raw_spinlock_t		rt_runtime_lock; +	ktime_t			rt_period; +	u64			rt_runtime; +	struct hrtimer		rt_period_timer; +}; + +extern struct mutex sched_domains_mutex; + +#ifdef CONFIG_CGROUP_SCHED + +#include <linux/cgroup.h> + +struct cfs_rq; +struct rt_rq; + +static LIST_HEAD(task_groups); + +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH +	raw_spinlock_t lock; +	ktime_t period; +	u64 quota, runtime; +	s64 hierarchal_quota; +	u64 runtime_expires; + +	int idle, timer_active; +	struct hrtimer period_timer, slack_timer; +	struct list_head throttled_cfs_rq; + +	/* statistics */ +	int nr_periods, nr_throttled; +	u64 throttled_time; +#endif +}; + +/* task group related information */ +struct task_group { +	struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED +	/* schedulable entities of this group on each cpu */ +	struct sched_entity **se; +	/* runqueue "owned" by this group on each cpu */ +	struct cfs_rq **cfs_rq; +	unsigned long shares; + +	atomic_t load_weight; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +	struct sched_rt_entity **rt_se; +	struct rt_rq **rt_rq; + +	struct rt_bandwidth rt_bandwidth; +#endif + +	struct rcu_head rcu; +	struct list_head list; + +	struct task_group *parent; +	struct list_head siblings; +	struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP +	struct autogroup *autogroup; +#endif + +	struct cfs_bandwidth cfs_bandwidth; +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED +#define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + *  limitation from this.) + */ +#define MIN_SHARES	(1UL <<  1) +#define MAX_SHARES	(1UL << 18) +#endif + +/* Default task group. 
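A small worked example of the nice-to-priority macros above; MAX_RT_PRIO and MAX_PRIO are defined elsewhere in the kernel (as 100 and 140) and are repeated here only so the sketch is self-contained:

#include <stdio.h>

#define MAX_RT_PRIO		100
#define MAX_PRIO		(MAX_RT_PRIO + 40)

#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)

int main(void)
{
	for (int nice = -20; nice <= 19; nice += 13) {
		int prio = NICE_TO_PRIO(nice);
		printf("nice %3d -> static prio %3d -> user prio %2d (back to nice %d)\n",
		       nice, prio, USER_PRIO(prio), PRIO_TO_NICE(prio));
	}
	/* nice -20 maps to 100 (just above the RT range), nice 0 to 120,
	 * and nice +19 to 139 == MAX_PRIO - 1. */
	return 0;
}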
+ *	Every task in system belong to this group at bootup. + */ +extern struct task_group root_task_group; + +typedef int (*tg_visitor)(struct task_group *, void *); + +extern int walk_tg_tree_from(struct task_group *from, +			     tg_visitor down, tg_visitor up, void *data); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ +	return walk_tg_tree_from(&root_task_group, down, up, data); +} + +extern int tg_nop(struct task_group *tg, void *data); + +extern void free_fair_sched_group(struct task_group *tg); +extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, +			struct sched_entity *se, int cpu, +			struct sched_entity *parent); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); + +extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); + +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); +extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, +		struct sched_rt_entity *rt_se, int cpu, +		struct sched_rt_entity *parent); + +#else /* CONFIG_CGROUP_SCHED */ + +struct cfs_bandwidth { }; + +#endif	/* CONFIG_CGROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { +	struct load_weight load; +	unsigned long nr_running, h_nr_running; + +	u64 exec_clock; +	u64 min_vruntime; +#ifndef CONFIG_64BIT +	u64 min_vruntime_copy; +#endif + +	struct rb_root tasks_timeline; +	struct rb_node *rb_leftmost; + +	struct list_head tasks; +	struct list_head *balance_iterator; + +	/* +	 * 'curr' points to currently running entity on this cfs_rq. +	 * It is set to NULL otherwise (i.e when none are currently running). +	 */ +	struct sched_entity *curr, *next, *last, *skip; + +#ifdef	CONFIG_SCHED_DEBUG +	unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */ + +	/* +	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in +	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities +	 * (like users, containers etc.) +	 * +	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This +	 * list is used during load balance. +	 */ +	int on_list; +	struct list_head leaf_cfs_rq_list; +	struct task_group *tg;	/* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP +	/* +	 * the part of load.weight contributed by tasks +	 */ +	unsigned long task_weight; + +	/* +	 *   h_load = weight * f(tg) +	 * +	 * Where f(tg) is the recursive weight fraction assigned to +	 * this group. 
+	 */ +	unsigned long h_load; + +	/* +	 * Maintaining per-cpu shares distribution for group scheduling +	 * +	 * load_stamp is the last time we updated the load average +	 * load_last is the last time we updated the load average and saw load +	 * load_unacc_exec_time is currently unaccounted execution time +	 */ +	u64 load_avg; +	u64 load_period; +	u64 load_stamp, load_last, load_unacc_exec_time; + +	unsigned long load_contribution; +#endif /* CONFIG_SMP */ +#ifdef CONFIG_CFS_BANDWIDTH +	int runtime_enabled; +	u64 runtime_expires; +	s64 runtime_remaining; + +	u64 throttled_timestamp; +	int throttled, throttle_count; +	struct list_head throttled_list; +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ +}; + +static inline int rt_bandwidth_enabled(void) +{ +	return sysctl_sched_rt_runtime >= 0; +} + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { +	struct rt_prio_array active; +	unsigned long rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +	struct { +		int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP +		int next; /* next highest */ +#endif +	} highest_prio; +#endif +#ifdef CONFIG_SMP +	unsigned long rt_nr_migratory; +	unsigned long rt_nr_total; +	int overloaded; +	struct plist_head pushable_tasks; +#endif +	int rt_throttled; +	u64 rt_time; +	u64 rt_runtime; +	/* Nests inside the rq lock: */ +	raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED +	unsigned long rt_nr_boosted; + +	struct rq *rq; +	struct list_head leaf_rt_rq_list; +	struct task_group *tg; +#endif +}; + +#ifdef CONFIG_SMP + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { +	atomic_t refcount; +	atomic_t rto_count; +	struct rcu_head rcu; +	cpumask_var_t span; +	cpumask_var_t online; + +	/* +	 * The "RT overload" flag: it gets set if a CPU has more than +	 * one runnable RT task. +	 */ +	cpumask_var_t rto_mask; +	struct cpupri cpupri; +}; + +extern struct root_domain def_root_domain; + +#endif /* CONFIG_SMP */ + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { +	/* runqueue lock: */ +	raw_spinlock_t lock; + +	/* +	 * nr_running and cpu_load should be in the same cacheline because +	 * remote CPUs use both these fields when doing load calculation. +	 */ +	unsigned long nr_running; +	#define CPU_LOAD_IDX_MAX 5 +	unsigned long cpu_load[CPU_LOAD_IDX_MAX]; +	unsigned long last_load_update_tick; +#ifdef CONFIG_NO_HZ +	u64 nohz_stamp; +	unsigned long nohz_flags; +#endif +	int skip_clock_update; + +	/* capture load from *all* tasks on this cpu: */ +	struct load_weight load; +	unsigned long nr_load_updates; +	u64 nr_switches; + +	struct cfs_rq cfs; +	struct rt_rq rt; + +#ifdef CONFIG_FAIR_GROUP_SCHED +	/* list of leaf cfs_rq on this cpu: */ +	struct list_head leaf_cfs_rq_list; +#endif +#ifdef CONFIG_RT_GROUP_SCHED +	struct list_head leaf_rt_rq_list; +#endif + +	/* +	 * This is part of a global counter where only the total sum +	 * over all CPUs matters. 
A task can increase this counter on +	 * one CPU and if it got migrated afterwards it may decrease +	 * it on another CPU. Always updated under the runqueue lock: +	 */ +	unsigned long nr_uninterruptible; + +	struct task_struct *curr, *idle, *stop; +	unsigned long next_balance; +	struct mm_struct *prev_mm; + +	u64 clock; +	u64 clock_task; + +	atomic_t nr_iowait; + +#ifdef CONFIG_SMP +	struct root_domain *rd; +	struct sched_domain *sd; + +	unsigned long cpu_power; + +	unsigned char idle_balance; +	/* For active balancing */ +	int post_schedule; +	int active_balance; +	int push_cpu; +	struct cpu_stop_work active_balance_work; +	/* cpu of this runqueue: */ +	int cpu; +	int online; + +	u64 rt_avg; +	u64 age_stamp; +	u64 idle_stamp; +	u64 avg_idle; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +	u64 prev_irq_time; +#endif +#ifdef CONFIG_PARAVIRT +	u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING +	u64 prev_steal_time_rq; +#endif + +	/* calc_load related fields */ +	unsigned long calc_load_update; +	long calc_load_active; + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP +	int hrtick_csd_pending; +	struct call_single_data hrtick_csd; +#endif +	struct hrtimer hrtick_timer; +#endif + +#ifdef CONFIG_SCHEDSTATS +	/* latency stats */ +	struct sched_info rq_sched_info; +	unsigned long long rq_cpu_time; +	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ + +	/* sys_sched_yield() stats */ +	unsigned int yld_count; + +	/* schedule() stats */ +	unsigned int sched_switch; +	unsigned int sched_count; +	unsigned int sched_goidle; + +	/* try_to_wake_up() stats */ +	unsigned int ttwu_count; +	unsigned int ttwu_local; +#endif + +#ifdef CONFIG_SMP +	struct llist_head wake_list; +#endif +}; + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP +	return rq->cpu; +#else +	return 0; +#endif +} + +DECLARE_PER_CPU(struct rq, runqueues); + +#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu))) +#define this_rq()		(&__get_cpu_var(runqueues)) +#define task_rq(p)		cpu_rq(task_cpu(p)) +#define cpu_curr(cpu)		(cpu_rq(cpu)->curr) +#define raw_rq()		(&__raw_get_cpu_var(runqueues)) + +#ifdef CONFIG_SMP + +#define rcu_dereference_check_sched_domain(p) \ +	rcu_dereference_check((p), \ +			      lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +#define for_each_domain(cpu, __sd) \ +	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ +			__sd; __sd = __sd->parent) + +#define for_each_lower_domain(sd) for (; sd; sd = sd->child) + +/** + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu:	The cpu whose highest level of sched domain is to + *		be returned. + * @flag:	The flag to check for the highest sched_domain + *		for the given cpu. + * + * Returns the highest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *highest_flag_domain(int cpu, int flag) +{ +	struct sched_domain *sd, *hsd = NULL; + +	for_each_domain(cpu, sd) { +		if (!(sd->flags & flag)) +			break; +		hsd = sd; +	} + +	return hsd; +} + +DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_id); + +#endif /* CONFIG_SMP */ + +#include "stats.h" +#include "auto_group.h" + +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. 
+ * + * We use task_subsys_state_check() and extend the RCU verification with + * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each + * task it moves into the cgroup. Therefore by holding either of those locks, + * we pin the task to the current cgroup. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ +	struct task_group *tg; +	struct cgroup_subsys_state *css; + +	css = task_subsys_state_check(p, cpu_cgroup_subsys_id, +			lockdep_is_held(&p->pi_lock) || +			lockdep_is_held(&task_rq(p)->lock)); +	tg = container_of(css, struct task_group, css); + +	return autogroup_task_group(p, tg); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) +	struct task_group *tg = task_group(p); +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +	p->se.cfs_rq = tg->cfs_rq[cpu]; +	p->se.parent = tg->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +	p->rt.rt_rq  = tg->rt_rq[cpu]; +	p->rt.parent = tg->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ +	return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +	set_task_rq(p, cpu); +#ifdef CONFIG_SMP +	/* +	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be +	 * successfuly executed on another CPU. We must ensure that updates of +	 * per-task data have been completed by this moment. +	 */ +	smp_wmb(); +	task_thread_info(p)->cpu = cpu; +#endif +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# include <linux/jump_label.h> +# define const_debug __read_mostly +#else +# define const_debug const +#endif + +extern const_debug unsigned int sysctl_sched_features; + +#define SCHED_FEAT(name, enabled)	\ +	__SCHED_FEAT_##name , + +enum { +#include "features.h" +	__SCHED_FEAT_NR, +}; + +#undef SCHED_FEAT + +#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) +static __always_inline bool static_branch__true(struct jump_label_key *key) +{ +	return likely(static_branch(key)); /* Not out of line branch. */ +} + +static __always_inline bool static_branch__false(struct jump_label_key *key) +{ +	return unlikely(static_branch(key)); /* Out of line branch. 
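The SCHED_FEAT() machinery around this point is an X-macro: features.h is included more than once so the same feature list expands into the __SCHED_FEAT_* enum, per-feature helpers, and a plain bitmask test as the fallback when jump labels are unavailable. A reduced model of the pattern (TOY_FEATURES and the feature names here are illustrative, not the real features.h contents):

#include <stdio.h>

/* The feature list is written once and expanded twice below. */
#define TOY_FEATURES(F)		\
	F(START_DEBIT, 1)	\
	F(HRTICK, 0)		\
	F(RT_RUNTIME_SHARE, 1)

/* Expansion 1: an enum naming each feature's bit position. */
#define TOY_FEAT_ENUM(name, enabled)	__TOY_FEAT_##name,
enum { TOY_FEATURES(TOY_FEAT_ENUM) __TOY_FEAT_NR };

/* Expansion 2: the default feature bitmask. */
#define TOY_FEAT_DEFAULT(name, enabled)	((enabled) << __TOY_FEAT_##name) |
static const unsigned int toy_sched_features = TOY_FEATURES(TOY_FEAT_DEFAULT) 0;

/* Bitmask test, analogous to the non-jump-label sched_feat() fallback. */
#define toy_sched_feat(x) (toy_sched_features & (1U << __TOY_FEAT_##x))

int main(void)
{
	printf("HRTICK enabled:           %d\n", !!toy_sched_feat(HRTICK));
	printf("RT_RUNTIME_SHARE enabled: %d\n", !!toy_sched_feat(RT_RUNTIME_SHARE));
	return 0;
}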
*/ +} + +#define SCHED_FEAT(name, enabled)					\ +static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +{									\ +	return static_branch__##enabled(key);				\ +} + +#include "features.h" + +#undef SCHED_FEAT + +extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) +#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ + +static inline u64 global_rt_period(void) +{ +	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ +	if (sysctl_sched_rt_runtime < 0) +		return RUNTIME_INF; + +	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + + + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ +	return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP +	return p->on_cpu; +#else +	return task_current(rq, p); +#endif +} + + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next)	do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev)	do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP +	/* +	 * We can optimise this out completely for !SMP, because the +	 * SMP rebalancing from interrupt is the only thing that cares +	 * here. +	 */ +	next->on_cpu = 1; +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP +	/* +	 * After ->on_cpu is cleared, the task can be moved to a different CPU. +	 * We must ensure this doesn't happen until the switch is completely +	 * finished. +	 */ +	smp_wmb(); +	prev->on_cpu = 0; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK +	/* this is a valid case when another task releases the spinlock */ +	rq->lock.owner = current; +#endif +	/* +	 * If we are tracking spinlock dependencies then we have to +	 * fix up the runqueue lock - which gets 'carried over' from +	 * prev into current: +	 */ +	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + +	raw_spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP +	/* +	 * We can optimise this out completely for !SMP, because the +	 * SMP rebalancing from interrupt is the only thing that cares +	 * here. +	 */ +	next->on_cpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +	raw_spin_unlock_irq(&rq->lock); +#else +	raw_spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP +	/* +	 * After ->on_cpu is cleared, the task can be moved to a different CPU. +	 * We must ensure this doesn't happen until the switch is completely +	 * finished. 
+	 */ +	smp_wmb(); +	prev->on_cpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW +	local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ +	lw->weight += inc; +	lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ +	lw->weight -= dec; +	lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ +	lw->weight = w; +	lw->inv_weight = 0; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO                3 +#define WMULT_IDLEPRIO         1431655765 + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +static const int prio_to_weight[40] = { + /* -20 */     88761,     71755,     56483,     46273,     36291, + /* -15 */     29154,     23254,     18705,     14949,     11916, + /* -10 */      9548,      7620,      6100,      4904,      3906, + /*  -5 */      3121,      2501,      1991,      1586,      1277, + /*   0 */      1024,       820,       655,       526,       423, + /*   5 */       335,       272,       215,       172,       137, + /*  10 */       110,        87,        70,        56,        45, + /*  15 */        36,        29,        23,        18,        15, +}; + +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +static const u32 prio_to_wmult[40] = { + /* -20 */     48388,     59856,     76040,     92818,    118348, + /* -15 */    147320,    184698,    229616,    287308,    360437, + /* -10 */    449829,    563644,    704093,    875809,   1099582, + /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326, + /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587, + /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126, + /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717, + /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; + +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { +	CPUACCT_STAT_USER,	/* ... user mode */ +	CPUACCT_STAT_SYSTEM,	/* ... 
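A quick numerical check of the prio_to_weight[] and prio_to_wmult[] tables above: adjacent nice levels differ by a factor of roughly 1.25, which puts two busy tasks one nice level apart about ten percentage points of CPU apart, and each wmult entry is simply 2^32 divided by the corresponding weight. The numbers below are taken from the nice 0 and nice 1 rows of the tables:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Two CPU-bound tasks at nice 0 and nice 1, weights from prio_to_weight[]. */
	unsigned long w_nice0 = 1024, w_nice1 = 820;

	/* Each task's CPU share is weight / sum(weights): roughly 55% vs 45%,
	 * i.e. the "10% effect" described in the comment above. */
	printf("nice 0 share: %.1f%%\n", 100.0 * w_nice0 / (w_nice0 + w_nice1));
	printf("nice 1 share: %.1f%%\n", 100.0 * w_nice1 / (w_nice0 + w_nice1));

	/* prio_to_wmult[] holds 2^32 / weight so later divisions by the weight
	 * can be turned into a multiply plus a shift. */
	printf("2^32 / 1024 = %llu (table entry: 4194304)\n",
	       (unsigned long long)((1ULL << 32) / w_nice0));
	return 0;
}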
kernel mode */ + +	CPUACCT_STAT_NSTATS, +}; + + +#define sched_class_highest (&stop_sched_class) +#define for_each_class(class) \ +   for (class = sched_class_highest; class; class = class->next) + +extern const struct sched_class stop_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; + + +#ifdef CONFIG_SMP + +extern void trigger_load_balance(struct rq *rq, int cpu); +extern void idle_balance(int this_cpu, struct rq *this_rq); + +#else	/* CONFIG_SMP */ + +static inline void idle_balance(int cpu, struct rq *rq) +{ +} + +#endif + +extern void sysrq_sched_debug_show(void); +extern void sched_init_granularity(void); +extern void update_max_interval(void); +extern void update_group_power(struct sched_domain *sd, int cpu); +extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); +extern void init_sched_rt_class(void); +extern void init_sched_fair_class(void); + +extern void resched_task(struct task_struct *p); +extern void resched_cpu(int cpu); + +extern struct rt_bandwidth def_rt_bandwidth; +extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +extern void update_cpu_load(struct rq *this_rq); + +#ifdef CONFIG_CGROUP_CPUACCT +#include <linux/cgroup.h> +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { +	struct cgroup_subsys_state css; +	/* cpuusage holds pointer to a u64-type object on every cpu */ +	u64 __percpu *cpuusage; +	struct kernel_cpustat __percpu *cpustat; +}; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ +	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), +			    struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ +	return container_of(task_subsys_state(tsk, cpuacct_subsys_id), +			    struct cpuacct, css); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ +	if (!ca || !ca->css.cgroup->parent) +		return NULL; +	return cgroup_ca(ca->css.cgroup->parent); +} + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +#else +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +#endif + +static inline void inc_nr_running(struct rq *rq) +{ +	rq->nr_running++; +} + +static inline void dec_nr_running(struct rq *rq) +{ +	rq->nr_running--; +} + +extern void update_rq_clock(struct rq *rq); + +extern void activate_task(struct rq *rq, struct task_struct *p, int flags); +extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + +extern const_debug unsigned int sysctl_sched_time_avg; +extern const_debug unsigned int sysctl_sched_nr_migrate; +extern const_debug unsigned int sysctl_sched_migration_cost; + +static inline u64 sched_avg_period(void) +{ +	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +void calc_load_account_idle(struct rq *this_rq); + +#ifdef CONFIG_SCHED_HRTICK + +/* + * Use hrtick when: + *  - enabled by features + *  - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ +	if (!sched_feat(HRTICK)) +		return 0; +	if (!cpu_active(cpu_of(rq))) +		return 0; +	return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +void hrtick_start(struct rq *rq, u64 delay); + +#else + +static 
inline int hrtick_enabled(struct rq *rq) +{ +	return 0; +} + +#endif /* CONFIG_SCHED_HRTICK */ + +#ifdef CONFIG_SMP +extern void sched_avg_update(struct rq *rq); +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ +	rq->rt_avg += rt_delta; +	sched_avg_update(rq); +} +#else +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } +static inline void sched_avg_update(struct rq *rq) { } +#endif + +extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); + +#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT + +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); + +/* + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations.  This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below.  However, it + * also adds more overhead and therefore may reduce throughput. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) +	__releases(this_rq->lock) +	__acquires(busiest->lock) +	__acquires(this_rq->lock) +{ +	raw_spin_unlock(&this_rq->lock); +	double_rq_lock(this_rq, busiest); + +	return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry.  This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) +	__releases(this_rq->lock) +	__acquires(busiest->lock) +	__acquires(this_rq->lock) +{ +	int ret = 0; + +	if (unlikely(!raw_spin_trylock(&busiest->lock))) { +		if (busiest < this_rq) { +			raw_spin_unlock(&this_rq->lock); +			raw_spin_lock(&busiest->lock); +			raw_spin_lock_nested(&this_rq->lock, +					      SINGLE_DEPTH_NESTING); +			ret = 1; +		} else +			raw_spin_lock_nested(&busiest->lock, +					      SINGLE_DEPTH_NESTING); +	} +	return ret; +} + +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ +	if (unlikely(!irqs_disabled())) { +		/* printk() doesn't work good under rq->lock */ +		raw_spin_unlock(&this_rq->lock); +		BUG_ON(1); +	} + +	return _double_lock_balance(this_rq, busiest); +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) +	__releases(busiest->lock) +{ +	raw_spin_unlock(&busiest->lock); +	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. 
+ */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) +	__acquires(rq1->lock) +	__acquires(rq2->lock) +{ +	BUG_ON(!irqs_disabled()); +	if (rq1 == rq2) { +		raw_spin_lock(&rq1->lock); +		__acquire(rq2->lock);	/* Fake it out ;) */ +	} else { +		if (rq1 < rq2) { +			raw_spin_lock(&rq1->lock); +			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); +		} else { +			raw_spin_lock(&rq2->lock); +			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); +		} +	} +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) +	__releases(rq1->lock) +	__releases(rq2->lock) +{ +	raw_spin_unlock(&rq1->lock); +	if (rq1 != rq2) +		raw_spin_unlock(&rq2->lock); +	else +		__release(rq2->lock); +} + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) +	__acquires(rq1->lock) +	__acquires(rq2->lock) +{ +	BUG_ON(!irqs_disabled()); +	BUG_ON(rq1 != rq2); +	raw_spin_lock(&rq1->lock); +	__acquire(rq2->lock);	/* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) +	__releases(rq1->lock) +	__releases(rq2->lock) +{ +	BUG_ON(rq1 != rq2); +	raw_spin_unlock(&rq1->lock); +	__release(rq2->lock); +} + +#endif + +extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); +extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); +extern void print_cfs_stats(struct seq_file *m, int cpu); +extern void print_rt_stats(struct seq_file *m, int cpu); + +extern void init_cfs_rq(struct cfs_rq *cfs_rq); +extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); +extern void unthrottle_offline_cfs_rqs(struct rq *rq); + +extern void account_cfs_bandwidth_used(int enabled, int was_enabled); + +#ifdef CONFIG_NO_HZ +enum rq_nohz_flag_bits { +	NOHZ_TICK_STOPPED, +	NOHZ_BALANCE_KICK, +	NOHZ_IDLE, +}; + +#define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags) +#endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c new file mode 100644 index 00000000000..2a581ba8e19 --- /dev/null +++ b/kernel/sched/stats.c @@ -0,0 +1,111 @@ + +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "sched.h" + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 15 + +static int show_schedstat(struct seq_file *seq, void *v) +{ +	int cpu; +	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; +	char *mask_str = kmalloc(mask_len, GFP_KERNEL); + +	if (mask_str == NULL) +		return -ENOMEM; + +	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); +	seq_printf(seq, "timestamp %lu\n", jiffies); +	for_each_online_cpu(cpu) { +		struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP +		struct sched_domain *sd; +		int dcount = 0; +#endif + +		/* runqueue-specific stats */ +		seq_printf(seq, +		    "cpu%d %u %u %u %u %u %u %llu %llu %lu", +		    cpu, rq->yld_count, +		    rq->sched_switch, rq->sched_count, rq->sched_goidle, +		    rq->ttwu_count, rq->ttwu_local, +		    
rq->rq_cpu_time, +		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); + +		seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP +		/* domain-specific stats */ +		rcu_read_lock(); +		for_each_domain(cpu, sd) { +			enum cpu_idle_type itype; + +			cpumask_scnprintf(mask_str, mask_len, +					  sched_domain_span(sd)); +			seq_printf(seq, "domain%d %s", dcount++, mask_str); +			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; +					itype++) { +				seq_printf(seq, " %u %u %u %u %u %u %u %u", +				    sd->lb_count[itype], +				    sd->lb_balanced[itype], +				    sd->lb_failed[itype], +				    sd->lb_imbalance[itype], +				    sd->lb_gained[itype], +				    sd->lb_hot_gained[itype], +				    sd->lb_nobusyq[itype], +				    sd->lb_nobusyg[itype]); +			} +			seq_printf(seq, +				   " %u %u %u %u %u %u %u %u %u %u %u %u\n", +			    sd->alb_count, sd->alb_failed, sd->alb_pushed, +			    sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, +			    sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, +			    sd->ttwu_wake_remote, sd->ttwu_move_affine, +			    sd->ttwu_move_balance); +		} +		rcu_read_unlock(); +#endif +	} +	kfree(mask_str); +	return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ +	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); +	char *buf = kmalloc(size, GFP_KERNEL); +	struct seq_file *m; +	int res; + +	if (!buf) +		return -ENOMEM; +	res = single_open(file, show_schedstat, NULL); +	if (!res) { +		m = file->private_data; +		m->buf = buf; +		m->size = size; +	} else +		kfree(buf); +	return res; +} + +static const struct file_operations proc_schedstat_operations = { +	.open    = schedstat_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = single_release, +}; + +static int __init proc_schedstat_init(void) +{ +	proc_create("schedstat", 0, NULL, &proc_schedstat_operations); +	return 0; +} +module_init(proc_schedstat_init); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h new file mode 100644 index 00000000000..2ef90a51ec5 --- /dev/null +++ b/kernel/sched/stats.h @@ -0,0 +1,231 @@ + +#ifdef CONFIG_SCHEDSTATS + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{ +	if (rq) { +		rq->rq_sched_info.run_delay += delta; +		rq->rq_sched_info.pcount++; +	} +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{ +	if (rq) +		rq->rq_cpu_time += delta; +} + +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{ +	if (rq) +		rq->rq_sched_info.run_delay += delta; +} +# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0) +# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0) +# define schedstat_set(var, val)	do { var = (val); } while (0) +#else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{} +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{} +# define schedstat_inc(rq, field)	do { } while (0) +# define schedstat_add(rq, field, amt)	do { } while (0) +# define schedstat_set(var, val)	do { } while (0) +#endif + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +static inline void sched_info_reset_dequeued(struct task_struct *t) +{ +	t->sched_info.last_queued = 0; +} + +/* + 
* We are interested in knowing how long it was from the *first* time a + * task was queued to the time that it finally hit a cpu, we call this routine + * from dequeue_task() to account for possible rq->clock skew across cpus. The + * delta taken on each cpu would annul the skew. + */ +static inline void sched_info_dequeued(struct task_struct *t) +{ +	unsigned long long now = task_rq(t)->clock, delta = 0; + +	if (unlikely(sched_info_on())) +		if (t->sched_info.last_queued) +			delta = now - t->sched_info.last_queued; +	sched_info_reset_dequeued(t); +	t->sched_info.run_delay += delta; + +	rq_sched_info_dequeued(task_rq(t), delta); +} + +/* + * Called when a task finally hits the cpu.  We can now calculate how + * long it was waiting to run.  We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static void sched_info_arrive(struct task_struct *t) +{ +	unsigned long long now = task_rq(t)->clock, delta = 0; + +	if (t->sched_info.last_queued) +		delta = now - t->sched_info.last_queued; +	sched_info_reset_dequeued(t); +	t->sched_info.run_delay += delta; +	t->sched_info.last_arrival = now; +	t->sched_info.pcount++; + +	rq_sched_info_arrive(task_rq(t), delta); +} + +/* + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set.  It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(struct task_struct *t) +{ +	if (unlikely(sched_info_on())) +		if (!t->sched_info.last_queued) +			t->sched_info.last_queued = task_rq(t)->clock; +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily.  Now we can calculate how long we ran. + * Also, if the process is still in the TASK_RUNNING state, call + * sched_info_queued() to mark that it has now again started waiting on + * the runqueue. + */ +static inline void sched_info_depart(struct task_struct *t) +{ +	unsigned long long delta = task_rq(t)->clock - +					t->sched_info.last_arrival; + +	rq_sched_info_depart(task_rq(t), delta); + +	if (t->state == TASK_RUNNING) +		sched_info_queued(t); +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice.  (This may also be called when switching to or from + * the idle task.)  We are only called when prev != next. + */ +static inline void +__sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ +	struct rq *rq = task_rq(prev); + +	/* +	 * prev now departs the cpu.  It's not interesting to record +	 * stats about how efficient we were at scheduling the idle +	 * process, however. +	 */ +	if (prev != rq->idle) +		sched_info_depart(prev); + +	if (next != rq->idle) +		sched_info_arrive(next); +} +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ +	if (unlikely(sched_info_on())) +		__sched_info_switch(prev, next); +} +#else +#define sched_info_queued(t)			do { } while (0) +#define sched_info_reset_dequeued(t)	do { } while (0) +#define sched_info_dequeued(t)			do { } while (0) +#define sched_info_switch(t, next)		do { } while (0) +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick.  None of this depends + * on CONFIG_SCHEDSTATS. + */ + +/** + * account_group_user_time - Maintain utime for a thread group. 
+ * + * @tsk:	Pointer to task structure. + * @cputime:	Time value by which to increment the utime field of the + *		thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void account_group_user_time(struct task_struct *tsk, +					   cputime_t cputime) +{ +	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + +	if (!cputimer->running) +		return; + +	raw_spin_lock(&cputimer->lock); +	cputimer->cputime.utime += cputime; +	raw_spin_unlock(&cputimer->lock); +} + +/** + * account_group_system_time - Maintain stime for a thread group. + * + * @tsk:	Pointer to task structure. + * @cputime:	Time value by which to increment the stime field of the + *		thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void account_group_system_time(struct task_struct *tsk, +					     cputime_t cputime) +{ +	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + +	if (!cputimer->running) +		return; + +	raw_spin_lock(&cputimer->lock); +	cputimer->cputime.stime += cputime; +	raw_spin_unlock(&cputimer->lock); +} + +/** + * account_group_exec_runtime - Maintain exec runtime for a thread group. + * + * @tsk:	Pointer to task structure. + * @ns:		Time value by which to increment the sum_exec_runtime field + *		of the thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. + */ +static inline void account_group_exec_runtime(struct task_struct *tsk, +					      unsigned long long ns) +{ +	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + +	if (!cputimer->running) +		return; + +	raw_spin_lock(&cputimer->lock); +	cputimer->cputime.sum_exec_runtime += ns; +	raw_spin_unlock(&cputimer->lock); +} diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c new file mode 100644 index 00000000000..7b386e86fd2 --- /dev/null +++ b/kernel/sched/stop_task.c @@ -0,0 +1,108 @@ +#include "sched.h" + +/* + * stop-task scheduling class. + * + * The stop task is the highest priority task in the system; it preempts + * everything and will be preempted by nothing. + * + * See kernel/stop_machine.c + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) +{ +	return task_cpu(p); /* stop tasks never migrate */ +} +#endif /* CONFIG_SMP */ + +static void +check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +{ +	/* we're never preempted */ +} + +static struct task_struct *pick_next_task_stop(struct rq *rq) +{ +	struct task_struct *stop = rq->stop; + +	if (stop && stop->on_rq) +		return stop; + +	return NULL; +} + +static void +enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +	inc_nr_running(rq); +} + +static void +dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +	dec_nr_running(rq); +} + +static void yield_task_stop(struct rq *rq) +{ +	BUG(); /* the stop task should never yield, it's pointless. 
*/ +} + +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +{ +} + +static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_stop(struct rq *rq) +{ +} + +static void switched_to_stop(struct rq *rq, struct task_struct *p) +{ +	BUG(); /* it's impossible to change to this class */ +} + +static void +prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) +{ +	BUG(); /* how!?, what priority? */ +} + +static unsigned int +get_rr_interval_stop(struct rq *rq, struct task_struct *task) +{ +	return 0; +} + +/* + * Simple, special scheduling class for the per-CPU stop tasks: + */ +const struct sched_class stop_sched_class = { +	.next			= &rt_sched_class, + +	.enqueue_task		= enqueue_task_stop, +	.dequeue_task		= dequeue_task_stop, +	.yield_task		= yield_task_stop, + +	.check_preempt_curr	= check_preempt_curr_stop, + +	.pick_next_task		= pick_next_task_stop, +	.put_prev_task		= put_prev_task_stop, + +#ifdef CONFIG_SMP +	.select_task_rq		= select_task_rq_stop, +#endif + +	.set_curr_task          = set_curr_task_stop, +	.task_tick		= task_tick_stop, + +	.get_rr_interval	= get_rr_interval_stop, + +	.prio_changed		= prio_changed_stop, +	.switched_to		= switched_to_stop, +};
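
The prio_to_weight[] and prio_to_wmult[] tables added to sched.h above encode the rule spelled out in their comments: each nice level scales a task's load weight by about 1.25 (roughly a 10% CPU change per level), with nice 0 pinned at 1024, and prio_to_wmult[] caches 2^32 divided by each weight so hot paths can turn divisions into multiplications. The stand-alone C sketch below is not part of the diff; the file name and rounding are illustrative assumptions, and it simply recomputes both columns from that rule so the hand-rounded table entries can be sanity-checked.

/*
 * weights.c - illustrative sketch only, not part of the kernel patch above.
 * Recomputes approximate prio_to_weight[] / prio_to_wmult[] entries from the
 * documented rule: weight(nice) = 1024 / 1.25^nice, wmult = 2^32 / weight.
 * Build with:  gcc -o weights weights.c -lm
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
	for (int nice = -20; nice <= 19; nice++) {
		double weight = 1024.0 / pow(1.25, nice);	/* NICE_0_LOAD == 1024 */
		double wmult  = 4294967296.0 / weight;		/* 2^32 / weight */

		printf("nice %3d  weight ~%8.0f  wmult ~%11.0f\n",
		       nice, weight, wmult);
	}
	return 0;
}

At nice 0 this prints weight 1024 and wmult 4194304, exactly matching the tables; entries further from nice 0 (for example 88761 at nice -20) drift by a fraction of a percent because the in-kernel values were hand-rounded.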