Diffstat (limited to 'kernel')
-rw-r--r--  kernel/events/core.c           17
-rw-r--r--  kernel/hrtimer.c                4
-rw-r--r--  kernel/posix-cpu-timers.c      76
-rw-r--r--  kernel/rcutree.c               16
-rw-r--r--  kernel/rcutree.h                2
-rw-r--r--  kernel/rcutree_plugin.h        33
-rw-r--r--  kernel/sched/core.c            92
-rw-r--r--  kernel/sched/fair.c            10
-rw-r--r--  kernel/sched/idle_task.c        1
-rw-r--r--  kernel/sched/sched.h           25
-rw-r--r--  kernel/softirq.c               19
-rw-r--r--  kernel/time/Kconfig            80
-rw-r--r--  kernel/time/tick-broadcast.c    3
-rw-r--r--  kernel/time/tick-common.c       5
-rw-r--r--  kernel/time/tick-sched.c      296
-rw-r--r--  kernel/timer.c                 16
16 files changed, 607 insertions(+), 88 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c index 3820e3cefba..6b41c1899a8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -18,6 +18,7 @@  #include <linux/poll.h>  #include <linux/slab.h>  #include <linux/hash.h> +#include <linux/tick.h>  #include <linux/sysfs.h>  #include <linux/dcache.h>  #include <linux/percpu.h> @@ -685,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu)  	WARN_ON(!irqs_disabled()); -	if (list_empty(&cpuctx->rotation_list)) +	if (list_empty(&cpuctx->rotation_list)) { +		int was_empty = list_empty(head);  		list_add(&cpuctx->rotation_list, head); +		if (was_empty) +			tick_nohz_full_kick(); +	}  }  static void get_ctx(struct perf_event_context *ctx) @@ -2591,6 +2596,16 @@ done:  		list_del_init(&cpuctx->rotation_list);  } +#ifdef CONFIG_NO_HZ_FULL +bool perf_event_can_stop_tick(void) +{ +	if (list_empty(&__get_cpu_var(rotation_list))) +		return true; +	else +		return false; +} +#endif +  void perf_event_task_tick(void)  {  	struct list_head *head = &__get_cpu_var(rotation_list); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 609d8ff38b7..fd4b13b131f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -172,7 +172,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,   */  static int hrtimer_get_target(int this_cpu, int pinned)  { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))  		return get_nohz_timer_target();  #endif @@ -1125,7 +1125,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)  }  EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  /**   * hrtimer_get_next_event - get the time until next expiry event   * diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 8fd709c9bb5..42670e9b44e 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -10,6 +10,8 @@  #include <linux/kernel_stat.h>  #include <trace/events/timer.h>  #include <linux/random.h> +#include <linux/tick.h> +#include <linux/workqueue.h>  /*   * Called after updating RLIMIT_CPU to run cpu timer and update @@ -153,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer,  	}  } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime:	The struct to compare. + * + * Checks @cputime to see if all fields are zero.  Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ +	if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) +		return 1; +	return 0; +} +  static inline cputime_t prof_ticks(struct task_struct *p)  {  	cputime_t utime, stime; @@ -636,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock,  	return 0;  } +#ifdef CONFIG_NO_HZ_FULL +static void nohz_kick_work_fn(struct work_struct *work) +{ +	tick_nohz_full_kick_all(); +} + +static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); + +/* + * We need the IPIs to be sent from sane process context. + * The posix cpu timers are always set with irqs disabled. 
+ */ +static void posix_cpu_timer_kick_nohz(void) +{ +	schedule_work(&nohz_kick_work); +} + +bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) +{ +	if (!task_cputime_zero(&tsk->cputime_expires)) +		return false; + +	if (tsk->signal->cputimer.running) +		return false; + +	return true; +} +#else +static inline void posix_cpu_timer_kick_nohz(void) { } +#endif +  /*   * Guts of sys_timer_settime for CPU timers.   * This is called with the timer locked and interrupts disabled. @@ -794,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,  		sample_to_timespec(timer->it_clock,  				   old_incr, &old->it_interval);  	} +	if (!ret) +		posix_cpu_timer_kick_nohz();  	return ret;  } @@ -1008,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,  	}  } -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime:	The struct to compare. - * - * Checks @cputime to see if all fields are zero.  Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) -{ -	if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) -		return 1; -	return 0; -} -  /*   * Check for any per-thread CPU timers that have fired and move them   * off the tsk->*_timers list onto the firing list.  Per-thread timers @@ -1336,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk)  			cpu_timer_fire(timer);  		spin_unlock(&timer->it_lock);  	} + +	/* +	 * In case some timers were rescheduled after the queue got emptied, +	 * wake up full dynticks CPUs. +	 */ +	if (tsk->signal->cputimer.running) +		posix_cpu_timer_kick_nohz();  }  /* @@ -1366,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,  		}  		if (!*newval) -			return; +			goto out;  		*newval += now.cpu;  	} @@ -1384,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,  			tsk->signal->cputime_expires.virt_exp = *newval;  		break;  	} +out: +	posix_cpu_timer_kick_nohz();  }  static int do_cpu_nanosleep(const clockid_t which_clock, int flags, diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d8534308fd0..16ea6792501 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -799,6 +799,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)  		rdp->offline_fqs++;  		return 1;  	} + +	/* +	 * There is a possibility that a CPU in adaptive-ticks state +	 * might run in the kernel with the scheduling-clock tick disabled +	 * for an extended time period.  Invoke rcu_kick_nohz_cpu() to +	 * force the CPU to restart the scheduling-clock tick in this +	 * CPU is in this state. +	 */ +	rcu_kick_nohz_cpu(rdp->cpu); +  	return 0;  } @@ -1820,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,  			  struct rcu_node *rnp, struct rcu_data *rdp)  {  	/* No-CBs CPUs do not have orphanable callbacks. */ -	if (is_nocb_cpu(rdp->cpu)) +	if (rcu_is_nocb_cpu(rdp->cpu))  		return;  	/* @@ -2892,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp)  	 * corresponding CPU's preceding callbacks have been invoked.  	 
*/  	for_each_possible_cpu(cpu) { -		if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) +		if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))  			continue;  		rdp = per_cpu_ptr(rsp->rda, cpu); -		if (is_nocb_cpu(cpu)) { +		if (rcu_is_nocb_cpu(cpu)) {  			_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,  					   rsp->n_barrier_done);  			atomic_inc(&rsp->barrier_cpu_count); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14ee40795d6..da77a8f57ff 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -530,13 +530,13 @@ static int rcu_nocb_needs_gp(struct rcu_state *rsp);  static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);  static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);  static void rcu_init_one_nocb(struct rcu_node *rnp); -static bool is_nocb_cpu(int cpu);  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,  			    bool lazy);  static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,  				      struct rcu_data *rdp);  static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);  static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); +static void rcu_kick_nohz_cpu(int cpu);  static bool init_nocb_callback_list(struct rcu_data *rdp);  #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index d084ae3f281..170814dc418 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,6 +28,7 @@  #include <linux/gfp.h>  #include <linux/oom.h>  #include <linux/smpboot.h> +#include <linux/tick.h>  #define RCU_KTHREAD_PRIO 1 @@ -1705,7 +1706,7 @@ static void rcu_prepare_for_idle(int cpu)  		return;  	/* If this is a no-CBs CPU, no callbacks, just return. */ -	if (is_nocb_cpu(cpu)) +	if (rcu_is_nocb_cpu(cpu))  		return;  	/* @@ -1747,7 +1748,7 @@ static void rcu_cleanup_after_idle(int cpu)  	struct rcu_data *rdp;  	struct rcu_state *rsp; -	if (is_nocb_cpu(cpu)) +	if (rcu_is_nocb_cpu(cpu))  		return;  	rcu_try_advance_all_cbs();  	for_each_rcu_flavor(rsp) { @@ -2052,7 +2053,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)  }  /* Is the specified CPU a no-CPUs CPU? */ -static bool is_nocb_cpu(int cpu) +bool rcu_is_nocb_cpu(int cpu)  {  	if (have_rcu_nocb_mask)  		return cpumask_test_cpu(cpu, rcu_nocb_mask); @@ -2110,7 +2111,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,  			    bool lazy)  { -	if (!is_nocb_cpu(rdp->cpu)) +	if (!rcu_is_nocb_cpu(rdp->cpu))  		return 0;  	__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);  	if (__is_kfree_rcu_offset((unsigned long)rhp->func)) @@ -2134,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,  	long qll = rsp->qlen_lazy;  	/* If this is not a no-CBs CPU, tell the caller to do it the old way. */ -	if (!is_nocb_cpu(smp_processor_id())) +	if (!rcu_is_nocb_cpu(smp_processor_id()))  		return 0;  	rsp->qlen = 0;  	rsp->qlen_lazy = 0; @@ -2306,11 +2307,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)  {  } -static bool is_nocb_cpu(int cpu) -{ -	return false; -} -  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,  			    bool lazy)  { @@ -2337,3 +2333,20 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)  }  #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ + +/* + * An adaptive-ticks CPU can potentially execute in kernel mode for an + * arbitrarily long period of time with the scheduling-clock tick turned + * off.  
RCU will be paying attention to this CPU because it is in the + * kernel, but the CPU cannot be guaranteed to be executing the RCU state + * machine because the scheduling-clock tick has been disabled.  Therefore, + * if an adaptive-ticks CPU is failing to respond to the current grace + * period and has not be idle from an RCU perspective, kick it. + */ +static void rcu_kick_nohz_cpu(int cpu) +{ +#ifdef CONFIG_NO_HZ_FULL +	if (tick_nohz_full_cpu(cpu)) +		smp_send_reschedule(cpu); +#endif /* #ifdef CONFIG_NO_HZ_FULL */ +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5662f58f0b6..58453b8272f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -544,7 +544,7 @@ void resched_cpu(int cpu)  	raw_spin_unlock_irqrestore(&rq->lock, flags);  } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  /*   * In the semi idle case, use the nearest busy cpu for migrating timers   * from an idle cpu.  This is good for power-savings. @@ -582,7 +582,7 @@ unlock:   * account when the CPU goes back to idle and evaluates the timer   * wheel for the next timer event.   */ -void wake_up_idle_cpu(int cpu) +static void wake_up_idle_cpu(int cpu)  {  	struct rq *rq = cpu_rq(cpu); @@ -612,20 +612,56 @@ void wake_up_idle_cpu(int cpu)  		smp_send_reschedule(cpu);  } +static bool wake_up_full_nohz_cpu(int cpu) +{ +	if (tick_nohz_full_cpu(cpu)) { +		if (cpu != smp_processor_id() || +		    tick_nohz_tick_stopped()) +			smp_send_reschedule(cpu); +		return true; +	} + +	return false; +} + +void wake_up_nohz_cpu(int cpu) +{ +	if (!wake_up_full_nohz_cpu(cpu)) +		wake_up_idle_cpu(cpu); +} +  static inline bool got_nohz_idle_kick(void)  {  	int cpu = smp_processor_id();  	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));  } -#else /* CONFIG_NO_HZ */ +#else /* CONFIG_NO_HZ_COMMON */  static inline bool got_nohz_idle_kick(void)  {  	return false;  } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ + +#ifdef CONFIG_NO_HZ_FULL +bool sched_can_stop_tick(void) +{ +       struct rq *rq; + +       rq = this_rq(); + +       /* Make sure rq->nr_running update is visible after the IPI */ +       smp_rmb(); + +       /* More than one running task need preemption */ +       if (rq->nr_running > 1) +               return false; + +       return true; +} +#endif /* CONFIG_NO_HZ_FULL */  void sched_avg_update(struct rq *rq)  { @@ -1357,7 +1393,8 @@ static void sched_ttwu_pending(void)  void scheduler_ipi(void)  { -	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) +	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() +	    && !tick_nohz_full_cpu(smp_processor_id()))  		return;  	/* @@ -1374,6 +1411,7 @@ void scheduler_ipi(void)  	 * somewhat pessimize the simple resched case.  	 */  	irq_enter(); +	tick_nohz_full_check();  	sched_ttwu_pending();  	/* @@ -1855,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  		kprobe_flush_task(prev);  		put_task_struct(prev);  	} + +	tick_nohz_task_switch(current);  }  #ifdef CONFIG_SMP @@ -2118,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)  	return load >> FSHIFT;  } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  /*   * Handle NO_HZ for the global load-average.   
* @@ -2344,12 +2384,12 @@ static void calc_global_nohz(void)  	smp_wmb();  	calc_load_idx++;  } -#else /* !CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */  static inline long calc_load_fold_idle(void) { return 0; }  static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */  /*   * calc_load - update the avenrun load estimates 10 ticks after the @@ -2509,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,  	sched_avg_update(this_rq);  } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  /*   * There is no sane way to deal with nohz on smp when using jiffies because the   * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -2569,7 +2609,7 @@ void update_cpu_load_nohz(void)  	}  	raw_spin_unlock(&this_rq->lock);  } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */  /*   * Called from scheduler_tick() @@ -2696,7 +2736,34 @@ void scheduler_tick(void)  	rq->idle_balance = idle_cpu(cpu);  	trigger_load_balance(rq, cpu);  #endif +	rq_last_tick_reset(rq); +} + +#ifdef CONFIG_NO_HZ_FULL +/** + * scheduler_tick_max_deferment + * + * Keep at least one tick per second when a single + * active task is running because the scheduler doesn't + * yet completely support full dynticks environment. + * + * This makes sure that uptime, CFS vruntime, load + * balancing, etc... continue to move forward, even + * with a very low granularity. + */ +u64 scheduler_tick_max_deferment(void) +{ +	struct rq *rq = this_rq(); +	unsigned long next, now = ACCESS_ONCE(jiffies); + +	next = rq->last_sched_tick + HZ; + +	if (time_before_eq(next, now)) +		return 0; + +	return jiffies_to_usecs(next - now) * NSEC_PER_USEC;  } +#endif  notrace unsigned long get_parent_ip(unsigned long addr)  { @@ -6951,9 +7018,12 @@ void __init sched_init(void)  		INIT_LIST_HEAD(&rq->cfs_tasks);  		rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  		rq->nohz_flags = 0;  #endif +#ifdef CONFIG_NO_HZ_FULL +		rq->last_sched_tick = 0; +#endif  #endif  		init_rq_hrtick(rq);  		atomic_set(&rq->nr_iowait, 0); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8bf7081b1ec..c61a614465c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5355,7 +5355,7 @@ out_unlock:  	return 0;  } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  /*   * idle load balancing details   * - When one of the busy CPUs notice that there may be an idle rebalancing @@ -5572,9 +5572,9 @@ out:  		rq->next_balance = next_balance;  } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  /* - * In CONFIG_NO_HZ case, the idle balance kickee will do the + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the   * rebalancing for all the cpus for whom scheduler ticks are stopped.   
*/  static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) @@ -5717,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu)  	if (time_after_eq(jiffies, rq->next_balance) &&  	    likely(!on_null_domain(cpu)))  		raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))  		nohz_balancer_kick(cpu);  #endif @@ -6187,7 +6187,7 @@ __init void init_sched_fair_class(void)  #ifdef CONFIG_SMP  	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  	nohz.next_balance = jiffies;  	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);  	cpu_notifier(sched_ilb_notifier, 0); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b8ce7732834..d8da01008d3 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -17,6 +17,7 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)  static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)  {  	idle_exit_fair(rq); +	rq_last_tick_reset(rq);  }  static void post_schedule_idle(struct rq *rq) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4c225c4c711..ce39224d615 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@  #include <linux/mutex.h>  #include <linux/spinlock.h>  #include <linux/stop_machine.h> +#include <linux/tick.h>  #include "cpupri.h"  #include "cpuacct.h" @@ -405,10 +406,13 @@ struct rq {  	#define CPU_LOAD_IDX_MAX 5  	unsigned long cpu_load[CPU_LOAD_IDX_MAX];  	unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  	u64 nohz_stamp;  	unsigned long nohz_flags;  #endif +#ifdef CONFIG_NO_HZ_FULL +	unsigned long last_sched_tick; +#endif  	int skip_clock_update;  	/* capture load from *all* tasks on this cpu: */ @@ -1072,6 +1076,16 @@ static inline u64 steal_ticks(u64 steal)  static inline void inc_nr_running(struct rq *rq)  {  	rq->nr_running++; + +#ifdef CONFIG_NO_HZ_FULL +	if (rq->nr_running == 2) { +		if (tick_nohz_full_cpu(rq->cpu)) { +			/* Order rq->nr_running write against the IPI */ +			smp_wmb(); +			smp_send_reschedule(rq->cpu); +		} +       } +#endif  }  static inline void dec_nr_running(struct rq *rq) @@ -1079,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq)  	rq->nr_running--;  } +static inline void rq_last_tick_reset(struct rq *rq) +{ +#ifdef CONFIG_NO_HZ_FULL +	rq->last_sched_tick = jiffies; +#endif +} +  extern void update_rq_clock(struct rq *rq);  extern void activate_task(struct rq *rq, struct task_struct *p, int flags); @@ -1299,7 +1320,7 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);  extern void account_cfs_bandwidth_used(int enabled, int was_enabled); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  enum rq_nohz_flag_bits {  	NOHZ_TICK_STOPPED,  	NOHZ_BALANCE_KICK, diff --git a/kernel/softirq.c b/kernel/softirq.c index aa82723c720..b5197dcb0da 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -329,6 +329,19 @@ static inline void invoke_softirq(void)  		wakeup_softirqd();  } +static inline void tick_irq_exit(void) +{ +#ifdef CONFIG_NO_HZ_COMMON +	int cpu = smp_processor_id(); + +	/* Make sure that timer wheel updates are propagated */ +	if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { +		if (!in_interrupt()) +			tick_nohz_irq_exit(); +	} +#endif +} +  /*   * Exit an interrupt context. 
Process softirqs if needed and possible:   */ @@ -346,11 +359,7 @@ void irq_exit(void)  	if (!in_interrupt() && local_softirq_pending())  		invoke_softirq(); -#ifdef CONFIG_NO_HZ -	/* Make sure that timer wheel updates are propagated */ -	if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) -		tick_nohz_irq_exit(); -#endif +	tick_irq_exit();  	rcu_irq_exit();  } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d84efd..e4c07b0692b 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -64,20 +64,88 @@ config GENERIC_CMOS_UPDATE  if GENERIC_CLOCKEVENTS  menu "Timers subsystem" -# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is  # only related to the tick functionality. Oneshot clockevent devices  # are supported independ of this.  config TICK_ONESHOT  	bool -config NO_HZ -	bool "Tickless System (Dynamic Ticks)" +config NO_HZ_COMMON +	bool  	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS  	select TICK_ONESHOT + +choice +	prompt "Timer tick handling" +	default NO_HZ_IDLE if NO_HZ + +config HZ_PERIODIC +	bool "Periodic timer ticks (constant rate, no dynticks)" +	help +	  This option keeps the tick running periodically at a constant +	  rate, even when the CPU doesn't need it. + +config NO_HZ_IDLE +	bool "Idle dynticks system (tickless idle)" +	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS +	select NO_HZ_COMMON +	help +	  This option enables a tickless idle system: timer interrupts +	  will only trigger on an as-needed basis when the system is idle. +	  This is usually interesting for energy saving. + +	  Most of the time you want to say Y here. + +config NO_HZ_FULL +	bool "Full dynticks system (tickless)" +	# NO_HZ_COMMON dependency +	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS +	# We need at least one periodic CPU for timekeeping +	depends on SMP +	# RCU_USER_QS dependency +	depends on HAVE_CONTEXT_TRACKING +	# VIRT_CPU_ACCOUNTING_GEN dependency +	depends on 64BIT +	select NO_HZ_COMMON +	select RCU_USER_QS +	select RCU_NOCB_CPU +	select VIRT_CPU_ACCOUNTING_GEN +	select CONTEXT_TRACKING_FORCE +	select IRQ_WORK +	help +	 Adaptively try to shutdown the tick whenever possible, even when +	 the CPU is running tasks. Typically this requires running a single +	 task on the CPU. Chances for running tickless are maximized when +	 the task mostly runs in userspace and has few kernel activity. + +	 You need to fill up the nohz_full boot parameter with the +	 desired range of dynticks CPUs. + +	 This is implemented at the expense of some overhead in user <-> kernel +	 transitions: syscalls, exceptions and interrupts. Even when it's +	 dynamically off. + +	 Say N. + +endchoice + +config NO_HZ_FULL_ALL +       bool "Full dynticks system on all CPUs by default" +       depends on NO_HZ_FULL +       help +         If the user doesn't pass the nohz_full boot option to +	 define the range of full dynticks CPUs, consider that all +	 CPUs in the system are full dynticks by default. +	 Note the boot CPU will still be kept outside the range to +	 handle the timekeeping duty. + +config NO_HZ +	bool "Old Idle dynticks config" +	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS  	help -	  This option enables a tickless system: timer interrupts will -	  only trigger on an as-needed basis both when the system is -	  busy and when the system is idle. +	  This is the old config entry that enables dynticks idle. 
+	  We keep it around for a little while to enforce backward +	  compatibility with older config files.  config HIGH_RES_TIMERS  	bool "High Resolution Timer Support" diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 61d00a8cdf2..206bbfb34e0 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -693,7 +693,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)  		bc->event_handler = tick_handle_oneshot_broadcast;  		/* Take the do_timer update */ -		tick_do_timer_cpu = cpu; +		if (!tick_nohz_full_cpu(cpu)) +			tick_do_timer_cpu = cpu;  		/*  		 * We must be careful here. There might be other CPUs diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 6176a3e4570..5d3fb100bc0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td,  		 * this cpu:  		 */  		if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { -			tick_do_timer_cpu = cpu; +			if (!tick_nohz_full_cpu(cpu)) +				tick_do_timer_cpu = cpu; +			else +				tick_do_timer_cpu = TICK_DO_TIMER_NONE;  			tick_next_period = ktime_get();  			tick_period = ktime_set(0, NSEC_PER_SEC / HZ);  		} diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 225f8bf1909..bc67d4245e1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -21,11 +21,15 @@  #include <linux/sched.h>  #include <linux/module.h>  #include <linux/irq_work.h> +#include <linux/posix-timers.h> +#include <linux/perf_event.h>  #include <asm/irq_regs.h>  #include "tick-internal.h" +#include <trace/events/timer.h> +  /*   * Per cpu nohz control structure   */ @@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now)  {  	int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  	/*  	 * Check if the do_timer duty was dropped. We don't care about  	 * concurrency: This happens only when the cpu in charge went @@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now)  	 * this duty, then the jiffies update is still serialized by  	 * jiffies_lock.  	 */ -	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) +	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) +	    && !tick_nohz_full_cpu(cpu))  		tick_do_timer_cpu = cpu;  #endif @@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now)  static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)  { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  	/*  	 * When we are idle and the tick is stopped, we have to touch  	 * the watchdog as we might not schedule for a really long @@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)  	profile_tick(CPU_PROFILING);  } +#ifdef CONFIG_NO_HZ_FULL +static cpumask_var_t nohz_full_mask; +bool have_nohz_full_mask; + +static bool can_stop_full_tick(void) +{ +	WARN_ON_ONCE(!irqs_disabled()); + +	if (!sched_can_stop_tick()) { +		trace_tick_stop(0, "more than 1 task in runqueue\n"); +		return false; +	} + +	if (!posix_cpu_timers_can_stop_tick(current)) { +		trace_tick_stop(0, "posix timers running\n"); +		return false; +	} + +	if (!perf_event_can_stop_tick()) { +		trace_tick_stop(0, "perf events running\n"); +		return false; +	} + +	/* sched_clock_tick() needs us? */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +	/* +	 * TODO: kick full dynticks CPUs when +	 * sched_clock_stable is set. 
+	 */ +	if (!sched_clock_stable) { +		trace_tick_stop(0, "unstable sched clock\n"); +		return false; +	} +#endif + +	return true; +} + +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); + +/* + * Re-evaluate the need for the tick on the current CPU + * and restart it if necessary. + */ +void tick_nohz_full_check(void) +{ +	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + +	if (tick_nohz_full_cpu(smp_processor_id())) { +		if (ts->tick_stopped && !is_idle_task(current)) { +			if (!can_stop_full_tick()) +				tick_nohz_restart_sched_tick(ts, ktime_get()); +		} +	} +} + +static void nohz_full_kick_work_func(struct irq_work *work) +{ +	tick_nohz_full_check(); +} + +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { +	.func = nohz_full_kick_work_func, +}; + +/* + * Kick the current CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick(void) +{ +	if (tick_nohz_full_cpu(smp_processor_id())) +		irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + +static void nohz_full_kick_ipi(void *info) +{ +	tick_nohz_full_check(); +} + +/* + * Kick all full dynticks CPUs in order to force these to re-evaluate + * their dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_all(void) +{ +	if (!have_nohz_full_mask) +		return; + +	preempt_disable(); +	smp_call_function_many(nohz_full_mask, +			       nohz_full_kick_ipi, NULL, false); +	preempt_enable(); +} + +/* + * Re-evaluate the need for the tick as we switch the current task. + * It might need the tick due to per task/process properties: + * perf events, posix cpu timers, ... + */ +void tick_nohz_task_switch(struct task_struct *tsk) +{ +	unsigned long flags; + +	local_irq_save(flags); + +	if (!tick_nohz_full_cpu(smp_processor_id())) +		goto out; + +	if (tick_nohz_tick_stopped() && !can_stop_full_tick()) +		tick_nohz_full_kick(); + +out: +	local_irq_restore(flags); +} + +int tick_nohz_full_cpu(int cpu) +{ +	if (!have_nohz_full_mask) +		return 0; + +	return cpumask_test_cpu(cpu, nohz_full_mask); +} + +/* Parse the boot-time nohz CPU list from the kernel parameters. */ +static int __init tick_nohz_full_setup(char *str) +{ +	int cpu; + +	alloc_bootmem_cpumask_var(&nohz_full_mask); +	if (cpulist_parse(str, nohz_full_mask) < 0) { +		pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); +		return 1; +	} + +	cpu = smp_processor_id(); +	if (cpumask_test_cpu(cpu, nohz_full_mask)) { +		pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); +		cpumask_clear_cpu(cpu, nohz_full_mask); +	} +	have_nohz_full_mask = true; + +	return 1; +} +__setup("nohz_full=", tick_nohz_full_setup); + +static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, +						 unsigned long action, +						 void *hcpu) +{ +	unsigned int cpu = (unsigned long)hcpu; + +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_DOWN_PREPARE: +		/* +		 * If we handle the timekeeping duty for full dynticks CPUs, +		 * we can't safely shutdown that CPU. +		 */ +		if (have_nohz_full_mask && tick_do_timer_cpu == cpu) +			return -EINVAL; +		break; +	} +	return NOTIFY_OK; +} + +/* + * Worst case string length in chunks of CPU range seems 2 steps + * separations: 0,2,4,6,... 
+ * This is NR_CPUS + sizeof('\0') + */ +static char __initdata nohz_full_buf[NR_CPUS + 1]; + +static int tick_nohz_init_all(void) +{ +	int err = -1; + +#ifdef CONFIG_NO_HZ_FULL_ALL +	if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { +		pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); +		return err; +	} +	err = 0; +	cpumask_setall(nohz_full_mask); +	cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); +	have_nohz_full_mask = true; +#endif +	return err; +} + +void __init tick_nohz_init(void) +{ +	int cpu; + +	if (!have_nohz_full_mask) { +		if (tick_nohz_init_all() < 0) +			return; +	} + +	cpu_notifier(tick_nohz_cpu_down_callback, 0); + +	/* Make sure full dynticks CPU are also RCU nocbs */ +	for_each_cpu(cpu, nohz_full_mask) { +		if (!rcu_is_nocb_cpu(cpu)) { +			pr_warning("NO_HZ: CPU %d is not RCU nocb: " +				   "cleared from nohz_full range", cpu); +			cpumask_clear_cpu(cpu, nohz_full_mask); +		} +	} + +	cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); +	pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); +} +#else +#define have_nohz_full_mask (0) +#endif +  /*   * NOHZ - aka dynamic tick functionality   */ -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  /*   * NO HZ enabled ?   */ @@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,  			delta_jiffies = rcu_delta_jiffies;  		}  	} +  	/* -	 * Do not stop the tick, if we are only one off -	 * or if the cpu is required for rcu +	 * Do not stop the tick, if we are only one off (or less) +	 * or if the cpu is required for RCU:  	 */ -	if (!ts->tick_stopped && delta_jiffies == 1) +	if (!ts->tick_stopped && delta_jiffies <= 1)  		goto out;  	/* Schedule the tick, if we are at least one jiffie off */ @@ -378,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,  			time_delta = KTIME_MAX;  		} +#ifdef CONFIG_NO_HZ_FULL +		if (!ts->inidle) { +			time_delta = min(time_delta, +					 scheduler_tick_max_deferment()); +		} +#endif +  		/*  		 * calculate the expiry time for the next timer wheel  		 * timer. 
delta_jiffies >= NEXT_TIMER_MAX_DELTA signals @@ -421,6 +650,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,  			ts->last_tick = hrtimer_get_expires(&ts->sched_timer);  			ts->tick_stopped = 1; +			trace_tick_stop(1, " ");  		}  		/* @@ -457,6 +687,24 @@ out:  	return ret;  } +static void tick_nohz_full_stop_tick(struct tick_sched *ts) +{ +#ifdef CONFIG_NO_HZ_FULL +       int cpu = smp_processor_id(); + +       if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) +               return; + +       if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) +	       return; + +       if (!can_stop_full_tick()) +               return; + +       tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); +#endif +} +  static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)  {  	/* @@ -489,6 +737,21 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)  		return false;  	} +	if (have_nohz_full_mask) { +		/* +		 * Keep the tick alive to guarantee timekeeping progression +		 * if there are full dynticks CPUs around +		 */ +		if (tick_do_timer_cpu == cpu) +			return false; +		/* +		 * Boot safety: make sure the timekeeping duty has been +		 * assigned before entering dyntick-idle mode, +		 */ +		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) +			return false; +	} +  	return true;  } @@ -568,12 +831,13 @@ void tick_nohz_irq_exit(void)  {  	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); -	if (!ts->inidle) -		return; - -	/* Cancel the timer because CPU already waken up from the C-states*/ -	menu_hrtimer_cancel(); -	__tick_nohz_idle_enter(ts); +	if (ts->inidle) { +		/* Cancel the timer because CPU already waken up from the C-states*/ +		menu_hrtimer_cancel(); +		__tick_nohz_idle_enter(ts); +	} else { +		tick_nohz_full_stop_tick(ts); +	}  }  /** @@ -802,7 +1066,7 @@ static inline void tick_check_nohz(int cpu)  static inline void tick_nohz_switch_to_nohz(void) { }  static inline void tick_check_nohz(int cpu) { } -#endif /* NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */  /*   * Called from irq_enter to notify about the possible interruption of idle() @@ -887,14 +1151,14 @@ void tick_setup_sched_timer(void)  		now = ktime_get();  	} -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  	if (tick_nohz_enabled)  		ts->nohz_mode = NOHZ_MODE_HIGHRES;  #endif  }  #endif /* HIGH_RES_TIMERS */ -#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS  void tick_cancel_sched_timer(int cpu)  {  	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); diff --git a/kernel/timer.c b/kernel/timer.c index 09bca8ce977..a860bba3441 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -739,7 +739,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,  	cpu = smp_processor_id(); -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)  	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))  		cpu = get_nohz_timer_target();  #endif @@ -931,14 +931,14 @@ void add_timer_on(struct timer_list *timer, int cpu)  	debug_activate(timer, timer->expires);  	internal_add_timer(base, timer);  	/* -	 * Check whether the other CPU is idle and needs to be -	 * triggered to reevaluate the timer wheel when nohz is -	 * active. We are protected against the other CPU fiddling +	 * Check whether the other CPU is in dynticks mode and needs +	 * to be triggered to reevaluate the timer wheel. 
+	 * We are protected against the other CPU fiddling  	 * with the timer by holding the timer base lock. This also -	 * makes sure that a CPU on the way to idle can not evaluate -	 * the timer wheel. +	 * makes sure that a CPU on the way to stop its tick can not +	 * evaluate the timer wheel.  	 */ -	wake_up_idle_cpu(cpu); +	wake_up_nohz_cpu(cpu);  	spin_unlock_irqrestore(&base->lock, flags);  }  EXPORT_SYMBOL_GPL(add_timer_on); @@ -1189,7 +1189,7 @@ static inline void __run_timers(struct tvec_base *base)  	spin_unlock_irq(&base->lock);  } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  /*   * Find out when the next timer event is due to happen. This   * is used on S/390 to stop all activity when a CPU is idle.  |