| author | Olof Johansson <olof@lixom.net> | 2011-12-15 22:02:34 -0800 | 
|---|---|---|
| committer | Olof Johansson <olof@lixom.net> | 2011-12-15 22:02:34 -0800 | 
| commit | 02735a29d8ce882ec698803f064e17888874780c (patch) | |
| tree | 6a4afa3bc8b6d4334df24910a56f77adf126b0c7 /kernel | |
| parent | 8d685b7f4d9c9882442bf1b492558d5f17b694fa (diff) | |
| parent | 3d911ad22e8405c1a333a6812e405cb1a5ae9829 (diff) | |
Merge branch 'at91/defconfig' into next/cleanup
Diffstat (limited to 'kernel')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | kernel/cgroup_freezer.c | 11 |
| -rw-r--r-- | kernel/events/core.c | 91 |
| -rw-r--r-- | kernel/events/internal.h | 3 |
| -rw-r--r-- | kernel/events/ring_buffer.c | 3 |
| -rw-r--r-- | kernel/hrtimer.c | 6 |
| -rw-r--r-- | kernel/irq/manage.c | 7 |
| -rw-r--r-- | kernel/irq/spurious.c | 4 |
| -rw-r--r-- | kernel/jump_label.c | 3 |
| -rw-r--r-- | kernel/lockdep.c | 8 |
| -rw-r--r-- | kernel/power/hibernate.c | 16 |
| -rw-r--r-- | kernel/printk.c | 3 |
| -rw-r--r-- | kernel/sched.c | 17 |
| -rw-r--r-- | kernel/sched_fair.c | 159 |
| -rw-r--r-- | kernel/sched_features.h | 1 |
| -rw-r--r-- | kernel/sched_rt.c | 3 |
| -rw-r--r-- | kernel/time/alarmtimer.c | 2 |
| -rw-r--r-- | kernel/time/clockevents.c | 1 |
| -rw-r--r-- | kernel/time/clocksource.c | 62 |
| -rw-r--r-- | kernel/time/tick-broadcast.c | 2 |
| -rw-r--r-- | kernel/time/timekeeping.c | 92 |
| -rw-r--r-- | kernel/timer.c | 2 |
| -rw-r--r-- | kernel/trace/ftrace.c | 5 |
| -rw-r--r-- | kernel/trace/trace_events.c | 1 |
| -rw-r--r-- | kernel/trace/trace_events_filter.c | 13 |
24 files changed, 438 insertions(+), 77 deletions(-)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 5e828a2ca8e..213c0351dad 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -153,6 +153,13 @@ static void freezer_destroy(struct cgroup_subsys *ss,  	kfree(cgroup_freezer(cgroup));  } +/* task is frozen or will freeze immediately when next it gets woken */ +static bool is_task_frozen_enough(struct task_struct *task) +{ +	return frozen(task) || +		(task_is_stopped_or_traced(task) && freezing(task)); +} +  /*   * The call to cgroup_lock() in the freezer.state write method prevents   * a write to that file racing against an attach, and hence the @@ -231,7 +238,7 @@ static void update_if_frozen(struct cgroup *cgroup,  	cgroup_iter_start(cgroup, &it);  	while ((task = cgroup_iter_next(cgroup, &it))) {  		ntotal++; -		if (frozen(task)) +		if (is_task_frozen_enough(task))  			nfrozen++;  	} @@ -284,7 +291,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)  	while ((task = cgroup_iter_next(cgroup, &it))) {  		if (!freeze_task(task, true))  			continue; -		if (frozen(task)) +		if (is_task_frozen_enough(task))  			continue;  		if (!freezing(task) && !freezer_should_skip(task))  			num_cant_freeze_now++; diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e8457da6f9..d3b9df5962c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -185,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,  static void update_context_time(struct perf_event_context *ctx);  static u64 perf_event_time(struct perf_event *event); +static void ring_buffer_attach(struct perf_event *event, +			       struct ring_buffer *rb); +  void __weak perf_event_print_debug(void)	{ }  extern __weak const char *perf_pmu_name(void) @@ -2171,9 +2174,10 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,  	 */  	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); -	perf_event_sched_in(cpuctx, ctx, task); +	if (ctx->nr_events) +		cpuctx->task_ctx = ctx; -	cpuctx->task_ctx = ctx; +	perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);  	perf_pmu_enable(ctx->pmu);  	perf_ctx_unlock(cpuctx, ctx); @@ -3190,12 +3194,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)  	struct ring_buffer *rb;  	unsigned int events = POLL_HUP; +	/* +	 * Race between perf_event_set_output() and perf_poll(): perf_poll() +	 * grabs the rb reference but perf_event_set_output() overrides it. 
+	 * Here is the timeline for two threads T1, T2: +	 * t0: T1, rb = rcu_dereference(event->rb) +	 * t1: T2, old_rb = event->rb +	 * t2: T2, event->rb = new rb +	 * t3: T2, ring_buffer_detach(old_rb) +	 * t4: T1, ring_buffer_attach(rb1) +	 * t5: T1, poll_wait(event->waitq) +	 * +	 * To avoid this problem, we grab mmap_mutex in perf_poll() +	 * thereby ensuring that the assignment of the new ring buffer +	 * and the detachment of the old buffer appear atomic to perf_poll() +	 */ +	mutex_lock(&event->mmap_mutex); +  	rcu_read_lock();  	rb = rcu_dereference(event->rb); -	if (rb) +	if (rb) { +		ring_buffer_attach(event, rb);  		events = atomic_xchg(&rb->poll, 0); +	}  	rcu_read_unlock(); +	mutex_unlock(&event->mmap_mutex); +  	poll_wait(file, &event->waitq, wait);  	return events; @@ -3496,6 +3521,49 @@ unlock:  	return ret;  } +static void ring_buffer_attach(struct perf_event *event, +			       struct ring_buffer *rb) +{ +	unsigned long flags; + +	if (!list_empty(&event->rb_entry)) +		return; + +	spin_lock_irqsave(&rb->event_lock, flags); +	if (!list_empty(&event->rb_entry)) +		goto unlock; + +	list_add(&event->rb_entry, &rb->event_list); +unlock: +	spin_unlock_irqrestore(&rb->event_lock, flags); +} + +static void ring_buffer_detach(struct perf_event *event, +			       struct ring_buffer *rb) +{ +	unsigned long flags; + +	if (list_empty(&event->rb_entry)) +		return; + +	spin_lock_irqsave(&rb->event_lock, flags); +	list_del_init(&event->rb_entry); +	wake_up_all(&event->waitq); +	spin_unlock_irqrestore(&rb->event_lock, flags); +} + +static void ring_buffer_wakeup(struct perf_event *event) +{ +	struct ring_buffer *rb; + +	rcu_read_lock(); +	rb = rcu_dereference(event->rb); +	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { +		wake_up_all(&event->waitq); +	} +	rcu_read_unlock(); +} +  static void rb_free_rcu(struct rcu_head *rcu_head)  {  	struct ring_buffer *rb; @@ -3521,9 +3589,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)  static void ring_buffer_put(struct ring_buffer *rb)  { +	struct perf_event *event, *n; +	unsigned long flags; +  	if (!atomic_dec_and_test(&rb->refcount))  		return; +	spin_lock_irqsave(&rb->event_lock, flags); +	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { +		list_del_init(&event->rb_entry); +		wake_up_all(&event->waitq); +	} +	spin_unlock_irqrestore(&rb->event_lock, flags); +  	call_rcu(&rb->rcu_head, rb_free_rcu);  } @@ -3546,6 +3624,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)  		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);  		vma->vm_mm->pinned_vm -= event->mmap_locked;  		rcu_assign_pointer(event->rb, NULL); +		ring_buffer_detach(event, rb);  		mutex_unlock(&event->mmap_mutex);  		ring_buffer_put(rb); @@ -3700,7 +3779,7 @@ static const struct file_operations perf_fops = {  void perf_event_wakeup(struct perf_event *event)  { -	wake_up_all(&event->waitq); +	ring_buffer_wakeup(event);  	if (event->pending_kill) {  		kill_fasync(&event->fasync, SIGIO, event->pending_kill); @@ -5822,6 +5901,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  	INIT_LIST_HEAD(&event->group_entry);  	INIT_LIST_HEAD(&event->event_entry);  	INIT_LIST_HEAD(&event->sibling_list); +	INIT_LIST_HEAD(&event->rb_entry); +  	init_waitqueue_head(&event->waitq);  	init_irq_work(&event->pending, perf_pending_event); @@ -6028,6 +6109,8 @@ set:  	old_rb = event->rb;  	rcu_assign_pointer(event->rb, rb); +	if (old_rb) +		ring_buffer_detach(event, old_rb);  	ret = 0;  unlock:  	mutex_unlock(&event->mmap_mutex); 
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09097dd8116..64568a69937 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -22,6 +22,9 @@ struct ring_buffer {  	local_t				lost;		/* nr records lost   */  	long				watermark;	/* wakeup watermark  */ +	/* poll crap */ +	spinlock_t			event_lock; +	struct list_head		event_list;  	struct perf_event_mmap_page	*user_page;  	void				*data_pages[0]; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index a2a29205cc0..7f3011c6b57 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)  		rb->writable = 1;  	atomic_set(&rb->refcount, 1); + +	INIT_LIST_HEAD(&rb->event_list); +	spin_lock_init(&rb->event_lock);  }  #ifndef CONFIG_PERF_USE_VMALLOC diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 422e567eecf..ae34bf51682 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -885,10 +885,13 @@ static void __remove_hrtimer(struct hrtimer *timer,  			     struct hrtimer_clock_base *base,  			     unsigned long newstate, int reprogram)  { +	struct timerqueue_node *next_timer;  	if (!(timer->state & HRTIMER_STATE_ENQUEUED))  		goto out; -	if (&timer->node == timerqueue_getnext(&base->active)) { +	next_timer = timerqueue_getnext(&base->active); +	timerqueue_del(&base->active, &timer->node); +	if (&timer->node == next_timer) {  #ifdef CONFIG_HIGH_RES_TIMERS  		/* Reprogram the clock event device. if enabled */  		if (reprogram && hrtimer_hres_active()) { @@ -901,7 +904,6 @@ static void __remove_hrtimer(struct hrtimer *timer,  		}  #endif  	} -	timerqueue_del(&base->active, &timer->node);  	if (!timerqueue_getnext(&base->active))  		base->cpu_base->active_bases &= ~(1 << base->index);  out: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 67ce837ae52..1da999f5e74 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -623,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)  static int irq_wait_for_interrupt(struct irqaction *action)  { +	set_current_state(TASK_INTERRUPTIBLE); +  	while (!kthread_should_stop()) { -		set_current_state(TASK_INTERRUPTIBLE);  		if (test_and_clear_bit(IRQTF_RUNTHREAD,  				       &action->thread_flags)) { @@ -632,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action)  			return 0;  		}  		schedule(); +		set_current_state(TASK_INTERRUPTIBLE);  	} +	__set_current_state(TASK_RUNNING);  	return -1;  } @@ -1596,7 +1599,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,  		return -ENOMEM;  	action->handler = handler; -	action->flags = IRQF_PERCPU; +	action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;  	action->name = devname;  	action->percpu_dev_id = dev_id; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index b5f4742693c..dc813a948be 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)  	 */  	action = desc->action;  	if (!action || !(action->flags & IRQF_SHARED) || -	    (action->flags & __IRQF_TIMER) || !action->next) +	    (action->flags & __IRQF_TIMER) || +	    (action->handler(irq, action->dev_id) == IRQ_HANDLED) || +	    !action->next)  		goto out;  	/* Already running on another processor */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index bbdfe2a462a..66ff7109f69 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -66,8 +66,9 @@ void 
jump_label_inc(struct jump_label_key *key)  		return;  	jump_label_lock(); -	if (atomic_add_return(1, &key->enabled) == 1) +	if (atomic_read(&key->enabled) == 0)  		jump_label_update(key, JUMP_LABEL_ENABLE); +	atomic_inc(&key->enabled);  	jump_label_unlock();  } diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e69434b070d..b2e08c932d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -44,6 +44,7 @@  #include <linux/stringify.h>  #include <linux/bitops.h>  #include <linux/gfp.h> +#include <linux/kmemcheck.h>  #include <asm/sections.h> @@ -2948,7 +2949,12 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,  void lockdep_init_map(struct lockdep_map *lock, const char *name,  		      struct lock_class_key *key, int subclass)  { -	memset(lock, 0, sizeof(*lock)); +	int i; + +	kmemcheck_mark_initialized(lock, sizeof(*lock)); + +	for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) +		lock->class_cache[i] = NULL;  #ifdef CONFIG_LOCK_STAT  	lock->cpu = raw_smp_processor_id(); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 196c01268eb..a6b0503574e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -347,7 +347,7 @@ int hibernation_snapshot(int platform_mode)  	error = freeze_kernel_threads();  	if (error) -		goto Close; +		goto Cleanup;  	if (hibernation_test(TEST_FREEZER) ||  		hibernation_testmode(HIBERNATION_TESTPROC)) { @@ -357,12 +357,14 @@ int hibernation_snapshot(int platform_mode)  		 * successful freezer test.  		 */  		freezer_test_done = true; -		goto Close; +		goto Cleanup;  	}  	error = dpm_prepare(PMSG_FREEZE); -	if (error) -		goto Complete_devices; +	if (error) { +		dpm_complete(msg); +		goto Cleanup; +	}  	suspend_console();  	pm_restrict_gfp_mask(); @@ -391,8 +393,6 @@ int hibernation_snapshot(int platform_mode)  		pm_restore_gfp_mask();  	resume_console(); - - Complete_devices:  	dpm_complete(msg);   Close: @@ -402,6 +402,10 @@ int hibernation_snapshot(int platform_mode)   Recover_platform:  	platform_recover(platform_mode);  	goto Resume_devices; + + Cleanup: +	swsusp_free(); +	goto Close;  }  /** diff --git a/kernel/printk.c b/kernel/printk.c index 1455a0d4eed..7982a0a841e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1293,10 +1293,11 @@ again:  	raw_spin_lock(&logbuf_lock);  	if (con_start != log_end)  		retry = 1; +	raw_spin_unlock_irqrestore(&logbuf_lock, flags); +  	if (retry && console_trylock())  		goto again; -	raw_spin_unlock_irqrestore(&logbuf_lock, flags);  	if (wake_klogd)  		wake_up_klogd();  } diff --git a/kernel/sched.c b/kernel/sched.c index 0e9344a71be..d6b149ccf92 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@  #include <linux/ctype.h>  #include <linux/ftrace.h>  #include <linux/slab.h> +#include <linux/init_task.h>  #include <asm/tlb.h>  #include <asm/irq_regs.h> @@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion);   * This waits for either a completion of a specific task to be signaled or for a   * specified timeout to expire. The timeout is in jiffies. It is not   * interruptible. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed.   */  unsigned long __sched  wait_for_completion_timeout(struct completion *x, unsigned long timeout) @@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);   *   * This waits for completion of a specific task to be signaled. It is   * interruptible. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed.   
*/  int __sched wait_for_completion_interruptible(struct completion *x)  { @@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);   *   * This waits for either a completion of a specific task to be signaled or for a   * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed.   */  long __sched  wait_for_completion_interruptible_timeout(struct completion *x, @@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);   *   * This waits to be signaled for completion of a specific task. It can be   * interrupted by a kill signal. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed.   */  int __sched wait_for_completion_killable(struct completion *x)  { @@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);   * This waits for either a completion of a specific task to be   * signaled or for a specified timeout to expire. It can be   * interrupted by a kill signal. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed.   */  long __sched  wait_for_completion_killable_timeout(struct completion *x, @@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)  	 */  	idle->sched_class = &idle_sched_class;  	ftrace_graph_init_idle_task(idle, cpu); +#if defined(CONFIG_SMP) +	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); +#endif  }  /* diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5c9e67923b7..a78ed2736ba 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)  		list_del_leaf_cfs_rq(cfs_rq);  } +static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +{ +	long tg_weight; + +	/* +	 * Use this CPU's actual weight instead of the last load_contribution +	 * to gain a more accurate current total weight. See +	 * update_cfs_rq_load_contribution(). +	 */ +	tg_weight = atomic_read(&tg->load_weight); +	tg_weight -= cfs_rq->load_contribution; +	tg_weight += cfs_rq->load.weight; + +	return tg_weight; +} +  static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)  { -	long load_weight, load, shares; +	long tg_weight, load, shares; +	tg_weight = calc_tg_weight(tg, cfs_rq);  	load = cfs_rq->load.weight; -	load_weight = atomic_read(&tg->load_weight); -	load_weight += load; -	load_weight -= cfs_rq->load_contribution; -  	shares = (tg->shares * load); -	if (load_weight) -		shares /= load_weight; +	if (tg_weight) +		shares /= tg_weight;  	if (shares < MIN_SHARES)  		shares = MIN_SHARES; @@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)  { -	if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) +	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)  		return;  	__return_cfs_rq_runtime(cfs_rq); @@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p)   * Adding load to a group doesn't make a group heavier, but can cause movement   * of group shares between cpus. Assuming the shares were perfectly aligned one   * can calculate the shift in shares. 
+ * + * Calculate the effective load difference if @wl is added (subtracted) to @tg + * on this @cpu and results in a total addition (subtraction) of @wg to the + * total group weight. + * + * Given a runqueue weight distribution (rw_i) we can compute a shares + * distribution (s_i) using: + * + *   s_i = rw_i / \Sum rw_j						(1) + * + * Suppose we have 4 CPUs and our @tg is a direct child of the root group and + * has 7 equal weight tasks, distributed as below (rw_i), with the resulting + * shares distribution (s_i): + * + *   rw_i = {   2,   4,   1,   0 } + *   s_i  = { 2/7, 4/7, 1/7,   0 } + * + * As per wake_affine() we're interested in the load of two CPUs (the CPU the + * task used to run on and the CPU the waker is running on), we need to + * compute the effect of waking a task on either CPU and, in case of a sync + * wakeup, compute the effect of the current task going to sleep. + * + * So for a change of @wl to the local @cpu with an overall group weight change + * of @wl we can compute the new shares distribution (s'_i) using: + * + *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2) + * + * Suppose we're interested in CPUs 0 and 1, and want to compute the load + * differences in waking a task to CPU 0. The additional task changes the + * weight and shares distributions like: + * + *   rw'_i = {   3,   4,   1,   0 } + *   s'_i  = { 3/8, 4/8, 1/8,   0 } + * + * We can then compute the difference in effective weight by using: + * + *   dw_i = S * (s'_i - s_i)						(3) + * + * Where 'S' is the group weight as seen by its parent. + * + * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) + * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - + * 4/7) times the weight of the group.   */  static long effective_load(struct task_group *tg, int cpu, long wl, long wg)  {  	struct sched_entity *se = tg->se[cpu]; -	if (!tg->parent) +	if (!tg->parent)	/* the trivial, non-cgroup case */  		return wl;  	for_each_sched_entity(se) { -		long lw, w; +		long w, W;  		tg = se->my_q->tg; -		w = se->my_q->load.weight; -		/* use this cpu's instantaneous contribution */ -		lw = atomic_read(&tg->load_weight); -		lw -= se->my_q->load_contribution; -		lw += w + wg; +		/* +		 * W = @wg + \Sum rw_j +		 */ +		W = wg + calc_tg_weight(tg, se->my_q); -		wl += w; +		/* +		 * w = rw_i + @wl +		 */ +		w = se->my_q->load.weight + wl; -		if (lw > 0 && wl < lw) -			wl = (wl * tg->shares) / lw; +		/* +		 * wl = S * s'_i; see (2) +		 */ +		if (W > 0 && w < W) +			wl = (w * tg->shares) / W;  		else  			wl = tg->shares; -		/* zero point is MIN_SHARES */ +		/* +		 * Per the above, wl is the new se->load.weight value; since +		 * those are clipped to [MIN_SHARES, ...) do so now. See +		 * calc_cfs_shares(). +		 */  		if (wl < MIN_SHARES)  			wl = MIN_SHARES; + +		/* +		 * wl = dw_i = S * (s'_i - s_i); see (3) +		 */  		wl -= se->load.weight; + +		/* +		 * Recursively apply this logic to all parent groups to compute +		 * the final effective load change on the root group. Since +		 * only the @tg group gets extra weight, all parent groups can +		 * only redistribute existing shares. @wl is the shift in shares +		 * resulting from this level per the above. 
+		 */  		wg = 0;  	} @@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)  	int cpu = smp_processor_id();  	int prev_cpu = task_cpu(p);  	struct sched_domain *sd; -	int i; +	struct sched_group *sg; +	int i, smt = 0;  	/*  	 * If the task is going to be woken-up on this cpu and if it is @@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)  	 * Otherwise, iterate the domains and find an elegible idle cpu.  	 */  	rcu_read_lock(); +again:  	for_each_domain(target, sd) { -		if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) -			break; +		if (!smt && (sd->flags & SD_SHARE_CPUPOWER)) +			continue; -		for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) { -			if (idle_cpu(i)) { -				target = i; -				break; +		if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) { +			if (!smt) { +				smt = 1; +				goto again;  			} +			break;  		} -		/* -		 * Lets stop looking for an idle sibling when we reached -		 * the domain that spans the current cpu and prev_cpu. -		 */ -		if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && -		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) -			break; +		sg = sd->groups; +		do { +			if (!cpumask_intersects(sched_group_cpus(sg), +						tsk_cpus_allowed(p))) +				goto next; + +			for_each_cpu(i, sched_group_cpus(sg)) { +				if (!idle_cpu(i)) +					goto next; +			} + +			target = cpumask_first_and(sched_group_cpus(sg), +					tsk_cpus_allowed(p)); +			goto done; +next: +			sg = sg->next; +		} while (sg != sd->groups);  	} +done:  	rcu_read_unlock();  	return target; @@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,  }  /** - * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * update_sd_lb_stats - Update sched_domain's statistics for load balancing.   * @sd: sched_domain whose statistics are to be updated.   * @this_cpu: Cpu for which load balance is currently performed.   * @idle: Idle status of this_cpu diff --git a/kernel/sched_features.h b/kernel/sched_features.h index efa0a7b75dd..84802245abd 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1)  SCHED_FEAT(TTWU_QUEUE, 1)  SCHED_FEAT(FORCE_SD_OVERLAP, 0) +SCHED_FEAT(RT_RUNTIME_SHARE, 1) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 056cbd2e2a2..583a1368afe 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq)  {  	int more = 0; +	if (!sched_feat(RT_RUNTIME_SHARE)) +		return more; +  	if (rt_rq->rt_time > rt_rq->rt_runtime) {  		raw_spin_unlock(&rt_rq->rt_runtime_lock);  		more = do_balance_runtime(rt_rq); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c436e790b21..8a46f5d6450 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -195,7 +195,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)  		struct alarm *alarm;  		ktime_t expired = next->expires; -		if (expired.tv64 >= now.tv64) +		if (expired.tv64 > now.tv64)  			break;  		alarm = container_of(next, struct alarm, node); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1ecd6ba36d6..c4eb71c8b2e 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -387,6 +387,7 @@ void clockevents_exchange_device(struct clock_event_device *old,  	 * released list and do a notify add later.  	 
*/  	if (old) { +		old->event_handler = clockevents_handle_noop;  		clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);  		list_del(&old->list);  		list_add(&old->list, &clockevents_released); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cf52fda2e09..da2f760e780 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -492,6 +492,22 @@ void clocksource_touch_watchdog(void)  }  /** + * clocksource_max_adjustment- Returns max adjustment amount + * @cs:         Pointer to clocksource + * + */ +static u32 clocksource_max_adjustment(struct clocksource *cs) +{ +	u64 ret; +	/* +	 * We won't try to correct for more then 11% adjustments (110,000 ppm), +	 */ +	ret = (u64)cs->mult * 11; +	do_div(ret,100); +	return (u32)ret; +} + +/**   * clocksource_max_deferment - Returns max time the clocksource can be deferred   * @cs:         Pointer to clocksource   * @@ -503,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs)  	/*  	 * Calculate the maximum number of cycles that we can pass to the  	 * cyc2ns function without overflowing a 64-bit signed result. The -	 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which -	 * is equivalent to the below. -	 * max_cycles < (2^63)/cs->mult -	 * max_cycles < 2^(log2((2^63)/cs->mult)) -	 * max_cycles < 2^(log2(2^63) - log2(cs->mult)) -	 * max_cycles < 2^(63 - log2(cs->mult)) -	 * max_cycles < 1 << (63 - log2(cs->mult)) +	 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) +	 * which is equivalent to the below. +	 * max_cycles < (2^63)/(cs->mult + cs->maxadj) +	 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) +	 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) +	 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) +	 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))  	 * Please note that we add 1 to the result of the log2 to account for  	 * any rounding errors, ensure the above inequality is satisfied and  	 * no overflow will occur.  	 */ -	max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); +	max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));  	/*  	 * The actual maximum number of cycles we can defer the clocksource is  	 * determined by the minimum of max_cycles and cs->mask. +	 * Note: Here we subtract the maxadj to make sure we don't sleep for +	 * too long if there's a large negative adjustment.  	 */  	max_cycles = min_t(u64, max_cycles, (u64) cs->mask); -	max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); +	max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, +					cs->shift);  	/*  	 * To ensure that the clocksource does not wrap whilst we are idle, @@ -529,7 +548,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)  	 * note a margin of 12.5% is used because this can be computed with  	 * a shift, versus say 10% which would require division.  	 */ -	return max_nsecs - (max_nsecs >> 5); +	return max_nsecs - (max_nsecs >> 3);  }  #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET @@ -640,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs)  void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)  {  	u64 sec; -  	/*  	 * Calc the maximum number of seconds which we can run before  	 * wrapping around. For clocksources which have a mask > 32bit @@ -651,7 +669,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)  	 * ~ 0.06ppm granularity for NTP. 
We apply the same 12.5%  	 * margin as we do in clocksource_max_deferment()  	 */ -	sec = (cs->mask - (cs->mask >> 5)); +	sec = (cs->mask - (cs->mask >> 3));  	do_div(sec, freq);  	do_div(sec, scale);  	if (!sec) @@ -661,6 +679,20 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)  	clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,  			       NSEC_PER_SEC / scale, sec * scale); + +	/* +	 * for clocksources that have large mults, to avoid overflow. +	 * Since mult may be adjusted by ntp, add an safety extra margin +	 * +	 */ +	cs->maxadj = clocksource_max_adjustment(cs); +	while ((cs->mult + cs->maxadj < cs->mult) +		|| (cs->mult - cs->maxadj > cs->mult)) { +		cs->mult >>= 1; +		cs->shift--; +		cs->maxadj = clocksource_max_adjustment(cs); +	} +  	cs->max_idle_ns = clocksource_max_deferment(cs);  }  EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); @@ -701,6 +733,12 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale);   */  int clocksource_register(struct clocksource *cs)  { +	/* calculate max adjustment for given mult/shift */ +	cs->maxadj = clocksource_max_adjustment(cs); +	WARN_ONCE(cs->mult + cs->maxadj < cs->mult, +		"Clocksource %s might overflow on 11%% adjustment\n", +		cs->name); +  	/* calculate max idle time permitted for this clocksource */  	cs->max_idle_ns = clocksource_max_deferment(cs); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f954282d9a8..fd4a7b1625a 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)  	     (dev->features & CLOCK_EVT_FEAT_C3STOP))  		return 0; -	clockevents_exchange_device(NULL, dev); +	clockevents_exchange_device(tick_broadcast_device.evtdev, dev);  	tick_broadcast_device.evtdev = dev;  	if (!cpumask_empty(tick_get_broadcast_mask()))  		tick_broadcast_start_periodic(dev); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b021b0e850..237841378c0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -249,6 +249,8 @@ ktime_t ktime_get(void)  		secs = xtime.tv_sec + wall_to_monotonic.tv_sec;  		nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;  		nsecs += timekeeping_get_ns(); +		/* If arch requires, add in gettimeoffset() */ +		nsecs += arch_gettimeoffset();  	} while (read_seqretry(&xtime_lock, seq));  	/* @@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts)  		*ts = xtime;  		tomono = wall_to_monotonic;  		nsecs = timekeeping_get_ns(); +		/* If arch requires, add in gettimeoffset() */ +		nsecs += arch_gettimeoffset();  	} while (read_seqretry(&xtime_lock, seq)); @@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset)  	s64 error, interval = timekeeper.cycle_interval;  	int adj; +	/* +	 * The point of this is to check if the error is greater then half +	 * an interval. +	 * +	 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. +	 * +	 * Note we subtract one in the shift, so that error is really error*2. +	 * This "saves" dividing(shifting) intererval twice, but keeps the +	 * (error > interval) comparision as still measuring if error is +	 * larger then half an interval. +	 * +	 * Note: It does not "save" on aggrivation when reading the code. +	 */  	error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);  	if (error > interval) { +		/* +		 * We now divide error by 4(via shift), which checks if +		 * the error is greater then twice the interval. 
+		 * If it is greater, we need a bigadjust, if its smaller, +		 * we can adjust by 1. +		 */  		error >>= 2; +		/* +		 * XXX - In update_wall_time, we round up to the next +		 * nanosecond, and store the amount rounded up into +		 * the error. This causes the likely below to be unlikely. +		 * +		 * The properfix is to avoid rounding up by using +		 * the high precision timekeeper.xtime_nsec instead of +		 * xtime.tv_nsec everywhere. Fixing this will take some +		 * time. +		 */  		if (likely(error <= interval))  			adj = 1;  		else  			adj = timekeeping_bigadjust(error, &interval, &offset);  	} else if (error < -interval) { +		/* See comment above, this is just switched for the negative */  		error >>= 2;  		if (likely(error >= -interval)) {  			adj = -1; @@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset)  			offset = -offset;  		} else  			adj = timekeeping_bigadjust(error, &interval, &offset); -	} else +	} else /* No adjustment needed */  		return; +	WARN_ONCE(timekeeper.clock->maxadj && +			(timekeeper.mult + adj > timekeeper.clock->mult + +						timekeeper.clock->maxadj), +			"Adjusting %s more then 11%% (%ld vs %ld)\n", +			timekeeper.clock->name, (long)timekeeper.mult + adj, +			(long)timekeeper.clock->mult + +				timekeeper.clock->maxadj); +	/* +	 * So the following can be confusing. +	 * +	 * To keep things simple, lets assume adj == 1 for now. +	 * +	 * When adj != 1, remember that the interval and offset values +	 * have been appropriately scaled so the math is the same. +	 * +	 * The basic idea here is that we're increasing the multiplier +	 * by one, this causes the xtime_interval to be incremented by +	 * one cycle_interval. This is because: +	 *	xtime_interval = cycle_interval * mult +	 * So if mult is being incremented by one: +	 *	xtime_interval = cycle_interval * (mult + 1) +	 * Its the same as: +	 *	xtime_interval = (cycle_interval * mult) + cycle_interval +	 * Which can be shortened to: +	 *	xtime_interval += cycle_interval +	 * +	 * So offset stores the non-accumulated cycles. Thus the current +	 * time (in shifted nanoseconds) is: +	 *	now = (offset * adj) + xtime_nsec +	 * Now, even though we're adjusting the clock frequency, we have +	 * to keep time consistent. In other words, we can't jump back +	 * in time, and we also want to avoid jumping forward in time. +	 * +	 * So given the same offset value, we need the time to be the same +	 * both before and after the freq adjustment. +	 *	now = (offset * adj_1) + xtime_nsec_1 +	 *	now = (offset * adj_2) + xtime_nsec_2 +	 * So: +	 *	(offset * adj_1) + xtime_nsec_1 = +	 *		(offset * adj_2) + xtime_nsec_2 +	 * And we know: +	 *	adj_2 = adj_1 + 1 +	 * So: +	 *	(offset * adj_1) + xtime_nsec_1 = +	 *		(offset * (adj_1+1)) + xtime_nsec_2 +	 *	(offset * adj_1) + xtime_nsec_1 = +	 *		(offset * adj_1) + offset + xtime_nsec_2 +	 * Canceling the sides: +	 *	xtime_nsec_1 = offset + xtime_nsec_2 +	 * Which gives us: +	 *	xtime_nsec_2 = xtime_nsec_1 - offset +	 * Which simplfies to: +	 *	xtime_nsec -= offset +	 * +	 * XXX - TODO: Doc ntp_error calculation. 
+	 */  	timekeeper.mult += adj;  	timekeeper.xtime_interval += interval;  	timekeeper.xtime_nsec -= offset; diff --git a/kernel/timer.c b/kernel/timer.c index dbaa62422b1..9c3c62b0c4b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1368,7 +1368,7 @@ SYSCALL_DEFINE0(getppid)  	int pid;  	rcu_read_lock(); -	pid = task_tgid_vnr(current->real_parent); +	pid = task_tgid_vnr(rcu_dereference(current->real_parent));  	rcu_read_unlock();  	return pid; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 900b409543d..b1e8943fed1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -152,7 +152,6 @@ void clear_ftrace_function(void)  	ftrace_pid_function = ftrace_stub;  } -#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST  #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST  /*   * For those archs that do not test ftrace_trace_stop in their @@ -1212,7 +1211,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,  	if (!src->count) {  		free_ftrace_hash_rcu(*dst);  		rcu_assign_pointer(*dst, EMPTY_HASH); -		return 0; +		/* still need to update the function records */ +		ret = 0; +		goto out;  	}  	/* diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 581876f9f38..c212a7f934e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events)  	/* First see if we did not already create this dir */  	list_for_each_entry(system, &event_subsystems, list) {  		if (strcmp(system->name, name) == 0) { -			__get_system(system);  			system->nr_events++;  			return system->entry;  		} diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 816d3d07497..95dc31efd6d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1649,7 +1649,9 @@ static int replace_system_preds(struct event_subsystem *system,  		 */  		err = replace_preds(call, NULL, ps, filter_string, true);  		if (err) -			goto fail; +			call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; +		else +			call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;  	}  	list_for_each_entry(call, &ftrace_events, list) { @@ -1658,6 +1660,9 @@ static int replace_system_preds(struct event_subsystem *system,  		if (strcmp(call->class->system, system->name) != 0)  			continue; +		if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) +			continue; +  		filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);  		if (!filter_item)  			goto fail_mem; @@ -1686,7 +1691,7 @@ static int replace_system_preds(struct event_subsystem *system,  		 * replace the filter for the call.  		 */  		filter = call->filter; -		call->filter = filter_item->filter; +		rcu_assign_pointer(call->filter, filter_item->filter);  		filter_item->filter = filter;  		fail = false; @@ -1741,7 +1746,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)  		filter = call->filter;  		if (!filter)  			goto out_unlock; -		call->filter = NULL; +		RCU_INIT_POINTER(call->filter, NULL);  		/* Make sure the filter is not being used */  		synchronize_sched();  		__free_filter(filter); @@ -1782,7 +1787,7 @@ out:  	 * string  	 */  	tmp = call->filter; -	call->filter = filter; +	rcu_assign_pointer(call->filter, filter);  	if (tmp) {  		/* Make sure the call is done with the filter */  		synchronize_sched();  |
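For readers who want to sanity-check the shares math spelled out in the effective_load() comment above (the rw_i = { 2, 4, 1, 0 } example), the following stand-alone user-space sketch re-derives the per-CPU load deltas. It is not part of the patch and is not kernel code; the array values and the choice of wl = wg = 1 simply mirror the worked example in the comment, and the group weight S is left symbolic.

```c
/*
 * Stand-alone sketch (not kernel code) re-checking the worked example in
 * the effective_load() comment: 4 CPUs, runqueue weights rw_i = {2, 4, 1, 0},
 * and one task of weight 1 (wl = wg = 1) waking on CPU 0. The group weight
 * "S" seen by the parent is left symbolic; results print as fractions of S.
 */
#include <stdio.h>

int main(void)
{
	const int rw[4] = { 2, 4, 1, 0 };	/* per-CPU runqueue weights */
	const int wl = 1, wg = 1;		/* weight added on CPU 0 / to the group */
	int sum = 0;

	for (int i = 0; i < 4; i++)
		sum += rw[i];			/* \Sum rw_j = 7 */

	for (int i = 0; i < 2; i++) {		/* CPUs 0 and 1, as in the comment */
		double s_old = (double)rw[i] / sum;				/* (1) */
		double s_new = (double)(rw[i] + (i == 0 ? wl : 0)) / (sum + wg);/* (2) */

		/* dw_i = S * (s'_i - s_i); see (3) */
		printf("cpu%d: dw = %+.4f * S\n", i, s_new - s_old);
	}
	return 0;	/* expected: +0.0893 (= 5/56) for cpu0, -0.0714 (= -4/56) for cpu1 */
}
```

This only restates the arithmetic from the comment; the kernel's actual implementation performs the equivalent computation with integer weights and the MIN_SHARES clipping shown in the hunk.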