Diffstat (limited to 'kernel')
55 files changed, 1169 insertions, 614 deletions
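The cgroup.c changes below cache one sorted pid array per (cgroup, pid namespace) pair instead of a single array per cgroup: each open of the "tasks" file looks its namespace up in cgrp->pids_list, allocates an entry on a miss, and reference-counts it via use_count. A minimal userspace sketch of that lookup-or-allocate-and-refcount pattern follows; the names (pids_cache, ns_id, cache_get/cache_put) are made up for illustration and the kernel's locking and pid-array refresh are omitted.

#include <stdio.h>
#include <stdlib.h>

struct pids_cache {
	struct pids_cache *next;	/* list linkage, stands in for list_head */
	int ns_id;			/* stands in for struct pid_namespace * */
	int *pids;			/* cached, sorted pid array */
	int length;
	int use_count;			/* how many open files share this array */
};

/* Find the cache entry for ns_id, allocating one on a miss. */
static struct pids_cache *cache_get(struct pids_cache **head, int ns_id)
{
	struct pids_cache *cp;

	for (cp = *head; cp; cp = cp->next)
		if (cp->ns_id == ns_id)
			goto found;

	cp = calloc(1, sizeof(*cp));
	if (!cp)
		return NULL;
	cp->ns_id = ns_id;
	cp->next = *head;
	*head = cp;
found:
	cp->use_count++;
	return cp;
}

/* Drop a reference; free the entry when the last opener goes away. */
static void cache_put(struct pids_cache **head, struct pids_cache *cp)
{
	struct pids_cache **pp;

	if (--cp->use_count)
		return;
	for (pp = head; *pp; pp = &(*pp)->next) {
		if (*pp == cp) {
			*pp = cp->next;
			break;
		}
	}
	free(cp->pids);
	free(cp);
}

int main(void)
{
	struct pids_cache *head = NULL;
	struct pids_cache *a = cache_get(&head, 1);	/* first opener in ns 1 */
	struct pids_cache *b = cache_get(&head, 1);	/* second opener shares the entry */

	printf("shared=%d use_count=%d\n", a == b, a->use_count);
	cache_put(&head, b);
	cache_put(&head, a);
	return 0;
}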
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3737a682cdf..b6eadfe30e7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -47,6 +47,7 @@  #include <linux/hash.h>  #include <linux/namei.h>  #include <linux/smp_lock.h> +#include <linux/pid_namespace.h>  #include <asm/atomic.h> @@ -734,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)   * reference to css->refcnt. In general, this refcnt is expected to goes down   * to zero, soon.   * - * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;   */  DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); -static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)  { -	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) +	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))  		wake_up_all(&cgroup_rmdir_waitq);  } +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) +{ +	css_get(css); +} + +void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) +{ +	cgroup_wakeup_rmdir_waiter(css->cgroup); +	css_put(css); +} + +  static int rebind_subsystems(struct cgroupfs_root *root,  			      unsigned long final_bits)  { @@ -960,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)  	INIT_LIST_HEAD(&cgrp->children);  	INIT_LIST_HEAD(&cgrp->css_sets);  	INIT_LIST_HEAD(&cgrp->release_list); +	INIT_LIST_HEAD(&cgrp->pids_list);  	init_rwsem(&cgrp->pids_mutex);  }  static void init_cgroup_root(struct cgroupfs_root *root) @@ -1357,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  	 * wake up rmdir() waiter. the rmdir should fail since the cgroup  	 * is no longer empty.  	 */ -	cgroup_wakeup_rmdir_waiters(cgrp); +	cgroup_wakeup_rmdir_waiter(cgrp);  	return 0;  } @@ -2201,12 +2215,30 @@ err:  	return ret;  } +/* + * Cache pids for all threads in the same pid namespace that are + * opening the same "tasks" file. + */ +struct cgroup_pids { +	/* The node in cgrp->pids_list */ +	struct list_head list; +	/* The cgroup those pids belong to */ +	struct cgroup *cgrp; +	/* The namepsace those pids belong to */ +	struct pid_namespace *ns; +	/* Array of process ids in the cgroup */ +	pid_t *tasks_pids; +	/* How many files are using the this tasks_pids array */ +	int use_count; +	/* Length of the current tasks_pids array */ +	int length; +}; +  static int cmppid(const void *a, const void *b)  {  	return *(pid_t *)a - *(pid_t *)b;  } -  /*   * seq_file methods for the "tasks" file. The seq_file position is the   * next pid to display; the seq_file iterator is a pointer to the pid @@ -2221,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)  	 * after a seek to the start). 
Use a binary-search to find the  	 * next pid to display, if any  	 */ -	struct cgroup *cgrp = s->private; +	struct cgroup_pids *cp = s->private; +	struct cgroup *cgrp = cp->cgrp;  	int index = 0, pid = *pos;  	int *iter;  	down_read(&cgrp->pids_mutex);  	if (pid) { -		int end = cgrp->pids_length; +		int end = cp->length;  		while (index < end) {  			int mid = (index + end) / 2; -			if (cgrp->tasks_pids[mid] == pid) { +			if (cp->tasks_pids[mid] == pid) {  				index = mid;  				break; -			} else if (cgrp->tasks_pids[mid] <= pid) +			} else if (cp->tasks_pids[mid] <= pid)  				index = mid + 1;  			else  				end = mid;  		}  	}  	/* If we're off the end of the array, we're done */ -	if (index >= cgrp->pids_length) +	if (index >= cp->length)  		return NULL;  	/* Update the abstract position to be the actual pid that we found */ -	iter = cgrp->tasks_pids + index; +	iter = cp->tasks_pids + index;  	*pos = *iter;  	return iter;  }  static void cgroup_tasks_stop(struct seq_file *s, void *v)  { -	struct cgroup *cgrp = s->private; +	struct cgroup_pids *cp = s->private; +	struct cgroup *cgrp = cp->cgrp;  	up_read(&cgrp->pids_mutex);  }  static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)  { -	struct cgroup *cgrp = s->private; +	struct cgroup_pids *cp = s->private;  	int *p = v; -	int *end = cgrp->tasks_pids + cgrp->pids_length; +	int *end = cp->tasks_pids + cp->length;  	/*  	 * Advance to the next pid in the array. If this goes off the @@ -2286,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {  	.show = cgroup_tasks_show,  }; -static void release_cgroup_pid_array(struct cgroup *cgrp) +static void release_cgroup_pid_array(struct cgroup_pids *cp)  { +	struct cgroup *cgrp = cp->cgrp; +  	down_write(&cgrp->pids_mutex); -	BUG_ON(!cgrp->pids_use_count); -	if (!--cgrp->pids_use_count) { -		kfree(cgrp->tasks_pids); -		cgrp->tasks_pids = NULL; -		cgrp->pids_length = 0; +	BUG_ON(!cp->use_count); +	if (!--cp->use_count) { +		list_del(&cp->list); +		put_pid_ns(cp->ns); +		kfree(cp->tasks_pids); +		kfree(cp);  	}  	up_write(&cgrp->pids_mutex);  }  static int cgroup_tasks_release(struct inode *inode, struct file *file)  { -	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); +	struct seq_file *seq; +	struct cgroup_pids *cp;  	if (!(file->f_mode & FMODE_READ))  		return 0; -	release_cgroup_pid_array(cgrp); +	seq = file->private_data; +	cp = seq->private; + +	release_cgroup_pid_array(cp);  	return seq_release(inode, file);  } @@ -2324,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {  static int cgroup_tasks_open(struct inode *unused, struct file *file)  {  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); +	struct pid_namespace *ns = current->nsproxy->pid_ns; +	struct cgroup_pids *cp;  	pid_t *pidarray;  	int npids;  	int retval; @@ -2350,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)  	 * array if necessary  	 */  	down_write(&cgrp->pids_mutex); -	kfree(cgrp->tasks_pids); -	cgrp->tasks_pids = pidarray; -	cgrp->pids_length = npids; -	cgrp->pids_use_count++; + +	list_for_each_entry(cp, &cgrp->pids_list, list) { +		if (ns == cp->ns) +			goto found; +	} + +	cp = kzalloc(sizeof(*cp), GFP_KERNEL); +	if (!cp) { +		up_write(&cgrp->pids_mutex); +		kfree(pidarray); +		return -ENOMEM; +	} +	cp->cgrp = cgrp; +	cp->ns = ns; +	get_pid_ns(ns); +	list_add(&cp->list, &cgrp->pids_list); +found: +	kfree(cp->tasks_pids); +	cp->tasks_pids = pidarray; +	cp->length = npids; +	cp->use_count++;  	
up_write(&cgrp->pids_mutex);  	file->f_op = &cgroup_tasks_operations;  	retval = seq_open(file, &cgroup_tasks_seq_operations);  	if (retval) { -		release_cgroup_pid_array(cgrp); +		release_cgroup_pid_array(cp);  		return retval;  	} -	((struct seq_file *)file->private_data)->private = cgrp; +	((struct seq_file *)file->private_data)->private = cp;  	return 0;  } @@ -2696,33 +2756,42 @@ again:  	mutex_unlock(&cgroup_mutex);  	/* +	 * In general, subsystem has no css->refcnt after pre_destroy(). But +	 * in racy cases, subsystem may have to get css->refcnt after +	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes +	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue +	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir +	 * and subsystem's reference count handling. Please see css_get/put +	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. +	 */ +	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + +	/*  	 * Call pre_destroy handlers of subsys. Notify subsystems  	 * that rmdir() request comes.  	 */  	ret = cgroup_call_pre_destroy(cgrp); -	if (ret) +	if (ret) { +		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);  		return ret; +	}  	mutex_lock(&cgroup_mutex);  	parent = cgrp->parent;  	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { +		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);  		mutex_unlock(&cgroup_mutex);  		return -EBUSY;  	} -	/* -	 * css_put/get is provided for subsys to grab refcnt to css. In typical -	 * case, subsystem has no reference after pre_destroy(). But, under -	 * hierarchy management, some *temporal* refcnt can be hold. -	 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys -	 * is really busy, it should return -EBUSY at pre_destroy(). wake_up -	 * is called when css_put() is called and refcnt goes down to 0. -	 */ -	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);  	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); -  	if (!cgroup_clear_css_refs(cgrp)) {  		mutex_unlock(&cgroup_mutex); -		schedule(); +		/* +		 * Because someone may call cgroup_wakeup_rmdir_waiter() before +		 * prepare_to_wait(), we need to check this flag. +		 */ +		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) +			schedule();  		finish_wait(&cgroup_rmdir_waitq, &wait);  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);  		if (signal_pending(current)) @@ -3294,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)  			set_bit(CGRP_RELEASABLE, &cgrp->flags);  			check_for_release(cgrp);  		} -		cgroup_wakeup_rmdir_waiters(cgrp); +		cgroup_wakeup_rmdir_waiter(cgrp);  	}  	rcu_read_unlock();  } diff --git a/kernel/exit.c b/kernel/exit.c index 628d41f0dd5..869dc221733 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -12,7 +12,6 @@  #include <linux/completion.h>  #include <linux/personality.h>  #include <linux/tty.h> -#include <linux/mnt_namespace.h>  #include <linux/iocontext.h>  #include <linux/key.h>  #include <linux/security.h> diff --git a/kernel/fork.c b/kernel/fork.c index 467746b3f0a..144326b7af5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -17,7 +17,6 @@  #include <linux/module.h>  #include <linux/vmalloc.h>  #include <linux/completion.h> -#include <linux/mnt_namespace.h>  #include <linux/personality.h>  #include <linux/mempolicy.h>  #include <linux/sem.h> @@ -568,18 +567,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)  	 * the value intact in a core dump, and to save the unnecessary  	 * trouble otherwise.  Userland only wants this done for a sys_exit.  	 
*/ -	if (tsk->clear_child_tid -	    && !(tsk->flags & PF_SIGNALED) -	    && atomic_read(&mm->mm_users) > 1) { -		u32 __user * tidptr = tsk->clear_child_tid; +	if (tsk->clear_child_tid) { +		if (!(tsk->flags & PF_SIGNALED) && +		    atomic_read(&mm->mm_users) > 1) { +			/* +			 * We don't check the error code - if userspace has +			 * not set up a proper pointer then tough luck. +			 */ +			put_user(0, tsk->clear_child_tid); +			sys_futex(tsk->clear_child_tid, FUTEX_WAKE, +					1, NULL, NULL, 0); +		}  		tsk->clear_child_tid = NULL; - -		/* -		 * We don't check the error code - if userspace has -		 * not set up a proper pointer then tough luck. -		 */ -		put_user(0, tidptr); -		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);  	}  } @@ -1269,6 +1268,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	write_unlock_irq(&tasklist_lock);  	proc_fork_connector(p);  	cgroup_post_fork(p); +	perf_counter_fork(p);  	return p;  bad_fork_free_pid: @@ -1408,12 +1408,6 @@ long do_fork(unsigned long clone_flags,  		if (clone_flags & CLONE_VFORK) {  			p->vfork_done = &vfork;  			init_completion(&vfork); -		} else if (!(clone_flags & CLONE_VM)) { -			/* -			 * vfork will do an exec which will call -			 * set_task_comm() -			 */ -			perf_counter_fork(p);  		}  		audit_finish_fork(p); diff --git a/kernel/freezer.c b/kernel/freezer.c index 2f4936cf708..bd1d42b17cb 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,12 +44,19 @@ void refrigerator(void)  	recalc_sigpending(); /* We sent fake signal, clean it up */  	spin_unlock_irq(¤t->sighand->siglock); +	/* prevent accounting of that task to load */ +	current->flags |= PF_FREEZING; +  	for (;;) {  		set_current_state(TASK_UNINTERRUPTIBLE);  		if (!frozen(current))  			break;  		schedule();  	} + +	/* Remove the accounting blocker */ +	current->flags &= ~PF_FREEZING; +  	pr_debug("%s left refrigerator\n", current->comm);  	__set_current_state(save);  } diff --git a/kernel/futex.c b/kernel/futex.c index 794c862125f..e18cfbdc719 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -247,6 +247,7 @@ again:  	if (err < 0)  		return err; +	page = compound_head(page);  	lock_page(page);  	if (!page->mapping) {  		unlock_page(page); @@ -1009,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,   * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue   * q:	the futex_q   * key:	the key of the requeue target futex + * hb:  the hash_bucket of the requeue target futex   *   * During futex_requeue, with requeue_pi=1, it is possible to acquire the   * target futex if it is uncontended or via a lock steal.  Set the futex_q key   * to the requeue target futex so the waiter can detect the wakeup on the right   * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition.  Must be called with the q->lock_ptr held. + * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock + * to protect access to the pi_state to fixup the owner later.  Must be called + * with both q->lock_ptr and hb->lock held.   
*/  static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, +			   struct futex_hash_bucket *hb)  {  	drop_futex_key_refs(&q->key);  	get_futex_key_refs(key); @@ -1029,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)  	WARN_ON(!q->rt_waiter);  	q->rt_waiter = NULL; +	q->lock_ptr = &hb->lock; +#ifdef CONFIG_DEBUG_PI_LIST +	q->list.plist.lock = &hb->lock; +#endif +  	wake_up_state(q->task, TASK_NORMAL);  } @@ -1087,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,  	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,  				   set_waiters);  	if (ret == 1) -		requeue_pi_wake_futex(top_waiter, key2); +		requeue_pi_wake_futex(top_waiter, key2, hb2);  	return ret;  } @@ -1246,8 +1256,15 @@ retry_private:  		if (!match_futex(&this->key, &key1))  			continue; -		WARN_ON(!requeue_pi && this->rt_waiter); -		WARN_ON(requeue_pi && !this->rt_waiter); +		/* +		 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always +		 * be paired with each other and no other futex ops. +		 */ +		if ((requeue_pi && !this->rt_waiter) || +		    (!requeue_pi && this->rt_waiter)) { +			ret = -EINVAL; +			break; +		}  		/*  		 * Wake nr_wake waiters.  For requeue_pi, if we acquired the @@ -1272,7 +1289,7 @@ retry_private:  							this->task, 1);  			if (ret == 1) {  				/* We got the lock. */ -				requeue_pi_wake_futex(this, &key2); +				requeue_pi_wake_futex(this, &key2, hb2);  				continue;  			} else if (ret) {  				/* -EDEADLK */ diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d607a5b9ee2..235716556bf 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,  	int cmd = op & FUTEX_CMD_MASK;  	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || -		      cmd == FUTEX_WAIT_BITSET)) { +		      cmd == FUTEX_WAIT_BITSET || +		      cmd == FUTEX_WAIT_REQUEUE_PI)) {  		if (get_compat_timespec(&ts, utime))  			return -EFAULT;  		if (!timespec_valid(&ts)) @@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,  			t = ktime_add_safe(ktime_get(), t);  		tp = &t;  	} -	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) +	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || +	    cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)  		val2 = (int) (unsigned long) utime;  	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9002958a96e..49da79ab848 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,  	}  } + +/* + * Get the preferred target CPU for NOHZ + */ +static int hrtimer_get_target(int this_cpu, int pinned) +{ +#ifdef CONFIG_NO_HZ +	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { +		int preferred_cpu = get_nohz_load_balancer(); + +		if (preferred_cpu >= 0) +			return preferred_cpu; +	} +#endif +	return this_cpu; +} + +/* + * With HIGHRES=y we do not migrate the timer when it is expiring + * before the next event on the target cpu because we cannot reprogram + * the target cpu hardware and we would cause it to fire late. + * + * Called with cpu_base->lock of target cpu held. 
+ */ +static int +hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +{ +#ifdef CONFIG_HIGH_RES_TIMERS +	ktime_t expires; + +	if (!new_base->cpu_base->hres_active) +		return 0; + +	expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); +	return expires.tv64 <= new_base->cpu_base->expires_next.tv64; +#else +	return 0; +#endif +} +  /*   * Switch the timer base to the current CPU when possible.   */ @@ -200,16 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,  {  	struct hrtimer_clock_base *new_base;  	struct hrtimer_cpu_base *new_cpu_base; -	int cpu, preferred_cpu = -1; - -	cpu = smp_processor_id(); -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) -	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { -		preferred_cpu = get_nohz_load_balancer(); -		if (preferred_cpu >= 0) -			cpu = preferred_cpu; -	} -#endif +	int this_cpu = smp_processor_id(); +	int cpu = hrtimer_get_target(this_cpu, pinned);  again:  	new_cpu_base = &per_cpu(hrtimer_bases, cpu); @@ -217,7 +249,7 @@ again:  	if (base != new_base) {  		/* -		 * We are trying to schedule the timer on the local CPU. +		 * We are trying to move timer to new_base.  		 * However we can't change timer's base while it is running,  		 * so we keep it on the same CPU. No hassle vs. reprogramming  		 * the event source in the high resolution case. The softirq @@ -233,38 +265,12 @@ again:  		spin_unlock(&base->cpu_base->lock);  		spin_lock(&new_base->cpu_base->lock); -		/* Optimized away for NOHZ=n SMP=n */ -		if (cpu == preferred_cpu) { -			/* Calculate clock monotonic expiry time */ -#ifdef CONFIG_HIGH_RES_TIMERS -			ktime_t expires = ktime_sub(hrtimer_get_expires(timer), -							new_base->offset); -#else -			ktime_t expires = hrtimer_get_expires(timer); -#endif - -			/* -			 * Get the next event on target cpu from the -			 * clock events layer. -			 * This covers the highres=off nohz=on case as well. -			 */ -			ktime_t next = clockevents_get_next_event(cpu); - -			ktime_t delta = ktime_sub(expires, next); - -			/* -			 * We do not migrate the timer when it is expiring -			 * before the next event on the target cpu because -			 * we cannot reprogram the target cpu hardware and -			 * we would cause it to fire late. -			 */ -			if (delta.tv64 < 0) { -				cpu = smp_processor_id(); -				spin_unlock(&new_base->cpu_base->lock); -				spin_lock(&base->cpu_base->lock); -				timer->base = base; -				goto again; -			} +		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { +			cpu = this_cpu; +			spin_unlock(&new_base->cpu_base->lock); +			spin_lock(&base->cpu_base->lock); +			timer->base = base; +			goto again;  		}  		timer->base = new_base;  	} @@ -1276,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)  	expires_next.tv64 = KTIME_MAX; +	spin_lock(&cpu_base->lock); +	/* +	 * We set expires_next to KTIME_MAX here with cpu_base->lock +	 * held to prevent that a timer is enqueued in our queue via +	 * the migration code. This does not affect enqueueing of +	 * timers which run their callback and need to be requeued on +	 * this CPU. 
+	 */ +	cpu_base->expires_next.tv64 = KTIME_MAX; +  	base = cpu_base->clock_base;  	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {  		ktime_t basenow;  		struct rb_node *node; -		spin_lock(&cpu_base->lock); -  		basenow = ktime_add(now, base->offset);  		while ((node = base->first)) { @@ -1316,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)  			__run_hrtimer(timer);  		} -		spin_unlock(&cpu_base->lock);  		base++;  	} +	/* +	 * Store the new expiry value so the migration code can verify +	 * against it. +	 */  	cpu_base->expires_next = expires_next; +	spin_unlock(&cpu_base->lock);  	/* Reprogramming necessary ? */  	if (expires_next.tv64 != KTIME_MAX) { diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 73468253143..e70ed5592eb 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,8 +42,7 @@ static inline void unregister_handler_proc(unsigned int irq,  extern int irq_select_affinity_usr(unsigned int irq); -extern void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); +extern void irq_set_thread_affinity(struct irq_desc *desc);  /*   * Debugging printout: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 50da6767290..0ec9ed83173 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq)  	return 1;  } -void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) +/** + *	irq_set_thread_affinity - Notify irq threads to adjust affinity + *	@desc:		irq descriptor which has affitnity changed + * + *	We just set IRQTF_AFFINITY and delegate the affinity setting + *	to the interrupt thread itself. We can not call + *	set_cpus_allowed_ptr() here as we hold desc->lock and this + *	code can be called from hard interrupt context. + */ +void irq_set_thread_affinity(struct irq_desc *desc)  {  	struct irqaction *action = desc->action;  	while (action) {  		if (action->thread) -			set_cpus_allowed_ptr(action->thread, cpumask); +			set_bit(IRQTF_AFFINITY, &action->thread_flags);  		action = action->next;  	}  } @@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)  	if (desc->status & IRQ_MOVE_PCNTXT) {  		if (!desc->chip->set_affinity(irq, cpumask)) {  			cpumask_copy(desc->affinity, cpumask); -			irq_set_thread_affinity(desc, cpumask); +			irq_set_thread_affinity(desc);  		}  	}  	else { @@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)  #else  	if (!desc->chip->set_affinity(irq, cpumask)) {  		cpumask_copy(desc->affinity, cpumask); -		irq_set_thread_affinity(desc, cpumask); +		irq_set_thread_affinity(desc);  	}  #endif  	desc->status |= IRQ_AFFINITY_SET; @@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq)  	spin_lock_irqsave(&desc->lock, flags);  	ret = setup_affinity(irq, desc);  	if (!ret) -		irq_set_thread_affinity(desc, desc->affinity); +		irq_set_thread_affinity(desc);  	spin_unlock_irqrestore(&desc->lock, flags);  	return ret; @@ -443,6 +451,39 @@ static int irq_wait_for_interrupt(struct irqaction *action)  	return -1;  } +#ifdef CONFIG_SMP +/* + * Check whether we need to change the affinity of the interrupt thread. 
+ */ +static void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) +{ +	cpumask_var_t mask; + +	if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) +		return; + +	/* +	 * In case we are out of memory we set IRQTF_AFFINITY again and +	 * try again next time +	 */ +	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { +		set_bit(IRQTF_AFFINITY, &action->thread_flags); +		return; +	} + +	spin_lock_irq(&desc->lock); +	cpumask_copy(mask, desc->affinity); +	spin_unlock_irq(&desc->lock); + +	set_cpus_allowed_ptr(current, mask); +	free_cpumask_var(mask); +} +#else +static inline void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } +#endif +  /*   * Interrupt handler thread   */ @@ -458,6 +499,8 @@ static int irq_thread(void *data)  	while (!irq_wait_for_interrupt(action)) { +		irq_thread_check_affinity(desc, action); +  		atomic_inc(&desc->threads_active);  		spin_lock_irq(&desc->lock); @@ -564,7 +607,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  		 */  		get_task_struct(t);  		new->thread = t; -		wake_up_process(t);  	}  	/* @@ -647,6 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  				(int)(new->flags & IRQF_TRIGGER_MASK));  	} +	new->irq = irq;  	*old_ptr = new;  	/* Reset broken irq detection when installing new handler */ @@ -664,7 +707,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  	spin_unlock_irqrestore(&desc->lock, flags); -	new->irq = irq; +	/* +	 * Strictly no need to wake it up, but hung_task complains +	 * when no hard interrupt wakes the thread up. +	 */ +	if (new->thread) +		wake_up_process(new->thread); +  	register_irq_proc(irq, desc);  	new->dir = NULL;  	register_handler_proc(irq, new); @@ -718,7 +767,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)  {  	struct irq_desc *desc = irq_to_desc(irq);  	struct irqaction *action, **action_ptr; -	struct task_struct *irqthread;  	unsigned long flags;  	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -766,9 +814,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)  			desc->chip->disable(irq);  	} -	irqthread = action->thread; -	action->thread = NULL; -  	spin_unlock_irqrestore(&desc->lock, flags);  	unregister_handler_proc(irq, action); @@ -776,12 +821,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)  	/* Make sure it's not being used on another CPU: */  	synchronize_irq(irq); -	if (irqthread) { -		if (!test_bit(IRQTF_DIED, &action->thread_flags)) -			kthread_stop(irqthread); -		put_task_struct(irqthread); -	} -  #ifdef CONFIG_DEBUG_SHIRQ  	/*  	 * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -797,6 +836,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)  		local_irq_restore(flags);  	}  #endif + +	if (action->thread) { +		if (!test_bit(IRQTF_DIED, &action->thread_flags)) +			kthread_stop(action->thread); +		put_task_struct(action->thread); +	} +  	return action;  } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index cfe767ca154..fcb6c96f262 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -45,7 +45,7 @@ void move_masked_irq(int irq)  		   < nr_cpu_ids))  		if (!desc->chip->set_affinity(irq, desc->pending_mask)) {  			cpumask_copy(desc->affinity, desc->pending_mask); -			irq_set_thread_affinity(desc, desc->pending_mask); +			irq_set_thread_affinity(desc);  		}  	cpumask_clear(desc->pending_mask); 
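The irq changes above replace the cpumask argument of irq_set_thread_affinity() with a flag handshake: the hard-irq path only sets IRQTF_AFFINITY under desc->lock, and the irq thread applies the new mask to itself in irq_thread_check_affinity() the next time it runs, because set_cpus_allowed_ptr() may not be called from hard interrupt context. A small userspace analogue of that defer-to-the-thread pattern, using pthread/stdatomic stand-ins rather than the kernel API:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int update_pending;	/* stands in for IRQTF_AFFINITY */
static atomic_int requested_setting;	/* stands in for desc->affinity */
static atomic_int done;

static void *worker(void *arg)
{
	int applied = 0;

	while (!atomic_load(&done)) {
		/*
		 * Cheap check in the thread's own context, like
		 * irq_thread_check_affinity() at the top of the irq thread loop.
		 */
		if (atomic_exchange(&update_pending, 0))
			applied = atomic_load(&requested_setting);
		usleep(1000);
	}
	printf("worker applied setting %d\n", applied);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	/* "Interrupt context": may not block, so only publish the request. */
	atomic_store(&requested_setting, 3);
	atomic_store(&update_pending, 1);

	usleep(10000);
	atomic_store(&done, 1);
	pthread_join(t, NULL);
	return 0;
}

The request path stays non-blocking; the potentially sleeping work happens in the thread's own context.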
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 2f69bee57bf..3fd30197da2 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -107,8 +107,8 @@ out_unlock:  struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)  { -	/* those all static, do move them */ -	if (desc->irq < NR_IRQS_LEGACY) +	/* those static or target node is -1, do not move them */ +	if (desc->irq < NR_IRQS_LEGACY || node == -1)  		return desc;  	if (desc->node != node) diff --git a/kernel/kexec.c b/kernel/kexec.c index ae1c35201cc..f336e2107f9 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char 			*cmdline,  	} while (*cur++ == ',');  	if (*crash_size > 0) { -		while (*cur != ' ' && *cur != '@') +		while (*cur && *cur != ' ' && *cur != '@')  			cur++;  		if (*cur == '@') {  			cur++; diff --git a/kernel/kmod.c b/kernel/kmod.c index 7e95bedb2bf..385c31a1bdb 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -24,7 +24,6 @@  #include <linux/unistd.h>  #include <linux/kmod.h>  #include <linux/slab.h> -#include <linux/mnt_namespace.h>  #include <linux/completion.h>  #include <linux/file.h>  #include <linux/fdtable.h> diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c0fa54b276d..0540948e29a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void)  {  	struct kprobe_insn_page *kip;  	struct hlist_node *pos, *next; -	int safety;  	/* Ensure no-one is preepmted on the garbages */ -	mutex_unlock(&kprobe_insn_mutex); -	safety = check_safety(); -	mutex_lock(&kprobe_insn_mutex); -	if (safety != 0) +	if (check_safety())  		return -EAGAIN;  	hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { @@ -698,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p)  	p->addr = addr;  	preempt_disable(); -	if (!__kernel_text_address((unsigned long) p->addr) || +	if (!kernel_text_address((unsigned long) p->addr) ||  	    in_kprobes_functions((unsigned long) p->addr)) {  		preempt_enable();  		return -EINVAL; diff --git a/kernel/kthread.c b/kernel/kthread.c index 9b1a7de2697..eb8751aa041 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,10 +180,12 @@ EXPORT_SYMBOL(kthread_bind);   * @k: thread created by kthread_create().   *   * Sets kthread_should_stop() for @k to return true, wakes it, and - * waits for it to exit.  Your threadfn() must not call do_exit() - * itself if you use this function!  This can also be called after - * kthread_create() instead of calling wake_up_process(): the thread - * will exit without calling threadfn(). + * waits for it to exit. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will exit without + * calling threadfn(). + * + * If threadfn() may call do_exit() itself, the caller must ensure + * task_struct can't go away.   *   * Returns the result of threadfn(), or %-EINTR if wake_up_process()   * was never called. 
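The parse_crashkernel_mem() hunk above adds a *cur test so the scan for a ' ' or '@' delimiter stops at the string's NUL terminator instead of running past the end of the command line when no offset suffix is given. A minimal demonstration of the fixed loop; skip_to_delim is a made-up helper name for this sketch.

#include <stdio.h>

/* Return a pointer to the first ' ' or '@', or to the terminating NUL. */
static const char *skip_to_delim(const char *cur)
{
	while (*cur && *cur != ' ' && *cur != '@')
		cur++;
	return cur;
}

int main(void)
{
	/* crashkernel=64M@16M style: an offset suffix is present */
	printf("with offset:    \"%s\"\n", skip_to_delim("64M@16M"));
	/* no '@' and no space: the loop must stop at the NUL, not run on */
	printf("without offset: \"%s\"\n", skip_to_delim("64M"));
	return 0;
}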
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d7135aa2d2c..e94caa666db 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void)  		    &proc_lockdep_stats_operations);  #ifdef CONFIG_LOCK_STAT -	proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); +	proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, +		    &proc_lock_stat_operations);  #endif  	return 0; diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2..fd141140355 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1068,7 +1068,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,  {  	const unsigned long *crc; -	if (!find_symbol("module_layout", NULL, &crc, true, false)) +	if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, +			 &crc, true, false))  		BUG();  	return check_version(sechdrs, versindex, "module_layout", mod, crc);  } @@ -2451,9 +2452,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,  		return ret;  	}  	if (ret > 0) { -		printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " -				    "it should follow 0/-E convention\n" -		       KERN_WARNING "%s: loading module anyway...\n", +		printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n",  		       __func__, mod->name, ret,  		       __func__);  		dump_stack(); diff --git a/kernel/panic.c b/kernel/panic.c index 984b3ecbd72..512ab73b0ca 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -301,6 +301,7 @@ int oops_may_print(void)   */  void oops_enter(void)  { +	tracing_off();  	/* can't trust the integrity of the kernel anymore: */  	debug_locks_off();  	do_oops_enter_exit(); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50da234..36f65e2b8b5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;  static atomic_t nr_counters __read_mostly;  static atomic_t nr_mmap_counters __read_mostly;  static atomic_t nr_comm_counters __read_mostly; +static atomic_t nr_task_counters __read_mostly;  /*   * perf counter paranoia level: @@ -87,6 +88,7 @@ void __weak hw_perf_disable(void)		{ barrier(); }  void __weak hw_perf_enable(void)		{ barrier(); }  void __weak hw_perf_counter_setup(int cpu)	{ barrier(); } +void __weak hw_perf_counter_setup_online(int cpu)	{ barrier(); }  int __weak  hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -146,6 +148,28 @@ static void put_ctx(struct perf_counter_context *ctx)  	}  } +static void unclone_ctx(struct perf_counter_context *ctx) +{ +	if (ctx->parent_ctx) { +		put_ctx(ctx->parent_ctx); +		ctx->parent_ctx = NULL; +	} +} + +/* + * If we inherit counters we want to return the parent counter id + * to userspace. + */ +static u64 primary_counter_id(struct perf_counter *counter) +{ +	u64 id = counter->id; + +	if (counter->parent) +		id = counter->parent->id; + +	return id; +} +  /*   * Get the perf_counter_context for a task and lock it.   
* This has to cope with with the fact that until it is locked, @@ -283,6 +307,10 @@ counter_sched_out(struct perf_counter *counter,  		return;  	counter->state = PERF_COUNTER_STATE_INACTIVE; +	if (counter->pending_disable) { +		counter->pending_disable = 0; +		counter->state = PERF_COUNTER_STATE_OFF; +	}  	counter->tstamp_stopped = ctx->time;  	counter->pmu->disable(counter);  	counter->oncpu = -1; @@ -1081,7 +1109,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx,  		__perf_counter_sync_stat(counter, next_counter);  		counter = list_next_entry(counter, event_entry); -		next_counter = list_next_entry(counter, event_entry); +		next_counter = list_next_entry(next_counter, event_entry);  	}  } @@ -1288,7 +1316,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)  #define MAX_INTERRUPTS (~0ULL)  static void perf_log_throttle(struct perf_counter *counter, int enable); -static void perf_log_period(struct perf_counter *counter, u64 period);  static void perf_adjust_period(struct perf_counter *counter, u64 events)  { @@ -1307,8 +1334,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events)  	if (!sample_period)  		sample_period = 1; -	perf_log_period(counter, sample_period); -  	hwc->sample_period = sample_period;  } @@ -1463,10 +1488,8 @@ static void perf_counter_enable_on_exec(struct task_struct *task)  	/*  	 * Unclone this context if we enabled any counter.  	 */ -	if (enabled && ctx->parent_ctx) { -		put_ctx(ctx->parent_ctx); -		ctx->parent_ctx = NULL; -	} +	if (enabled) +		unclone_ctx(ctx);  	spin_unlock(&ctx->lock); @@ -1480,10 +1503,21 @@ static void perf_counter_enable_on_exec(struct task_struct *task)   */  static void __perf_counter_read(void *info)  { +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);  	struct perf_counter *counter = info;  	struct perf_counter_context *ctx = counter->ctx;  	unsigned long flags; +	/* +	 * If this is a task context, we need to check whether it is +	 * the current task context of this cpu.  If not it has been +	 * scheduled out before the smp call arrived.  In that case +	 * counter->count would have been updated to a recent sample +	 * when the counter was scheduled out. 
+	 */ +	if (ctx->task && cpuctx->task_ctx != ctx) +		return; +  	local_irq_save(flags);  	if (ctx->is_active)  		update_context_time(ctx); @@ -1526,7 +1560,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,  static struct perf_counter_context *find_get_context(pid_t pid, int cpu)  { -	struct perf_counter_context *parent_ctx;  	struct perf_counter_context *ctx;  	struct perf_cpu_context *cpuctx;  	struct task_struct *task; @@ -1586,11 +1619,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)   retry:  	ctx = perf_lock_task_context(task, &flags);  	if (ctx) { -		parent_ctx = ctx->parent_ctx; -		if (parent_ctx) { -			put_ctx(parent_ctx); -			ctx->parent_ctx = NULL;		/* no longer a clone */ -		} +		unclone_ctx(ctx);  		spin_unlock_irqrestore(&ctx->lock, flags);  	} @@ -1642,6 +1671,8 @@ static void free_counter(struct perf_counter *counter)  			atomic_dec(&nr_mmap_counters);  		if (counter->attr.comm)  			atomic_dec(&nr_comm_counters); +		if (counter->attr.task) +			atomic_dec(&nr_task_counters);  	}  	if (counter->destroy) @@ -1676,14 +1707,133 @@ static int perf_release(struct inode *inode, struct file *file)  	return 0;  } +static int perf_counter_read_size(struct perf_counter *counter) +{ +	int entry = sizeof(u64); /* value */ +	int size = 0; +	int nr = 1; + +	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) +		size += sizeof(u64); + +	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) +		size += sizeof(u64); + +	if (counter->attr.read_format & PERF_FORMAT_ID) +		entry += sizeof(u64); + +	if (counter->attr.read_format & PERF_FORMAT_GROUP) { +		nr += counter->group_leader->nr_siblings; +		size += sizeof(u64); +	} + +	size += entry * nr; + +	return size; +} + +static u64 perf_counter_read_value(struct perf_counter *counter) +{ +	struct perf_counter *child; +	u64 total = 0; + +	total += perf_counter_read(counter); +	list_for_each_entry(child, &counter->child_list, child_list) +		total += perf_counter_read(child); + +	return total; +} + +static int perf_counter_read_entry(struct perf_counter *counter, +				   u64 read_format, char __user *buf) +{ +	int n = 0, count = 0; +	u64 values[2]; + +	values[n++] = perf_counter_read_value(counter); +	if (read_format & PERF_FORMAT_ID) +		values[n++] = primary_counter_id(counter); + +	count = n * sizeof(u64); + +	if (copy_to_user(buf, values, count)) +		return -EFAULT; + +	return count; +} + +static int perf_counter_read_group(struct perf_counter *counter, +				   u64 read_format, char __user *buf) +{ +	struct perf_counter *leader = counter->group_leader, *sub; +	int n = 0, size = 0, err = -EFAULT; +	u64 values[3]; + +	values[n++] = 1 + leader->nr_siblings; +	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { +		values[n++] = leader->total_time_enabled + +			atomic64_read(&leader->child_total_time_enabled); +	} +	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { +		values[n++] = leader->total_time_running + +			atomic64_read(&leader->child_total_time_running); +	} + +	size = n * sizeof(u64); + +	if (copy_to_user(buf, values, size)) +		return -EFAULT; + +	err = perf_counter_read_entry(leader, read_format, buf + size); +	if (err < 0) +		return err; + +	size += err; + +	list_for_each_entry(sub, &leader->sibling_list, list_entry) { +		err = perf_counter_read_entry(counter, read_format, +				buf + size); +		if (err < 0) +			return err; + +		size += err; +	} + +	return size; +} + +static int perf_counter_read_one(struct perf_counter *counter, +				 u64 read_format, char __user *buf) +{ +	
u64 values[4]; +	int n = 0; + +	values[n++] = perf_counter_read_value(counter); +	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { +		values[n++] = counter->total_time_enabled + +			atomic64_read(&counter->child_total_time_enabled); +	} +	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { +		values[n++] = counter->total_time_running + +			atomic64_read(&counter->child_total_time_running); +	} +	if (read_format & PERF_FORMAT_ID) +		values[n++] = primary_counter_id(counter); + +	if (copy_to_user(buf, values, n * sizeof(u64))) +		return -EFAULT; + +	return n * sizeof(u64); +} +  /*   * Read the performance counter - simple non blocking version for now   */  static ssize_t  perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)  { -	u64 values[4]; -	int n; +	u64 read_format = counter->attr.read_format; +	int ret;  	/*  	 * Return end-of-file for a read on a counter that is in @@ -1693,28 +1843,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)  	if (counter->state == PERF_COUNTER_STATE_ERROR)  		return 0; +	if (count < perf_counter_read_size(counter)) +		return -ENOSPC; +  	WARN_ON_ONCE(counter->ctx->parent_ctx);  	mutex_lock(&counter->child_mutex); -	values[0] = perf_counter_read(counter); -	n = 1; -	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) -		values[n++] = counter->total_time_enabled + -			atomic64_read(&counter->child_total_time_enabled); -	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) -		values[n++] = counter->total_time_running + -			atomic64_read(&counter->child_total_time_running); -	if (counter->attr.read_format & PERF_FORMAT_ID) -		values[n++] = counter->id; +	if (read_format & PERF_FORMAT_GROUP) +		ret = perf_counter_read_group(counter, read_format, buf); +	else +		ret = perf_counter_read_one(counter, read_format, buf);  	mutex_unlock(&counter->child_mutex); -	if (count < n * sizeof(u64)) -		return -EINVAL; -	count = n * sizeof(u64); - -	if (copy_to_user(buf, values, count)) -		return -EFAULT; - -	return count; +	return ret;  }  static ssize_t @@ -1811,8 +1951,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)  		counter->attr.sample_freq = value;  	} else { -		perf_log_period(counter, value); -  		counter->attr.sample_period = value;  		counter->hw.sample_period = value;  	} @@ -1881,6 +2019,10 @@ int perf_counter_task_disable(void)  	return 0;  } +#ifndef PERF_COUNTER_INDEX_OFFSET +# define PERF_COUNTER_INDEX_OFFSET 0 +#endif +  static int perf_counter_index(struct perf_counter *counter)  {  	if (counter->state != PERF_COUNTER_STATE_ACTIVE) @@ -2020,7 +2162,7 @@ fail:  static void perf_mmap_free_page(unsigned long addr)  { -	struct page *page = virt_to_page(addr); +	struct page *page = virt_to_page((void *)addr);  	page->mapping = NULL;  	__free_page(page); @@ -2220,7 +2362,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)  	if (counter->pending_disable) {  		counter->pending_disable = 0; -		perf_counter_disable(counter); +		__perf_counter_disable(counter);  	}  	if (counter->pending_wakeup) { @@ -2605,7 +2747,80 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)  	return task_pid_nr_ns(p, counter->ns);  } -static void perf_counter_output(struct perf_counter *counter, int nmi, +static void perf_output_read_one(struct perf_output_handle *handle, +				 struct perf_counter *counter) +{ +	u64 read_format = counter->attr.read_format; +	u64 values[4]; +	int n = 0; + +	values[n++] = 
atomic64_read(&counter->count); +	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { +		values[n++] = counter->total_time_enabled + +			atomic64_read(&counter->child_total_time_enabled); +	} +	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { +		values[n++] = counter->total_time_running + +			atomic64_read(&counter->child_total_time_running); +	} +	if (read_format & PERF_FORMAT_ID) +		values[n++] = primary_counter_id(counter); + +	perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, +			    struct perf_counter *counter) +{ +	struct perf_counter *leader = counter->group_leader, *sub; +	u64 read_format = counter->attr.read_format; +	u64 values[5]; +	int n = 0; + +	values[n++] = 1 + leader->nr_siblings; + +	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) +		values[n++] = leader->total_time_enabled; + +	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) +		values[n++] = leader->total_time_running; + +	if (leader != counter) +		leader->pmu->read(leader); + +	values[n++] = atomic64_read(&leader->count); +	if (read_format & PERF_FORMAT_ID) +		values[n++] = primary_counter_id(leader); + +	perf_output_copy(handle, values, n * sizeof(u64)); + +	list_for_each_entry(sub, &leader->sibling_list, list_entry) { +		n = 0; + +		if (sub != counter) +			sub->pmu->read(sub); + +		values[n++] = atomic64_read(&sub->count); +		if (read_format & PERF_FORMAT_ID) +			values[n++] = primary_counter_id(sub); + +		perf_output_copy(handle, values, n * sizeof(u64)); +	} +} + +static void perf_output_read(struct perf_output_handle *handle, +			     struct perf_counter *counter) +{ +	if (counter->attr.read_format & PERF_FORMAT_GROUP) +		perf_output_read_group(handle, counter); +	else +		perf_output_read_one(handle, counter); +} + +void perf_counter_output(struct perf_counter *counter, int nmi,  				struct perf_sample_data *data)  {  	int ret; @@ -2616,10 +2831,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,  	struct {  		u32 pid, tid;  	} tid_entry; -	struct { -		u64 id; -		u64 counter; -	} group_entry;  	struct perf_callchain_entry *callchain = NULL;  	int callchain_size = 0;  	u64 time; @@ -2661,19 +2872,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,  	if (sample_type & PERF_SAMPLE_ID)  		header.size += sizeof(u64); +	if (sample_type & PERF_SAMPLE_STREAM_ID) +		header.size += sizeof(u64); +  	if (sample_type & PERF_SAMPLE_CPU) {  		header.size += sizeof(cpu_entry);  		cpu_entry.cpu = raw_smp_processor_id(); +		cpu_entry.reserved = 0;  	}  	if (sample_type & PERF_SAMPLE_PERIOD)  		header.size += sizeof(u64); -	if (sample_type & PERF_SAMPLE_GROUP) { -		header.size += sizeof(u64) + -			counter->nr_siblings * sizeof(group_entry); -	} +	if (sample_type & PERF_SAMPLE_READ) +		header.size += perf_counter_read_size(counter);  	if (sample_type & PERF_SAMPLE_CALLCHAIN) {  		callchain = perf_callchain(data->regs); @@ -2685,6 +2898,18 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,  			header.size += sizeof(u64);  	} +	if (sample_type & PERF_SAMPLE_RAW) { +		int size = sizeof(u32); + +		if (data->raw) +			size += data->raw->size; +		else +			size += sizeof(u32); + +		WARN_ON_ONCE(size & (sizeof(u64)-1)); +		header.size += size; +	} +  	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);  	if (ret)  		return; @@ -2703,7 +2928,13 @@ static void perf_counter_output(struct perf_counter 
*counter, int nmi,  	if (sample_type & PERF_SAMPLE_ADDR)  		perf_output_put(&handle, data->addr); -	if (sample_type & PERF_SAMPLE_ID) +	if (sample_type & PERF_SAMPLE_ID) { +		u64 id = primary_counter_id(counter); + +		perf_output_put(&handle, id); +	} + +	if (sample_type & PERF_SAMPLE_STREAM_ID)  		perf_output_put(&handle, counter->id);  	if (sample_type & PERF_SAMPLE_CPU) @@ -2712,26 +2943,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,  	if (sample_type & PERF_SAMPLE_PERIOD)  		perf_output_put(&handle, data->period); -	/* -	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. -	 */ -	if (sample_type & PERF_SAMPLE_GROUP) { -		struct perf_counter *leader, *sub; -		u64 nr = counter->nr_siblings; - -		perf_output_put(&handle, nr); - -		leader = counter->group_leader; -		list_for_each_entry(sub, &leader->sibling_list, list_entry) { -			if (sub != counter) -				sub->pmu->read(sub); - -			group_entry.id = sub->id; -			group_entry.counter = atomic64_read(&sub->count); - -			perf_output_put(&handle, group_entry); -		} -	} +	if (sample_type & PERF_SAMPLE_READ) +		perf_output_read(&handle, counter);  	if (sample_type & PERF_SAMPLE_CALLCHAIN) {  		if (callchain) @@ -2742,6 +2955,22 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,  		}  	} +	if (sample_type & PERF_SAMPLE_RAW) { +		if (data->raw) { +			perf_output_put(&handle, data->raw->size); +			perf_output_copy(&handle, data->raw->data, data->raw->size); +		} else { +			struct { +				u32	size; +				u32	data; +			} raw = { +				.size = sizeof(u32), +				.data = 0, +			}; +			perf_output_put(&handle, raw); +		} +	} +  	perf_output_end(&handle);  } @@ -2754,8 +2983,6 @@ struct perf_read_event {  	u32				pid;  	u32				tid; -	u64				value; -	u64				format[3];  };  static void @@ -2767,87 +2994,74 @@ perf_counter_read_event(struct perf_counter *counter,  		.header = {  			.type = PERF_EVENT_READ,  			.misc = 0, -			.size = sizeof(event) - sizeof(event.format), +			.size = sizeof(event) + perf_counter_read_size(counter),  		},  		.pid = perf_counter_pid(counter, task),  		.tid = perf_counter_tid(counter, task), -		.value = atomic64_read(&counter->count),  	}; -	int ret, i = 0; - -	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { -		event.header.size += sizeof(u64); -		event.format[i++] = counter->total_time_enabled; -	} - -	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { -		event.header.size += sizeof(u64); -		event.format[i++] = counter->total_time_running; -	} - -	if (counter->attr.read_format & PERF_FORMAT_ID) { -		u64 id; - -		event.header.size += sizeof(u64); -		if (counter->parent) -			id = counter->parent->id; -		else -			id = counter->id; - -		event.format[i++] = id; -	} +	int ret;  	ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);  	if (ret)  		return; -	perf_output_copy(&handle, &event, event.header.size); +	perf_output_put(&handle, event); +	perf_output_read(&handle, counter); +  	perf_output_end(&handle);  }  /* - * fork tracking + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task   */ -struct perf_fork_event { -	struct task_struct	*task; +struct perf_task_event { +	struct task_struct		*task; +	struct perf_counter_context	*task_ctx;  	struct {  		struct perf_event_header	header;  		u32				pid;  		u32				ppid; +		u32				tid; +		u32				ptid;  	} event;  }; -static void perf_counter_fork_output(struct perf_counter *counter, -				     struct perf_fork_event *fork_event) +static void 
perf_counter_task_output(struct perf_counter *counter, +				     struct perf_task_event *task_event)  {  	struct perf_output_handle handle; -	int size = fork_event->event.header.size; -	struct task_struct *task = fork_event->task; +	int size = task_event->event.header.size; +	struct task_struct *task = task_event->task;  	int ret = perf_output_begin(&handle, counter, size, 0, 0);  	if (ret)  		return; -	fork_event->event.pid = perf_counter_pid(counter, task); -	fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); +	task_event->event.pid = perf_counter_pid(counter, task); +	task_event->event.ppid = perf_counter_pid(counter, current); -	perf_output_put(&handle, fork_event->event); +	task_event->event.tid = perf_counter_tid(counter, task); +	task_event->event.ptid = perf_counter_tid(counter, current); + +	perf_output_put(&handle, task_event->event);  	perf_output_end(&handle);  } -static int perf_counter_fork_match(struct perf_counter *counter) +static int perf_counter_task_match(struct perf_counter *counter)  { -	if (counter->attr.comm || counter->attr.mmap) +	if (counter->attr.comm || counter->attr.mmap || counter->attr.task)  		return 1;  	return 0;  } -static void perf_counter_fork_ctx(struct perf_counter_context *ctx, -				  struct perf_fork_event *fork_event) +static void perf_counter_task_ctx(struct perf_counter_context *ctx, +				  struct perf_task_event *task_event)  {  	struct perf_counter *counter; @@ -2856,51 +3070,62 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,  	rcu_read_lock();  	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { -		if (perf_counter_fork_match(counter)) -			perf_counter_fork_output(counter, fork_event); +		if (perf_counter_task_match(counter)) +			perf_counter_task_output(counter, task_event);  	}  	rcu_read_unlock();  } -static void perf_counter_fork_event(struct perf_fork_event *fork_event) +static void perf_counter_task_event(struct perf_task_event *task_event)  {  	struct perf_cpu_context *cpuctx; -	struct perf_counter_context *ctx; +	struct perf_counter_context *ctx = task_event->task_ctx;  	cpuctx = &get_cpu_var(perf_cpu_context); -	perf_counter_fork_ctx(&cpuctx->ctx, fork_event); +	perf_counter_task_ctx(&cpuctx->ctx, task_event);  	put_cpu_var(perf_cpu_context);  	rcu_read_lock(); -	/* -	 * doesn't really matter which of the child contexts the -	 * events ends up in. -	 */ -	ctx = rcu_dereference(current->perf_counter_ctxp); +	if (!ctx) +		ctx = rcu_dereference(task_event->task->perf_counter_ctxp);  	if (ctx) -		perf_counter_fork_ctx(ctx, fork_event); +		perf_counter_task_ctx(ctx, task_event);  	rcu_read_unlock();  } -void perf_counter_fork(struct task_struct *task) +static void perf_counter_task(struct task_struct *task, +			      struct perf_counter_context *task_ctx, +			      int new)  { -	struct perf_fork_event fork_event; +	struct perf_task_event task_event;  	if (!atomic_read(&nr_comm_counters) && -	    !atomic_read(&nr_mmap_counters)) +	    !atomic_read(&nr_mmap_counters) && +	    !atomic_read(&nr_task_counters))  		return; -	fork_event = (struct perf_fork_event){ -		.task	= task, -		.event  = { +	task_event = (struct perf_task_event){ +		.task	  = task, +		.task_ctx = task_ctx, +		.event    = {  			.header = { -				.type = PERF_EVENT_FORK, -				.size = sizeof(fork_event.event), +				.type = new ? 
PERF_EVENT_FORK : PERF_EVENT_EXIT, +				.misc = 0, +				.size = sizeof(task_event.event),  			}, +			/* .pid  */ +			/* .ppid */ +			/* .tid  */ +			/* .ptid */  		},  	}; -	perf_counter_fork_event(&fork_event); +	perf_counter_task_event(&task_event); +} + +void perf_counter_fork(struct task_struct *task) +{ +	perf_counter_task(task, NULL, 1);  }  /* @@ -2968,8 +3193,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)  	struct perf_cpu_context *cpuctx;  	struct perf_counter_context *ctx;  	unsigned int size; -	char *comm = comm_event->task->comm; +	char comm[TASK_COMM_LEN]; +	memset(comm, 0, sizeof(comm)); +	strncpy(comm, comm_event->task->comm, sizeof(comm));  	size = ALIGN(strlen(comm)+1, sizeof(u64));  	comm_event->comm = comm; @@ -3004,8 +3231,16 @@ void perf_counter_comm(struct task_struct *task)  	comm_event = (struct perf_comm_event){  		.task	= task, +		/* .comm      */ +		/* .comm_size */  		.event  = { -			.header = { .type = PERF_EVENT_COMM, }, +			.header = { +				.type = PERF_EVENT_COMM, +				.misc = 0, +				/* .size */ +			}, +			/* .pid */ +			/* .tid */  		},  	}; @@ -3088,8 +3323,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)  	char *buf = NULL;  	const char *name; +	memset(tmp, 0, sizeof(tmp)); +  	if (file) { -		buf = kzalloc(PATH_MAX, GFP_KERNEL); +		/* +		 * d_path works from the end of the buffer backwards, so we +		 * need to add enough zero bytes after the string to handle +		 * the 64bit alignment we do later. +		 */ +		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);  		if (!buf) {  			name = strncpy(tmp, "//enomem", sizeof(tmp));  			goto got_name; @@ -3100,9 +3342,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)  			goto got_name;  		}  	} else { -		name = arch_vma_name(mmap_event->vma); -		if (name) +		if (arch_vma_name(mmap_event->vma)) { +			name = strncpy(tmp, arch_vma_name(mmap_event->vma), +				       sizeof(tmp));  			goto got_name; +		}  		if (!vma->vm_mm) {  			name = strncpy(tmp, "[vdso]", sizeof(tmp)); @@ -3147,8 +3391,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma)  	mmap_event = (struct perf_mmap_event){  		.vma	= vma, +		/* .file_name */ +		/* .file_size */  		.event  = { -			.header = { .type = PERF_EVENT_MMAP, }, +			.header = { +				.type = PERF_EVENT_MMAP, +				.misc = 0, +				/* .size */ +			}, +			/* .pid */ +			/* .tid */  			.start  = vma->vm_start,  			.len    = vma->vm_end - vma->vm_start,  			.pgoff  = vma->vm_pgoff, @@ -3159,49 +3411,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma)  }  /* - * Log sample_period changes so that analyzing tools can re-normalize the - * event flow. 
- */ - -struct freq_event { -	struct perf_event_header	header; -	u64				time; -	u64				id; -	u64				period; -}; - -static void perf_log_period(struct perf_counter *counter, u64 period) -{ -	struct perf_output_handle handle; -	struct freq_event event; -	int ret; - -	if (counter->hw.sample_period == period) -		return; - -	if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) -		return; - -	event = (struct freq_event) { -		.header = { -			.type = PERF_EVENT_PERIOD, -			.misc = 0, -			.size = sizeof(event), -		}, -		.time = sched_clock(), -		.id = counter->id, -		.period = period, -	}; - -	ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); -	if (ret) -		return; - -	perf_output_put(&handle, event); -	perf_output_end(&handle); -} - -/*   * IRQ throttle logging   */ @@ -3214,16 +3423,21 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)  		struct perf_event_header	header;  		u64				time;  		u64				id; +		u64				stream_id;  	} throttle_event = {  		.header = { -			.type = PERF_EVENT_THROTTLE + 1, +			.type = PERF_EVENT_THROTTLE,  			.misc = 0,  			.size = sizeof(throttle_event),  		}, -		.time	= sched_clock(), -		.id	= counter->id, +		.time		= sched_clock(), +		.id		= primary_counter_id(counter), +		.stream_id	= counter->id,  	}; +	if (enable) +		throttle_event.header.type = PERF_EVENT_UNTHROTTLE; +  	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);  	if (ret)  		return; @@ -3300,125 +3514,111 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,   * Generic software counter infrastructure   */ -static void perf_swcounter_update(struct perf_counter *counter) +/* + * We directly increment counter->count and keep a second value in + * counter->hw.period_left to count intervals. This period counter + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. 
+ */ + +static u64 perf_swcounter_set_period(struct perf_counter *counter)  {  	struct hw_perf_counter *hwc = &counter->hw; -	u64 prev, now; -	s64 delta; +	u64 period = hwc->last_period; +	u64 nr, offset; +	s64 old, val; + +	hwc->last_period = hwc->sample_period;  again: -	prev = atomic64_read(&hwc->prev_count); -	now = atomic64_read(&hwc->count); -	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) -		goto again; +	old = val = atomic64_read(&hwc->period_left); +	if (val < 0) +		return 0; -	delta = now - prev; +	nr = div64_u64(period + val, period); +	offset = nr * period; +	val -= offset; +	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) +		goto again; -	atomic64_add(delta, &counter->count); -	atomic64_sub(delta, &hwc->period_left); +	return nr;  } -static void perf_swcounter_set_period(struct perf_counter *counter) +static void perf_swcounter_overflow(struct perf_counter *counter, +				    int nmi, struct perf_sample_data *data)  {  	struct hw_perf_counter *hwc = &counter->hw; -	s64 left = atomic64_read(&hwc->period_left); -	s64 period = hwc->sample_period; +	u64 overflow; -	if (unlikely(left <= -period)) { -		left = period; -		atomic64_set(&hwc->period_left, left); -		hwc->last_period = period; -	} +	data->period = counter->hw.last_period; +	overflow = perf_swcounter_set_period(counter); -	if (unlikely(left <= 0)) { -		left += period; -		atomic64_add(period, &hwc->period_left); -		hwc->last_period = period; -	} +	if (hwc->interrupts == MAX_INTERRUPTS) +		return; -	atomic64_set(&hwc->prev_count, -left); -	atomic64_set(&hwc->count, -left); +	for (; overflow; overflow--) { +		if (perf_counter_overflow(counter, nmi, data)) { +			/* +			 * We inhibit the overflow from happening when +			 * hwc->interrupts == MAX_INTERRUPTS. +			 */ +			break; +		} +	}  } -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +static void perf_swcounter_unthrottle(struct perf_counter *counter)  { -	enum hrtimer_restart ret = HRTIMER_RESTART; -	struct perf_sample_data data; -	struct perf_counter *counter; -	u64 period; - -	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer); -	counter->pmu->read(counter); - -	data.addr = 0; -	data.regs = get_irq_regs();  	/* -	 * In case we exclude kernel IPs or are somehow not in interrupt -	 * context, provide the next best thing, the user IP. +	 * Nothing to do, we already reset hwc->interrupts.  	 
*/ -	if ((counter->attr.exclude_kernel || !data.regs) && -			!counter->attr.exclude_user) -		data.regs = task_pt_regs(current); +} -	if (data.regs) { -		if (perf_counter_overflow(counter, 0, &data)) -			ret = HRTIMER_NORESTART; -	} +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, +			       int nmi, struct perf_sample_data *data) +{ +	struct hw_perf_counter *hwc = &counter->hw; -	period = max_t(u64, 10000, counter->hw.sample_period); -	hrtimer_forward_now(hrtimer, ns_to_ktime(period)); +	atomic64_add(nr, &counter->count); -	return ret; -} +	if (!hwc->sample_period) +		return; -static void perf_swcounter_overflow(struct perf_counter *counter, -				    int nmi, struct perf_sample_data *data) -{ -	data->period = counter->hw.last_period; +	if (!data->regs) +		return; -	perf_swcounter_update(counter); -	perf_swcounter_set_period(counter); -	if (perf_counter_overflow(counter, nmi, data)) -		/* soft-disable the counter */ -		; +	if (!atomic64_add_negative(nr, &hwc->period_left)) +		perf_swcounter_overflow(counter, nmi, data);  }  static int perf_swcounter_is_counting(struct perf_counter *counter)  { -	struct perf_counter_context *ctx; -	unsigned long flags; -	int count; - +	/* +	 * The counter is active, we're good! +	 */  	if (counter->state == PERF_COUNTER_STATE_ACTIVE)  		return 1; +	/* +	 * The counter is off/error, not counting. +	 */  	if (counter->state != PERF_COUNTER_STATE_INACTIVE)  		return 0;  	/* -	 * If the counter is inactive, it could be just because -	 * its task is scheduled out, or because it's in a group -	 * which could not go on the PMU.  We want to count in -	 * the first case but not the second.  If the context is -	 * currently active then an inactive software counter must -	 * be the second case.  If it's not currently active then -	 * we need to know whether the counter was active when the -	 * context was last active, which we can determine by -	 * comparing counter->tstamp_stopped with ctx->time. -	 * -	 * We are within an RCU read-side critical section, -	 * which protects the existence of *ctx. +	 * The counter is inactive, if the context is active +	 * we're part of a group that didn't make it on the 'pmu', +	 * not counting.  	 */ -	ctx = counter->ctx; -	spin_lock_irqsave(&ctx->lock, flags); -	count = 1; -	/* Re-check state now we have the lock */ -	if (counter->state < PERF_COUNTER_STATE_INACTIVE || -	    counter->ctx->is_active || -	    counter->tstamp_stopped < ctx->time) -		count = 0; -	spin_unlock_irqrestore(&ctx->lock, flags); -	return count; +	if (counter->ctx->is_active) +		return 0; + +	/* +	 * We're inactive and the context is too, this means the +	 * task is scheduled out, we're counting events that happen +	 * to us, like migration events. 
+	 */ +	return 1;  }  static int perf_swcounter_match(struct perf_counter *counter, @@ -3444,15 +3644,6 @@ static int perf_swcounter_match(struct perf_counter *counter,  	return 1;  } -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, -			       int nmi, struct perf_sample_data *data) -{ -	int neg = atomic64_add_negative(nr, &counter->hw.count); - -	if (counter->hw.sample_period && !neg && data->regs) -		perf_swcounter_overflow(counter, nmi, data); -} -  static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,  				     enum perf_type_id type,  				     u32 event, u64 nr, int nmi, @@ -3531,27 +3722,66 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,  static void perf_swcounter_read(struct perf_counter *counter)  { -	perf_swcounter_update(counter);  }  static int perf_swcounter_enable(struct perf_counter *counter)  { -	perf_swcounter_set_period(counter); +	struct hw_perf_counter *hwc = &counter->hw; + +	if (hwc->sample_period) { +		hwc->last_period = hwc->sample_period; +		perf_swcounter_set_period(counter); +	}  	return 0;  }  static void perf_swcounter_disable(struct perf_counter *counter)  { -	perf_swcounter_update(counter);  }  static const struct pmu perf_ops_generic = {  	.enable		= perf_swcounter_enable,  	.disable	= perf_swcounter_disable,  	.read		= perf_swcounter_read, +	.unthrottle	= perf_swcounter_unthrottle,  };  /* + * hrtimer based swcounter callback + */ + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ +	enum hrtimer_restart ret = HRTIMER_RESTART; +	struct perf_sample_data data; +	struct perf_counter *counter; +	u64 period; + +	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer); +	counter->pmu->read(counter); + +	data.addr = 0; +	data.regs = get_irq_regs(); +	/* +	 * In case we exclude kernel IPs or are somehow not in interrupt +	 * context, provide the next best thing, the user IP. +	 */ +	if ((counter->attr.exclude_kernel || !data.regs) && +			!counter->attr.exclude_user) +		data.regs = task_pt_regs(current); + +	if (data.regs) { +		if (perf_counter_overflow(counter, 0, &data)) +			ret = HRTIMER_NORESTART; +	} + +	period = max_t(u64, 10000, counter->hw.sample_period); +	hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + +	return ret; +} + +/*   * Software counter: cpu wall time clock   */ @@ -3668,17 +3898,24 @@ static const struct pmu perf_ops_task_clock = {  };  #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id) +void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, +			  int entry_size)  { +	struct perf_raw_record raw = { +		.size = entry_size, +		.data = record, +	}; +  	struct perf_sample_data data = { -		.regs = get_irq_regs(); -		.addr = 0, +		.regs = get_irq_regs(), +		.addr = addr, +		.raw = &raw,  	};  	if (!data.regs)  		data.regs = task_pt_regs(current); -	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); +	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);  }  EXPORT_SYMBOL_GPL(perf_tpcounter_event); @@ -3687,16 +3924,20 @@ extern void ftrace_profile_disable(int);  static void tp_perf_counter_destroy(struct perf_counter *counter)  { -	ftrace_profile_disable(perf_event_id(&counter->attr)); +	ftrace_profile_disable(counter->attr.config);  }  static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)  { -	int event_id = perf_event_id(&counter->attr); -	int ret; +	/* +	 * Raw tracepoint data is a severe data leak, only allow root to +	 * have these. 
+	 */ +	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && +			!capable(CAP_SYS_ADMIN)) +		return ERR_PTR(-EPERM); -	ret = ftrace_profile_enable(event_id); -	if (ret) +	if (ftrace_profile_enable(counter->attr.config))  		return NULL;  	counter->destroy = tp_perf_counter_destroy; @@ -3829,9 +4070,9 @@ perf_counter_alloc(struct perf_counter_attr *attr,  	atomic64_set(&hwc->period_left, hwc->sample_period);  	/* -	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters +	 * we currently do not support PERF_FORMAT_GROUP on inherited counters  	 */ -	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) +	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))  		goto done;  	switch (attr->type) { @@ -3874,6 +4115,8 @@ done:  			atomic_inc(&nr_mmap_counters);  		if (counter->attr.comm)  			atomic_inc(&nr_comm_counters); +		if (counter->attr.task) +			atomic_inc(&nr_task_counters);  	}  	return counter; @@ -4235,8 +4478,10 @@ void perf_counter_exit_task(struct task_struct *child)  	struct perf_counter_context *child_ctx;  	unsigned long flags; -	if (likely(!child->perf_counter_ctxp)) +	if (likely(!child->perf_counter_ctxp)) { +		perf_counter_task(child, NULL, 0);  		return; +	}  	local_irq_save(flags);  	/* @@ -4255,17 +4500,20 @@ void perf_counter_exit_task(struct task_struct *child)  	 */  	spin_lock(&child_ctx->lock);  	child->perf_counter_ctxp = NULL; -	if (child_ctx->parent_ctx) { -		/* -		 * This context is a clone; unclone it so it can't get -		 * swapped to another process while we're removing all -		 * the counters from it. -		 */ -		put_ctx(child_ctx->parent_ctx); -		child_ctx->parent_ctx = NULL; -	} -	spin_unlock(&child_ctx->lock); -	local_irq_restore(flags); +	/* +	 * If this context is a clone; unclone it so it can't get +	 * swapped to another process while we're removing all +	 * the counters from it. +	 */ +	unclone_ctx(child_ctx); +	spin_unlock_irqrestore(&child_ctx->lock, flags); + +	/* +	 * Report the task dead after unscheduling the counters so that we +	 * won't get any samples after PERF_EVENT_EXIT. We can however still +	 * get a few PERF_EVENT_READ events. +	 */ +	perf_counter_task(child, child_ctx, 0);  	/*  	 * We can recurse on the same lock type through: @@ -4486,6 +4734,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)  		perf_counter_init_cpu(cpu);  		break; +	case CPU_ONLINE: +	case CPU_ONLINE_FROZEN: +		hw_perf_counter_setup_online(cpu); +		break; +  	case CPU_DOWN_PREPARE:  	case CPU_DOWN_PREPARE_FROZEN:  		perf_counter_exit_cpu(cpu); @@ -4510,6 +4763,8 @@ void __init perf_counter_init(void)  {  	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,  			(void *)(long)smp_processor_id()); +	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, +			(void *)(long)smp_processor_id());  	register_cpu_notifier(&perf_cpu_nb);  } diff --git a/kernel/pid.c b/kernel/pid.c index 5fa1db48d8b..31310b5d3f5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,7 +36,6 @@  #include <linux/pid_namespace.h>  #include <linux/init_task.h>  #include <linux/syscalls.h> -#include <linux/kmemleak.h>  #define pid_hashfn(nr, ns)	\  	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -513,12 +512,6 @@ void __init pidhash_init(void)  	pid_hash = alloc_bootmem(pidhash_size *	sizeof(*(pid_hash)));  	if (!pid_hash)  		panic("Could not alloc pidhash!\n"); -	/* -	 * pid_hash contains references to allocated struct pid objects and it -	 * must be scanned by kmemleak to avoid false positives. 
-	 */ -	kmemleak_alloc(pid_hash, pidhash_size *	sizeof(*(pid_hash)), 0, -		       GFP_KERNEL);  	for (i = 0; i < pidhash_size; i++)  		INIT_HLIST_HEAD(&pid_hash[i]);  } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bece7c0b67b..e33a21cb940 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)  }  void posix_cpu_timers_exit_group(struct task_struct *tsk)  { -	struct task_cputime cputime; +	struct signal_struct *const sig = tsk->signal; -	thread_group_cputimer(tsk, &cputime);  	cleanup_timers(tsk->signal->cpu_timers, -		       cputime.utime, cputime.stime, cputime.sum_exec_runtime); +		       cputime_add(tsk->utime, sig->utime), +		       cputime_add(tsk->stime, sig->stime), +		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);  }  static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 052ec4d195c..d089d052c4a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)  	return -EOPNOTSUPP;  } +static int no_nsleep(const clockid_t which_clock, int flags, +		     struct timespec *tsave, struct timespec __user *rmtp) +{ +	return -EOPNOTSUPP; +} +  /*   * Return nonzero if we know a priori this clockid_t value is bogus.   */ @@ -254,6 +260,7 @@ static __init int init_posix_timers(void)  		.clock_get = posix_get_monotonic_raw,  		.clock_set = do_posix_clock_nosettime,  		.timer_create = no_timer_create, +		.nsleep = no_nsleep,  	};  	register_posix_clock(CLOCK_REALTIME, &clock_realtime); diff --git a/kernel/power/user.c b/kernel/power/user.c index ed97375daae..bf0014d6a5f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -23,7 +23,6 @@  #include <linux/console.h>  #include <linux/cpu.h>  #include <linux/freezer.h> -#include <linux/smp_lock.h>  #include <scsi/scsi_scan.h>  #include <asm/uaccess.h> diff --git a/kernel/profile.c b/kernel/profile.c index 69911b5745e..419250ebec4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -117,11 +117,12 @@ int __ref profile_init(void)  	cpumask_copy(prof_cpu_mask, cpu_possible_mask); -	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); +	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);  	if (prof_buffer)  		return 0; -	prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); +	prof_buffer = alloc_pages_exact(buffer_bytes, +					GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);  	if (prof_buffer)  		return 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 61c78b2c07b..082c320e4db 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -181,8 +181,8 @@ int ptrace_attach(struct task_struct *task)  	 * interference; SUID, SGID and LSM creds get determined differently  	 * under ptrace.  	 
*/ -	retval = mutex_lock_interruptible(&task->cred_guard_mutex); -	if (retval < 0) +	retval = -ERESTARTNOINTR; +	if (mutex_lock_interruptible(&task->cred_guard_mutex))  		goto out;  	task_lock(task); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0dccfbba6d2..7717b95c202 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1533,7 +1533,7 @@ void __init __rcu_init(void)  	int j;  	struct rcu_node *rnp; -	printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); +	printk(KERN_INFO "Hierarchical RCU implementation.\n");  #ifdef CONFIG_RCU_CPU_STALL_DETECTOR  	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");  #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ @@ -1546,7 +1546,6 @@ void __init __rcu_init(void)  		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);  	/* Register notifier for non-boot CPUs */  	register_cpu_notifier(&rcu_nb); -	printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");  }  module_param(blimit, int, 0); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index fcd107a78c5..29bd4baf9e7 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,  	if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {  		/* We got the lock for task. */  		debug_rt_mutex_lock(lock); -  		rt_mutex_set_owner(lock, task, 0); - +		spin_unlock(&lock->wait_lock);  		rt_mutex_deadlock_account_lock(lock, task);  		return 1;  	}  	ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); -  	if (ret && !waiter->task) {  		/*  		 * Reset the return value. We might have diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e..1b59e265273 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -493,6 +493,7 @@ struct rt_rq {  #endif  #ifdef CONFIG_SMP  	unsigned long rt_nr_migratory; +	unsigned long rt_nr_total;  	int overloaded;  	struct plist_head pushable_tasks;  #endif @@ -2571,15 +2572,37 @@ static void __sched_fork(struct task_struct *p)  	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity;  #ifdef CONFIG_SCHEDSTATS -	p->se.wait_start		= 0; -	p->se.sum_sleep_runtime		= 0; -	p->se.sleep_start		= 0; -	p->se.block_start		= 0; -	p->se.sleep_max			= 0; -	p->se.block_max			= 0; -	p->se.exec_max			= 0; -	p->se.slice_max			= 0; -	p->se.wait_max			= 0; +	p->se.wait_start			= 0; +	p->se.wait_max				= 0; +	p->se.wait_count			= 0; +	p->se.wait_sum				= 0; + +	p->se.sleep_start			= 0; +	p->se.sleep_max				= 0; +	p->se.sum_sleep_runtime			= 0; + +	p->se.block_start			= 0; +	p->se.block_max				= 0; +	p->se.exec_max				= 0; +	p->se.slice_max				= 0; + +	p->se.nr_migrations_cold		= 0; +	p->se.nr_failed_migrations_affine	= 0; +	p->se.nr_failed_migrations_running	= 0; +	p->se.nr_failed_migrations_hot		= 0; +	p->se.nr_forced_migrations		= 0; +	p->se.nr_forced2_migrations		= 0; + +	p->se.nr_wakeups			= 0; +	p->se.nr_wakeups_sync			= 0; +	p->se.nr_wakeups_migrate		= 0; +	p->se.nr_wakeups_local			= 0; +	p->se.nr_wakeups_remote			= 0; +	p->se.nr_wakeups_affine			= 0; +	p->se.nr_wakeups_affine_attempts	= 0; +	p->se.nr_wakeups_passive		= 0; +	p->se.nr_wakeups_idle			= 0; +  #endif  	INIT_LIST_HEAD(&p->rt.run_list); @@ -6541,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)  	return 0;  } +static inline int should_resched(void) +{ +	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); +} +  static void __cond_resched(void)  {  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -6560,8 +6588,7 @@ static void __cond_resched(void)  int __sched _cond_resched(void)  { -	if 
(need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && -					system_state == SYSTEM_RUNNING) { +	if (should_resched()) {  		__cond_resched();  		return 1;  	} @@ -6579,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);   */  int cond_resched_lock(spinlock_t *lock)  { -	int resched = need_resched() && system_state == SYSTEM_RUNNING; +	int resched = should_resched();  	int ret = 0;  	if (spin_needbreak(lock) || resched) {  		spin_unlock(lock); -		if (resched && need_resched()) +		if (resched)  			__cond_resched();  		else  			cpu_relax(); @@ -6599,7 +6626,7 @@ int __sched cond_resched_softirq(void)  {  	BUG_ON(!in_softirq()); -	if (need_resched() && system_state == SYSTEM_RUNNING) { +	if (should_resched()) {  		local_bh_enable();  		__cond_resched();  		local_bh_disable(); @@ -7262,6 +7289,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)  static void calc_global_load_remove(struct rq *rq)  {  	atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +	rq->calc_load_active = 0;  }  #endif /* CONFIG_HOTPLUG_CPU */ @@ -7488,6 +7516,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  		task_rq_unlock(rq, &flags);  		get_task_struct(p);  		cpu_rq(cpu)->migration_thread = p; +		rq->calc_load_update = calc_load_update;  		break;  	case CPU_ONLINE: @@ -7498,8 +7527,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  		/* Update our root-domain */  		rq = cpu_rq(cpu);  		spin_lock_irqsave(&rq->lock, flags); -		rq->calc_load_update = calc_load_update; -		rq->calc_load_active = 0;  		if (rq->rd) {  			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -9070,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)  #ifdef CONFIG_SMP  	rt_rq->rt_nr_migratory = 0;  	rt_rq->overloaded = 0; -	plist_head_init(&rq->rt.pushable_tasks, &rq->lock); +	plist_head_init(&rt_rq->pushable_tasks, &rq->lock);  #endif  	rt_rq->rt_time = 0; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6c251790dd..d014efbf947 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,  		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)  			continue; -		if (lowest_mask) +		if (lowest_mask) {  			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + +			/* +			 * We have to ensure that we have at least one bit +			 * still set in the array, since the map could have +			 * been concurrently emptied between the first and +			 * second reads of vec->mask.  If we hit this +			 * condition, simply act as though we never hit this +			 * priority level and continue on. 
+			 */ +			if (cpumask_any(lowest_mask) >= nr_cpu_ids) +				continue; +		} +  		return 1;  	} diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ba7fd6e9556..652e8bdef9a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)  	return min_vruntime;  } +static inline int entity_before(struct sched_entity *a, +				struct sched_entity *b) +{ +	return (s64)(a->vruntime - b->vruntime) < 0; +} +  static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  	return se->vruntime - cfs_rq->min_vruntime; @@ -605,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  #ifdef CONFIG_SCHEDSTATS +	struct task_struct *tsk = NULL; + +	if (entity_is_task(se)) +		tsk = task_of(se); +  	if (se->sleep_start) {  		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; -		struct task_struct *tsk = task_of(se);  		if ((s64)delta < 0)  			delta = 0; @@ -618,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  		se->sleep_start = 0;  		se->sum_sleep_runtime += delta; -		account_scheduler_latency(tsk, delta >> 10, 1); +		if (tsk) +			account_scheduler_latency(tsk, delta >> 10, 1);  	}  	if (se->block_start) {  		u64 delta = rq_of(cfs_rq)->clock - se->block_start; -		struct task_struct *tsk = task_of(se);  		if ((s64)delta < 0)  			delta = 0; @@ -633,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  		se->block_start = 0;  		se->sum_sleep_runtime += delta; -		/* -		 * Blocking time is in units of nanosecs, so shift by 20 to -		 * get a milliseconds-range estimation of the amount of -		 * time that the task spent sleeping: -		 */ -		if (unlikely(prof_on == SLEEP_PROFILING)) { - -			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), -				     delta >> 20); +		if (tsk) { +			/* +			 * Blocking time is in units of nanosecs, so shift by +			 * 20 to get a milliseconds-range estimation of the +			 * amount of time that the task spent sleeping: +			 */ +			if (unlikely(prof_on == SLEEP_PROFILING)) { +				profile_hits(SLEEP_PROFILING, +						(void *)get_wchan(tsk), +						delta >> 20); +			} +			account_scheduler_latency(tsk, delta >> 10, 0);  		} -		account_scheduler_latency(tsk, delta >> 10, 0);  	}  #endif  } @@ -687,7 +699,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)  			 * all of which have the same weight.  			 */  			if (sched_feat(NORMALIZED_SLEEPER) && -					task_of(se)->policy != SCHED_IDLE) +					(!entity_is_task(se) || +					 task_of(se)->policy != SCHED_IDLE))  				thresh = calc_delta_fair(thresh, se);  			vruntime -= thresh; @@ -1016,7 +1029,7 @@ static void yield_task_fair(struct rq *rq)  	/*  	 * Already in the rightmost position?  	 */ -	if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) +	if (unlikely(!rightmost || entity_before(rightmost, se)))  		return;  	/* @@ -1712,7 +1725,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)  	/* 'curr' will be NULL if the child belongs to a different group */  	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && -			curr && curr->vruntime < se->vruntime) { +			curr && entity_before(curr, se)) {  		/*  		 * Upon rescheduling, sched_class::put_prev_task() will place  		 * 'current' within the tree based on its new key value. 
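The sched_fair.c hunk above replaces open-coded vruntime comparisons with the new entity_before() helper, which orders two sched entities through a signed difference, (s64)(a->vruntime - b->vruntime) < 0, so the comparison stays correct even when the u64 vruntime counters wrap around. Below is a minimal standalone sketch of that comparison idiom (plain userspace C, not kernel code); it is the same signed-difference trick used by the kernel's time_before()/time_after() jiffies helpers.

/*
 * Wraparound-safe ordering of monotonically increasing u64 counters,
 * mirroring the entity_before() helper added above.  Valid as long as
 * the two values are less than 2^63 apart.
 */
#include <stdint.h>
#include <stdio.h>

static inline int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;	/* same idiom as entity_before() */
}

int main(void)
{
	uint64_t just_before_wrap = UINT64_MAX - 5;	/* counter about to wrap */
	uint64_t after_wrap       = 10;			/* logically later, numerically smaller */

	printf("naive  a < b : %d\n", just_before_wrap < after_wrap);		/* 0 -- wrong order */
	printf("before(a, b) : %d\n", before(just_before_wrap, after_wrap));	/* 1 -- correct order */
	return 0;
}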
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9bf0d2a7304..3918e01994e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -10,6 +10,8 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)  #ifdef CONFIG_RT_GROUP_SCHED +#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) +  static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)  {  	return rt_rq->rq; @@ -22,6 +24,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)  #else /* CONFIG_RT_GROUP_SCHED */ +#define rt_entity_is_task(rt_se) (1) +  static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)  {  	return container_of(rt_rq, struct rq, rt); @@ -73,7 +77,7 @@ static inline void rt_clear_overload(struct rq *rq)  static void update_rt_migration(struct rt_rq *rt_rq)  { -	if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { +	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {  		if (!rt_rq->overloaded) {  			rt_set_overload(rq_of_rt_rq(rt_rq));  			rt_rq->overloaded = 1; @@ -86,6 +90,12 @@ static void update_rt_migration(struct rt_rq *rt_rq)  static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  { +	if (!rt_entity_is_task(rt_se)) +		return; + +	rt_rq = &rq_of_rt_rq(rt_rq)->rt; + +	rt_rq->rt_nr_total++;  	if (rt_se->nr_cpus_allowed > 1)  		rt_rq->rt_nr_migratory++; @@ -94,6 +104,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  { +	if (!rt_entity_is_task(rt_se)) +		return; + +	rt_rq = &rq_of_rt_rq(rt_rq)->rt; + +	rt_rq->rt_nr_total--;  	if (rt_se->nr_cpus_allowed > 1)  		rt_rq->rt_nr_migratory--; diff --git a/kernel/signal.c b/kernel/signal.c index ccf1ceedaeb..64c5deeaca5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s  	stack_t oss;  	int error; -	if (uoss) { -		oss.ss_sp = (void __user *) current->sas_ss_sp; -		oss.ss_size = current->sas_ss_size; -		oss.ss_flags = sas_ss_flags(sp); -	} +	oss.ss_sp = (void __user *) current->sas_ss_sp; +	oss.ss_size = current->sas_ss_size; +	oss.ss_flags = sas_ss_flags(sp);  	if (uss) {  		void __user *ss_sp; @@ -2466,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s  		int ss_flags;  		error = -EFAULT; -		if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) -		    || __get_user(ss_sp, &uss->ss_sp) -		    || __get_user(ss_flags, &uss->ss_flags) -		    || __get_user(ss_size, &uss->ss_size)) +		if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) +			goto out; +		error = __get_user(ss_sp, &uss->ss_sp) | +			__get_user(ss_flags, &uss->ss_flags) | +			__get_user(ss_size, &uss->ss_size); +		if (error)  			goto out;  		error = -EPERM; @@ -2501,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s  		current->sas_ss_size = ss_size;  	} +	error = 0;  	if (uoss) {  		error = -EFAULT; -		if (copy_to_user(uoss, &oss, sizeof(oss))) +		if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))  			goto out; +		error = __put_user(oss.ss_sp, &uoss->ss_sp) | +			__put_user(oss.ss_size, &uoss->ss_size) | +			__put_user(oss.ss_flags, &uoss->ss_flags);  	} -	error = 0;  out:  	return error;  } diff --git a/kernel/smp.c b/kernel/smp.c index ad63d850120..94188b8ecc3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)  			return NOTIFY_BAD;  		break; 
-#ifdef CONFIG_CPU_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU  	case CPU_UP_CANCELED:  	case CPU_UP_CANCELED_FROZEN: diff --git a/kernel/softirq.c b/kernel/softirq.c index 3a94905fa5d..eb5e131a048 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))  	softirq_vec[nr].action = action;  } -/* Tasklets */ +/* + * Tasklets + */  struct tasklet_head  {  	struct tasklet_struct *head; @@ -493,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)  EXPORT_SYMBOL(tasklet_kill); +/* + * tasklet_hrtimer + */ + +/* + * The trampoline is called when the hrtimer expires. If this is + * called from the hrtimer interrupt then we schedule the tasklet as + * the timer callback function expects to run in softirq context. If + * it's called in softirq context anyway (i.e. high resolution timers + * disabled) then the hrtimer callback is called right away. + */ +static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) +{ +	struct tasklet_hrtimer *ttimer = +		container_of(timer, struct tasklet_hrtimer, timer); + +	if (hrtimer_is_hres_active(timer)) { +		tasklet_hi_schedule(&ttimer->tasklet); +		return HRTIMER_NORESTART; +	} +	return ttimer->function(timer); +} + +/* + * Helper function which calls the hrtimer callback from + * tasklet/softirq context + */ +static void __tasklet_hrtimer_trampoline(unsigned long data) +{ +	struct tasklet_hrtimer *ttimer = (void *)data; +	enum hrtimer_restart restart; + +	restart = ttimer->function(&ttimer->timer); +	if (restart != HRTIMER_NORESTART) +		hrtimer_restart(&ttimer->timer); +} + +/** + * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks + * @ttimer:	 tasklet_hrtimer which is initialized + * @function:	 hrtimer callback funtion which gets called from softirq context + * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) + * @mode:	 hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) + */ +void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, +			  enum hrtimer_restart (*function)(struct hrtimer *), +			  clockid_t which_clock, enum hrtimer_mode mode) +{ +	hrtimer_init(&ttimer->timer, which_clock, mode); +	ttimer->timer.function = __hrtimer_tasklet_trampoline; +	tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, +		     (unsigned long)ttimer); +	ttimer->function = function; +} +EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); + +/* + * Remote softirq bits + */ +  DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);  EXPORT_PER_CPU_SYMBOL(softirq_work_list); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98e02328c67..58be76017fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@  #include <linux/acpi.h>  #include <linux/reboot.h>  #include <linux/ftrace.h> +#include <linux/security.h>  #include <linux/slow-work.h>  #include <linux/perf_counter.h> @@ -1306,10 +1307,10 @@ static struct ctl_table vm_table[] = {  	{  		.ctl_name	= CTL_UNNUMBERED,  		.procname	= "mmap_min_addr", -		.data		= &mmap_min_addr, -		.maxlen         = sizeof(unsigned long), +		.data		= &dac_mmap_min_addr, +		.maxlen		= sizeof(unsigned long),  		.mode		= 0644, -		.proc_handler	= &proc_doulongvec_minmax, +		.proc_handler	= &mmap_min_addr_handler,  	},  #ifdef CONFIG_NUMA  	{ diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1ad6dd46111..a6dcd67b041 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -254,15 +254,4 @@ void clockevents_notify(unsigned long reason, void *arg)  	
spin_unlock(&clockevents_lock);  }  EXPORT_SYMBOL_GPL(clockevents_notify); - -ktime_t clockevents_get_next_event(int cpu) -{ -	struct tick_device *td; -	struct clock_event_device *dev; - -	td = &per_cpu(tick_cpu_device, cpu); -	dev = td->evtdev; - -	return dev->next_event; -}  #endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 592bf584d1d..7466cb81125 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -513,7 +513,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,  	 * Check to make sure we don't switch to a non-highres capable  	 * clocksource if the tick code is in oneshot mode (highres or nohz)  	 */ -	if (tick_oneshot_mode_active() && +	if (tick_oneshot_mode_active() && ovr &&  	    !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {  		printk(KERN_WARNING "%s clocksource is not HRT compatible. "  			"Cannot switch while in HRT/NOHZ mode\n", ovr->name); diff --git a/kernel/timer.c b/kernel/timer.c index 0b36b9e5cc8..a7f07d5a624 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -714,7 +714,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)  	 * networking code - if the timer is re-modified  	 * to be the same thing then just return:  	 */ -	if (timer->expires == expires && timer_pending(timer)) +	if (timer_pending(timer) && timer->expires == expires)  		return 1;  	return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1551f47e766..019f380fd76 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -226,13 +226,13 @@ config BOOT_TRACER  	  the timings of the initcalls and traces key events and the identity  	  of tasks that can cause boot delays, such as context-switches. -	  Its aim is to be parsed by the /scripts/bootgraph.pl tool to +	  Its aim is to be parsed by the scripts/bootgraph.pl tool to  	  produce pretty graphics about boot inefficiencies, giving a visual  	  representation of the delays during initcalls - but the raw  	  /debug/tracing/trace text output is readable too. -	  You must pass in ftrace=initcall to the kernel command line -	  to enable this on bootup. +	  You must pass in initcall_debug and ftrace=initcall to the kernel +	  command line to enable this on bootup.  config TRACE_BRANCH_PROFILING  	bool diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 39af8af6fc3..7a34cb563fe 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -22,6 +22,7 @@  #include <linux/init.h>  #include <linux/mutex.h>  #include <linux/debugfs.h> +#include <linux/smp_lock.h>  #include <linux/time.h>  #include <linux/uaccess.h> @@ -266,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt)  {  	debugfs_remove(bt->msg_file);  	debugfs_remove(bt->dropped_file); -	debugfs_remove(bt->dir);  	relay_close(bt->rchan); +	debugfs_remove(bt->dir);  	free_percpu(bt->sequence);  	free_percpu(bt->msg_data);  	kfree(bt); @@ -377,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,  static int blk_remove_buf_file_callback(struct dentry *dentry)  { -	struct dentry *parent = dentry->d_parent;  	debugfs_remove(dentry); -	/* -	* this will fail for all but the last file, but that is ok. what we -	* care about is the top level buts->name directory going away, when -	* the last trace file is gone. Then we don't have to rmdir() that -	* manually on trace stop, so it nicely solves the issue with -	* force killing of running traces. 
-	*/ - -	debugfs_remove(parent);  	return 0;  } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3716bf04df..1e1d23c2630 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -768,7 +768,7 @@ static struct tracer_stat function_stats __initdata = {  	.stat_show	= function_stat_show  }; -static void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_debugfs(struct dentry *d_tracer)  {  	struct ftrace_profile_stat *stat;  	struct dentry *entry; @@ -786,7 +786,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)  			 * The files created are permanent, if something happens  			 * we still do not free memory.  			 */ -			kfree(stat);  			WARN(1,  			     "Could not allocate stat file for cpu %d\n",  			     cpu); @@ -813,7 +812,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)  }  #else /* CONFIG_FUNCTION_PROFILER */ -static void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_debugfs(struct dentry *d_tracer)  {  }  #endif /* CONFIG_FUNCTION_PROFILER */ @@ -1663,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)  	mutex_lock(&ftrace_regex_lock);  	if ((file->f_mode & FMODE_WRITE) && -	    !(file->f_flags & O_APPEND)) +	    (file->f_flags & O_TRUNC))  		ftrace_filter_reset(enable);  	if (file->f_mode & FMODE_READ) { @@ -2578,7 +2577,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)  	mutex_lock(&graph_lock);  	if ((file->f_mode & FMODE_WRITE) && -	    !(file->f_flags & O_APPEND)) { +	    (file->f_flags & O_TRUNC)) {  		ftrace_graph_count = 0;  		memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));  	} @@ -2597,6 +2596,14 @@ ftrace_graph_open(struct inode *inode, struct file *file)  }  static int +ftrace_graph_release(struct inode *inode, struct file *file) +{ +	if (file->f_mode & FMODE_READ) +		seq_release(inode, file); +	return 0; +} + +static int  ftrace_set_func(unsigned long *array, int *idx, char *buffer)  {  	struct dyn_ftrace *rec; @@ -2725,9 +2732,10 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,  }  static const struct file_operations ftrace_graph_fops = { -	.open = ftrace_graph_open, -	.read = seq_read, -	.write = ftrace_graph_write, +	.open		= ftrace_graph_open, +	.read		= seq_read, +	.write		= ftrace_graph_write, +	.release	= ftrace_graph_release,  };  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -3160,10 +3168,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,  	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos); -	if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) +	if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))  		goto out; -	last_ftrace_enabled = ftrace_enabled; +	last_ftrace_enabled = !!ftrace_enabled;  	if (ftrace_enabled) { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7a63e..a330513d96c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer)  	put_online_cpus(); +	kfree(buffer->buffers);  	free_cpumask_var(buffer->cpumask);  	kfree(buffer); @@ -1785,7 +1786,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,  	 */  	RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); -	if (!rb_try_to_discard(cpu_buffer, event)) +	if (rb_try_to_discard(cpu_buffer, event))  		goto out;  	/* @@ -2383,7 +2384,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  		 * the box. 
Return the padding, and we will release  		 * the current locks, and try again.  		 */ -		rb_advance_reader(cpu_buffer);  		return event;  	case RINGBUF_TYPE_TIME_EXTEND: @@ -2486,7 +2486,7 @@ static inline int rb_ok_to_lock(void)  	 * buffer too. A one time deal is all you get from reading  	 * the ring buffer from an NMI.  	 */ -	if (likely(!in_nmi() && !oops_in_progress)) +	if (likely(!in_nmi()))  		return 1;  	tracing_off_permanent(); @@ -2519,6 +2519,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  	if (dolock)  		spin_lock(&cpu_buffer->reader_lock);  	event = rb_buffer_peek(buffer, cpu, ts); +	if (event && event->type_len == RINGBUF_TYPE_PADDING) +		rb_advance_reader(cpu_buffer);  	if (dolock)  		spin_unlock(&cpu_buffer->reader_lock);  	local_irq_restore(flags); @@ -2590,12 +2592,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)  		spin_lock(&cpu_buffer->reader_lock);  	event = rb_buffer_peek(buffer, cpu, ts); -	if (!event) -		goto out_unlock; - -	rb_advance_reader(cpu_buffer); +	if (event) +		rb_advance_reader(cpu_buffer); - out_unlock:  	if (dolock)  		spin_unlock(&cpu_buffer->reader_lock);  	local_irq_restore(flags); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3aa0a0dfdfa..c22b40f8f57 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -17,6 +17,7 @@  #include <linux/writeback.h>  #include <linux/kallsyms.h>  #include <linux/seq_file.h> +#include <linux/smp_lock.h>  #include <linux/notifier.h>  #include <linux/irqflags.h>  #include <linux/debugfs.h> @@ -847,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,  		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |  		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);  } +EXPORT_SYMBOL_GPL(tracing_generic_entry_update);  struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,  						    int type, @@ -2030,7 +2032,7 @@ static int tracing_open(struct inode *inode, struct file *file)  	/* If this file was open for write, then erase contents */  	if ((file->f_mode & FMODE_WRITE) && -	    !(file->f_flags & O_APPEND)) { +	    (file->f_flags & O_TRUNC)) {  		long cpu = (long) inode->i_private;  		if (cpu == TRACE_PIPE_ALL_CPU) @@ -3084,7 +3086,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)  			break;  		} -		trace_consume(iter); +		if (ret != TRACE_TYPE_NO_CONSUME) +			trace_consume(iter);  		rem -= count;  		if (!find_next_entry_inc(iter))	{  			rem = 0; @@ -4232,8 +4235,11 @@ static void __ftrace_dump(bool disable_tracing)  		iter.pos = -1;  		if (find_next_entry_inc(&iter) != NULL) { -			print_trace_line(&iter); -			trace_consume(&iter); +			int ret; + +			ret = print_trace_line(&iter); +			if (ret != TRACE_TYPE_NO_CONSUME) +				trace_consume(&iter);  		}  		trace_printk_seq(&iter.seq); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5cc78..8b9f4f6e955 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,  struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,  					  int *ent_cpu, u64 *ent_ts); -void tracing_generic_entry_update(struct trace_entry *entry, -				  unsigned long flags, -				  int pc); -  void default_wait_pipe(struct trace_iterator *iter);  void poll_wait_pipe(struct trace_iterator *iter); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 5b5895afecf..11ba5bb4ed0 100644 --- a/kernel/trace/trace_event_profile.c 
+++ b/kernel/trace/trace_event_profile.c @@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id)  	mutex_lock(&event_mutex);  	list_for_each_entry(event, &ftrace_events, list) { -		if (event->id == event_id) { +		if (event->id == event_id && event->profile_enable) {  			ret = event->profile_enable(event);  			break;  		} diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5e32e375134..6db005e1248 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,  		   ftrace_graph_ret_entry, ignore,  	TRACE_STRUCT(  		TRACE_FIELD(unsigned long, ret.func, func) +		TRACE_FIELD(unsigned long long, ret.calltime, calltime) +		TRACE_FIELD(unsigned long long, ret.rettime, rettime) +		TRACE_FIELD(unsigned long, ret.overrun, overrun)  		TRACE_FIELD(int, ret.depth, depth)  	),  	TP_RAW_FMT("<-- %lx (%d)") diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53c8fd376a8..e75276a49cf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -376,7 +376,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)  	const struct seq_operations *seq_ops;  	if ((file->f_mode & FMODE_WRITE) && -	    !(file->f_flags & O_APPEND)) +	    (file->f_flags & O_TRUNC))  		ftrace_clear_events();  	seq_ops = inode->i_private; @@ -940,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,  		entry = trace_create_file("enable", 0644, call->dir, call,  					  enable); -	if (call->id) +	if (call->id && call->profile_enable)  		entry = trace_create_file("id", 0444, call->dir, call,  					  id); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 936c621bbf4..f32dc9d1ea7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,  		return -ENOSPC;  	} -	filter->preds[filter->n_preds] = pred; -	filter->n_preds++; -  	list_for_each_entry(call, &ftrace_events, list) {  		if (!call->define_fields) @@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,  		}  		replace_filter_string(call->filter, filter_string);  	} + +	filter->preds[filter->n_preds] = pred; +	filter->n_preds++;  out:  	return err;  } @@ -1029,12 +1029,17 @@ static int replace_preds(struct event_subsystem *system,  		if (elt->op == OP_AND || elt->op == OP_OR) {  			pred = create_logical_pred(elt->op); +			if (!pred) +				return -ENOMEM;  			if (call) {  				err = filter_add_pred(ps, call, pred);  				filter_free_pred(pred); -			} else +			} else {  				err = filter_add_subsystem_pred(ps, system,  							pred, filter_string); +				if (err) +					filter_free_pred(pred); +			}  			if (err)  				return err; @@ -1048,12 +1053,17 @@ static int replace_preds(struct event_subsystem *system,  		}  		pred = create_pred(elt->op, operand1, operand2); +		if (!pred) +			return -ENOMEM;  		if (call) {  			err = filter_add_pred(ps, call, pred);  			filter_free_pred(pred); -		} else +		} else {  			err = filter_add_subsystem_pred(ps, system, pred,  							filter_string); +			if (err) +				filter_free_pred(pred); +		}  		if (err)  			return err; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 7402144bff2..75ef000613c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -363,7 +363,7 @@ ftrace_trace_onoff_callback(char *glob, 
char *cmd, char *param, int enable)   out_reg:  	ret = register_ftrace_function_probe(glob, ops, count); -	return ret; +	return ret < 0 ? ret : 0;  }  static struct ftrace_func_command ftrace_traceon_cmd = { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d2249abafb5..420ec348757 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter)  	switch (entry->type) {  	case TRACE_GRAPH_ENT: { -		struct ftrace_graph_ent_entry *field; +		/* +		 * print_graph_entry() may consume the current event, +		 * thus @field may become invalid, so we need to save it. +		 * sizeof(struct ftrace_graph_ent_entry) is very small, +		 * it can be safely saved at the stack. +		 */ +		struct ftrace_graph_ent_entry *field, saved;  		trace_assign_type(field, entry); -		return print_graph_entry(field, s, iter); +		saved = *field; +		return print_graph_entry(&saved, s, iter);  	}  	case TRACE_GRAPH_RET: {  		struct ftrace_graph_ret_entry *field; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 7938f3ae93e..e0c2545622e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s)  {  	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; -	s->buffer[len] = 0; -	seq_puts(m, s->buffer); +	seq_write(m, s->buffer, len);  	trace_seq_init(s);  } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 7b627811082..687699d365a 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -176,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)  	const char *str = *fmt;  	int i; -	seq_printf(m, "0x%lx : \"", (unsigned long)fmt); +	seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);  	/*  	 * Tabs and new lines need to be converted. 
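The trace_functions_graph.c change above copies the ftrace_graph_ent_entry onto the stack before handing it to print_graph_entry(), because printing may consume the ring-buffer event and leave @field pointing at recycled storage. Below is a standalone sketch of that copy-before-consume pattern; the single-slot ring and the record/ring_peek/ring_consume/render names are invented for the illustration and are not kernel APIs.

/*
 * Copy-before-consume: when processing a record may recycle the storage
 * it lives in, take a stack copy first and work only on the copy.
 */
#include <stdio.h>

struct record {
	unsigned long func;
	int depth;
};

static struct record slot;		/* storage that gets reused */

static struct record *ring_peek(void)
{
	return &slot;
}

static void ring_consume(void)
{
	/* Recycling the slot invalidates anything still pointing at it. */
	slot.func = 0;
	slot.depth = -1;
}

static void render(const struct record *rec)
{
	printf("func=%#lx depth=%d\n", rec->func, rec->depth);
}

int main(void)
{
	struct record saved;

	slot.func = 0xc0ffee;
	slot.depth = 2;

	saved = *ring_peek();	/* small struct: cheap to copy onto the stack */
	ring_consume();		/* the peeked slot may now hold something else */
	render(&saved);		/* safe: works on the private copy */
	return 0;
}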
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2d7aebd71db..6a2a9d484cd 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = {  static int stack_trace_open(struct inode *inode, struct file *file)  { -	int ret; - -	ret = seq_open(file, &stack_trace_seq_ops); - -	return ret; +	return seq_open(file, &stack_trace_seq_ops);  }  static const struct file_operations stack_trace_fops = {  	.open		= stack_trace_open,  	.read		= seq_read,  	.llseek		= seq_lseek, +	.release	= seq_release,  };  int @@ -326,10 +323,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,  	ret = proc_dointvec(table, write, file, buffer, lenp, ppos);  	if (ret || !write || -	    (last_stack_tracer_enabled == stack_tracer_enabled)) +	    (last_stack_tracer_enabled == !!stack_tracer_enabled))  		goto out; -	last_stack_tracer_enabled = stack_tracer_enabled; +	last_stack_tracer_enabled = !!stack_tracer_enabled;  	if (stack_tracer_enabled)  		register_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index e66f5e49334..aea321c82fa 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -73,7 +73,7 @@ static struct rb_node *release_next(struct rb_node *node)  	}  } -static void reset_stat_session(struct stat_session *session) +static void __reset_stat_session(struct stat_session *session)  {  	struct rb_node *node = session->stat_root.rb_node; @@ -83,10 +83,17 @@ static void reset_stat_session(struct stat_session *session)  	session->stat_root = RB_ROOT;  } +static void reset_stat_session(struct stat_session *session) +{ +	mutex_lock(&session->stat_mutex); +	__reset_stat_session(session); +	mutex_unlock(&session->stat_mutex); +} +  static void destroy_session(struct stat_session *session)  {  	debugfs_remove(session->file); -	reset_stat_session(session); +	__reset_stat_session(session);  	mutex_destroy(&session->stat_mutex);  	kfree(session);  } @@ -150,7 +157,7 @@ static int stat_seq_init(struct stat_session *session)  	int i;  	mutex_lock(&session->stat_mutex); -	reset_stat_session(session); +	__reset_stat_session(session);  	if (!ts->stat_cmp)  		ts->stat_cmp = dummy_cmp; @@ -183,7 +190,7 @@ exit:  	return ret;  exit_free_rbtree: -	reset_stat_session(session); +	__reset_stat_session(session);  	mutex_unlock(&session->stat_mutex);  	return ret;  } @@ -250,16 +257,21 @@ static const struct seq_operations trace_stat_seq_ops = {  static int tracing_stat_open(struct inode *inode, struct file *file)  {  	int ret; - +	struct seq_file *m;  	struct stat_session *session = inode->i_private; +	ret = stat_seq_init(session); +	if (ret) +		return ret; +  	ret = seq_open(file, &trace_stat_seq_ops); -	if (!ret) { -		struct seq_file *m = file->private_data; -		m->private = session; -		ret = stat_seq_init(session); +	if (ret) { +		reset_stat_session(session); +		return ret;  	} +	m = file->private_data; +	m->private = session;  	return ret;  } @@ -270,11 +282,9 @@ static int tracing_stat_release(struct inode *i, struct file *f)  {  	struct stat_session *session = i->i_private; -	mutex_lock(&session->stat_mutex);  	reset_stat_session(session); -	mutex_unlock(&session->stat_mutex); -	return 0; +	return seq_release(i, f);  }  static const struct file_operations tracing_stat_fops = { diff --git a/kernel/wait.c b/kernel/wait.c index ea7c3b4275c..c4bd3d825f3 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,13 +10,14 @@  #include <linux/wait.h>  
#include <linux/hash.h> -void init_waitqueue_head(wait_queue_head_t *q) +void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)  {  	spin_lock_init(&q->lock); +	lockdep_set_class(&q->lock, key);  	INIT_LIST_HEAD(&q->task_list);  } -EXPORT_SYMBOL(init_waitqueue_head); +EXPORT_SYMBOL(__init_waitqueue_head);  void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)  {
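The kernel/wait.c hunk above turns init_waitqueue_head() into __init_waitqueue_head(q, key) and attaches the key with lockdep_set_class(), so waitqueues initialized at different call sites can land in different lockdep classes. The header side of that change is not part of this hunk; the sketch below shows the conventional lockdep idiom such a change is normally paired with, a wrapper macro that declares one static key per call site. It is an assumption based on that idiom, not a quote of the actual include/linux/wait.h diff.

/*
 * Assumed header-side counterpart (kernel context; not shown in this
 * hunk): each expansion of the macro defines its own static
 * lock_class_key, so every init site gets its own lockdep class.
 */
#define init_waitqueue_head(q)				\
	do {						\
		static struct lock_class_key __key;	\
							\
		__init_waitqueue_head((q), &__key);	\
	} while (0)

Callers are unaffected by this split: init_waitqueue_head(&wq) keeps the same call syntax, and each textual use simply contributes its own key.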