Diffstat (limited to 'kernel'): 98 files changed, 11899 insertions, 3514 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 42423665660..90b53f6dc22 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -93,8 +93,10 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o  obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o  obj-$(CONFIG_FUNCTION_TRACER) += trace/  obj-$(CONFIG_TRACING) += trace/ +obj-$(CONFIG_X86_DS) += trace/  obj-$(CONFIG_SMP) += sched_cpupri.o  obj-$(CONFIG_SLOW_WORK) += slow-work.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o  ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/async.c b/kernel/async.c index 968ef9457d4..27235f5de19 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -92,19 +92,18 @@ extern int initcall_debug;  static async_cookie_t  __lowest_in_progress(struct list_head *running)  {  	struct async_entry *entry; +  	if (!list_empty(running)) {  		entry = list_first_entry(running,  			struct async_entry, list);  		return entry->cookie; -	} else if (!list_empty(&async_pending)) { -		entry = list_first_entry(&async_pending, -			struct async_entry, list); -		return entry->cookie; -	} else { -		/* nothing in progress... next_cookie is "infinity" */ -		return next_cookie;  	} +	list_for_each_entry(entry, &async_pending, list) +		if (entry->running == running) +			return entry->cookie; + +	return next_cookie;	/* "infinity" value */  }  static async_cookie_t  lowest_in_progress(struct list_head *running) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 6e7351739a8..1f6396d7668 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -568,7 +568,7 @@ void audit_trim_trees(void)  		if (err)  			goto skip_it; -		root_mnt = collect_mounts(path.mnt, path.dentry); +		root_mnt = collect_mounts(&path);  		path_put(&path);  		if (!root_mnt)  			goto skip_it; @@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule)  	err = kern_path(tree->pathname, 0, &path);  	if (err)  		goto Err; -	mnt = collect_mounts(path.mnt, path.dentry); +	mnt = collect_mounts(&path);  	path_put(&path);  	if (!mnt) {  		err = -ENOMEM; @@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new)  	err = kern_path(new, 0, &path);  	if (err)  		return err; -	tagged = collect_mounts(path.mnt, path.dentry); +	tagged = collect_mounts(&path);  	path_put(&path);  	if (!tagged)  		return -ENOMEM; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7267bfd376..3fb789f6df9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -46,6 +46,7 @@  #include <linux/cgroupstats.h>  #include <linux/hash.h>  #include <linux/namei.h> +#include <linux/smp_lock.h>  #include <asm/atomic.h> @@ -900,6 +901,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  	struct cgroup *cgrp = &root->top_cgroup;  	struct cgroup_sb_opts opts; +	lock_kernel();  	mutex_lock(&cgrp->dentry->d_inode->i_mutex);  	mutex_lock(&cgroup_mutex); @@ -927,6 +929,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  	kfree(opts.release_agent);  	mutex_unlock(&cgroup_mutex);  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex); +	unlock_kernel();  	return ret;  } diff --git a/kernel/compat.c b/kernel/compat.c index 42d56544460..f6c204f07ea 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,  } +asmlinkage long +compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, +			     struct compat_siginfo __user *uinfo) +{ +	siginfo_t info; + +	if 
(copy_siginfo_from_user32(&info, uinfo)) +		return -EFAULT; +	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} +  #ifdef __ARCH_WANT_COMPAT_SYS_TIME  /* compat_time_t is a 32 bit "long" and needs to get converted. */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 026faccca86..d5a7e17474e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = {  int __init cpuset_init_early(void)  { -	alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); +	alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);  	top_cpuset.mems_generation = cpuset_mems_generation++;  	return 0; diff --git a/kernel/cred.c b/kernel/cred.c index 3a039189d70..1bb4d7e5d61 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds);  /*   * Prepare credentials for current to perform an execve() - * - The caller must hold current->cred_exec_mutex + * - The caller must hold current->cred_guard_mutex   */  struct cred *prepare_exec_creds(void)  { @@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)  	struct cred *new;  	int ret; -	mutex_init(&p->cred_exec_mutex); +	mutex_init(&p->cred_guard_mutex);  	if (  #ifdef CONFIG_KEYS diff --git a/kernel/exit.c b/kernel/exit.c index abf9cf3b95c..b6c90b5ef50 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,7 +48,8 @@  #include <linux/tracehook.h>  #include <linux/fs_struct.h>  #include <linux/init_task.h> -#include <trace/sched.h> +#include <linux/perf_counter.h> +#include <trace/events/sched.h>  #include <asm/uaccess.h>  #include <asm/unistd.h> @@ -56,10 +57,6 @@  #include <asm/mmu_context.h>  #include "cred-internals.h" -DEFINE_TRACE(sched_process_free); -DEFINE_TRACE(sched_process_exit); -DEFINE_TRACE(sched_process_wait); -  static void exit_mm(struct task_struct * tsk);  static void __unhash_process(struct task_struct *p) @@ -158,6 +155,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)  {  	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); +#ifdef CONFIG_PERF_COUNTERS +	WARN_ON_ONCE(tsk->perf_counter_ctxp); +#endif  	trace_sched_process_free(tsk);  	put_task_struct(tsk);  } @@ -174,6 +174,7 @@ repeat:  	atomic_dec(&__task_cred(p)->user->processes);  	proc_flush_task(p); +  	write_lock_irq(&tasklist_lock);  	tracehook_finish_release_task(p);  	__exit_signal(p); @@ -975,16 +976,19 @@ NORET_TYPE void do_exit(long code)  		module_put(tsk->binfmt->module);  	proc_exit_connector(tsk); + +	/* +	 * Flush inherited counters to the parent - before the parent +	 * gets woken up by child-exit notifications. 
+	 */ +	perf_counter_exit_task(tsk); +  	exit_notify(tsk, group_dead);  #ifdef CONFIG_NUMA  	mpol_put(tsk->mempolicy);  	tsk->mempolicy = NULL;  #endif  #ifdef CONFIG_FUTEX -	/* -	 * This must happen late, after the PID is not -	 * hashed anymore: -	 */  	if (unlikely(!list_empty(&tsk->pi_state_list)))  		exit_pi_state_list(tsk);  	if (unlikely(current->pi_state_cache)) @@ -1476,6 +1480,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,  		 */  		if (*notask_error)  			*notask_error = ret; +		return 0;  	}  	if (likely(!ptrace) && unlikely(p->ptrace)) { diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd0072..4430eb1376f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,8 +61,8 @@  #include <linux/proc_fs.h>  #include <linux/blkdev.h>  #include <linux/fs_struct.h> -#include <trace/sched.h>  #include <linux/magic.h> +#include <linux/perf_counter.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h> @@ -71,6 +71,8 @@  #include <asm/cacheflush.h>  #include <asm/tlbflush.h> +#include <trace/events/sched.h> +  /*   * Protected counters by write_lock_irq(&tasklist_lock)   */ @@ -83,8 +85,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;  __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */ -DEFINE_TRACE(sched_process_fork); -  int nr_processes(void)  {  	int cpu; @@ -982,6 +982,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	if (!p)  		goto fork_out; +	ftrace_graph_init_task(p); +  	rt_mutex_init_task(p);  #ifdef CONFIG_PROVE_LOCKING @@ -1089,12 +1091,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,  #ifdef CONFIG_DEBUG_MUTEXES  	p->blocked_on = NULL; /* not blocked yet */  #endif -	if (unlikely(current->ptrace)) -		ptrace_fork(p, clone_flags); + +	p->bts = NULL;  	/* Perform scheduler related setup. Assign this task to a CPU. */  	sched_fork(p, clone_flags); +	retval = perf_counter_init_task(p); +	if (retval) +		goto bad_fork_cleanup_policy; +  	if ((retval = audit_alloc(p)))  		goto bad_fork_cleanup_policy;  	/* copy all the process information */ @@ -1131,8 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,  		}  	} -	ftrace_graph_init_task(p); -  	p->pid = pid_nr(pid);  	p->tgid = p->pid;  	if (clone_flags & CLONE_THREAD) @@ -1141,7 +1145,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	if (current->nsproxy != p->nsproxy) {  		retval = ns_cgroup_clone(p, pid);  		if (retval) -			goto bad_fork_free_graph; +			goto bad_fork_free_pid;  	}  	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? 
child_tidptr : NULL; @@ -1233,7 +1237,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  		spin_unlock(¤t->sighand->siglock);  		write_unlock_irq(&tasklist_lock);  		retval = -ERESTARTNOINTR; -		goto bad_fork_free_graph; +		goto bad_fork_free_pid;  	}  	if (clone_flags & CLONE_THREAD) { @@ -1268,8 +1272,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	cgroup_post_fork(p);  	return p; -bad_fork_free_graph: -	ftrace_graph_exit_task(p);  bad_fork_free_pid:  	if (pid != &init_struct_pid)  		free_pid(pid); @@ -1293,6 +1295,7 @@ bad_fork_cleanup_semundo:  bad_fork_cleanup_audit:  	audit_free(p);  bad_fork_cleanup_policy: +	perf_counter_free_task(p);  #ifdef CONFIG_NUMA  	mpol_put(p->mempolicy);  bad_fork_cleanup_cgroup: @@ -1406,10 +1409,16 @@ long do_fork(unsigned long clone_flags,  		if (clone_flags & CLONE_VFORK) {  			p->vfork_done = &vfork;  			init_completion(&vfork); +		} else if (!(clone_flags & CLONE_VM)) { +			/* +			 * vfork will do an exec which will call +			 * set_task_comm() +			 */ +			perf_counter_fork(p);  		}  		audit_finish_fork(p); -		tracehook_report_clone(trace, regs, clone_flags, nr, p); +		tracehook_report_clone(regs, clone_flags, nr, p);  		/*  		 * We set PF_STARTING at creation in case tracing wants to diff --git a/kernel/futex.c b/kernel/futex.c index eef8cd26b5e..80b5ce71659 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -19,6 +19,10 @@   *  PRIVATE futexes by Eric Dumazet   *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>   * + *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> + *  Copyright (C) IBM Corporation, 2009 + *  Thanks to Thomas Gleixner for conceptual design and careful reviews. + *   *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly   *  enough at me, Linus for the original (flawed) idea, Matthew   *  Kirkwood for proof-of-concept implementation. @@ -96,8 +100,8 @@ struct futex_pi_state {   */  struct futex_q {  	struct plist_node list; -	/* There can only be a single waiter */ -	wait_queue_head_t waiter; +	/* Waiter reference */ +	struct task_struct *task;  	/* Which hash list lock to use: */  	spinlock_t *lock_ptr; @@ -107,7 +111,9 @@ struct futex_q {  	/* Optional priority inheritance state: */  	struct futex_pi_state *pi_state; -	struct task_struct *task; + +	/* rt_waiter storage for requeue_pi: */ +	struct rt_mutex_waiter *rt_waiter;  	/* Bitset for the optional bitmasked wakeup */  	u32 bitset; @@ -193,6 +199,7 @@ static void drop_futex_key_refs(union futex_key *key)   * @uaddr: virtual address of the futex   * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED   * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)   *   * Returns a negative error code or 0   * The key words are stored in *key on success. @@ -203,7 +210,8 @@ static void drop_futex_key_refs(union futex_key *key)   *   * lock_page() might sleep, the caller should not hold a spinlock.   
*/ -static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) +static int +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)  {  	unsigned long address = (unsigned long)uaddr;  	struct mm_struct *mm = current->mm; @@ -226,7 +234,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)  	 *        but access_ok() should be faster than find_vma()  	 */  	if (!fshared) { -		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) +		if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))  			return -EFAULT;  		key->private.mm = mm;  		key->private.address = address; @@ -235,7 +243,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)  	}  again: -	err = get_user_pages_fast(address, 1, 0, &page); +	err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);  	if (err < 0)  		return err; @@ -276,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)  	drop_futex_key_refs(key);  } +/** + * futex_top_waiter() - Return the highest priority waiter on a futex + * @hb:     the hash bucket the futex_q's reside in + * @key:    the futex key (to distinguish it from other futex futex_q's) + * + * Must be called with the hb lock held. + */ +static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, +					union futex_key *key) +{ +	struct futex_q *this; + +	plist_for_each_entry(this, &hb->chain, list) { +		if (match_futex(&this->key, key)) +			return this; +	} +	return NULL; +} +  static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)  {  	u32 curval; @@ -537,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  	return 0;  } +/** + * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex + * @uaddr:		the pi futex user address + * @hb:			the pi futex hash bucket + * @key:		the futex key associated with uaddr and hb + * @ps:			the pi_state pointer where we store the result of the + *			lookup + * @task:		the task to perform the atomic lock work for.  This will + *			be "current" except in the case of requeue pi. + * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Returns: + *  0 - ready to wait + *  1 - acquired the lock + * <0 - error + * + * The hb->lock and futex_key refs shall be held by the caller. + */ +static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, +				union futex_key *key, +				struct futex_pi_state **ps, +				struct task_struct *task, int set_waiters) +{ +	int lock_taken, ret, ownerdied = 0; +	u32 uval, newval, curval; + +retry: +	ret = lock_taken = 0; + +	/* +	 * To avoid races, we attempt to take the lock here again +	 * (by doing a 0 -> TID atomic cmpxchg), while holding all +	 * the locks. It will most likely not succeed. +	 */ +	newval = task_pid_vnr(task); +	if (set_waiters) +		newval |= FUTEX_WAITERS; + +	curval = cmpxchg_futex_value_locked(uaddr, 0, newval); + +	if (unlikely(curval == -EFAULT)) +		return -EFAULT; + +	/* +	 * Detect deadlocks. +	 */ +	if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) +		return -EDEADLK; + +	/* +	 * Surprise - we got the lock. Just return to userspace: +	 */ +	if (unlikely(!curval)) +		return 1; + +	uval = curval; + +	/* +	 * Set the FUTEX_WAITERS flag, so the owner will know it has someone +	 * to wake at the next unlock. +	 */ +	newval = curval | FUTEX_WAITERS; + +	/* +	 * There are two cases, where a futex might have no owner (the +	 * owner TID is 0): OWNER_DIED. 
We take over the futex in this +	 * case. We also do an unconditional take over, when the owner +	 * of the futex died. +	 * +	 * This is safe as we are protected by the hash bucket lock ! +	 */ +	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { +		/* Keep the OWNER_DIED bit */ +		newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); +		ownerdied = 0; +		lock_taken = 1; +	} + +	curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + +	if (unlikely(curval == -EFAULT)) +		return -EFAULT; +	if (unlikely(curval != uval)) +		goto retry; + +	/* +	 * We took the lock due to owner died take over. +	 */ +	if (unlikely(lock_taken)) +		return 1; + +	/* +	 * We dont have the lock. Look up the PI state (or create it if +	 * we are the first waiter): +	 */ +	ret = lookup_pi_state(uval, hb, key, ps); + +	if (unlikely(ret)) { +		switch (ret) { +		case -ESRCH: +			/* +			 * No owner found for this futex. Check if the +			 * OWNER_DIED bit is set to figure out whether +			 * this is a robust futex or not. +			 */ +			if (get_futex_value_locked(&curval, uaddr)) +				return -EFAULT; + +			/* +			 * We simply start over in case of a robust +			 * futex. The code above will take the futex +			 * and return happy. +			 */ +			if (curval & FUTEX_OWNER_DIED) { +				ownerdied = 1; +				goto retry; +			} +		default: +			break; +		} +	} + +	return ret; +} +  /*   * The hash bucket lock must be held when this is called.   * Afterwards, the futex_q must not be accessed.   */  static void wake_futex(struct futex_q *q)  { -	plist_del(&q->list, &q->list.plist); +	struct task_struct *p = q->task; +  	/* -	 * The lock in wake_up_all() is a crucial memory barrier after the -	 * plist_del() and also before assigning to q->lock_ptr. +	 * We set q->lock_ptr = NULL _before_ we wake up the task. If +	 * a non futex wake up happens on another CPU then the task +	 * might exit and p would dereference a non existing task +	 * struct. Prevent this by holding a reference on p across the +	 * wake up.  	 */ -	wake_up(&q->waiter); +	get_task_struct(p); + +	plist_del(&q->list, &q->list.plist);  	/* -	 * The waiting task can free the futex_q as soon as this is written, -	 * without taking any locks.  This must come last. -	 * -	 * A memory barrier is required here to prevent the following store to -	 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the -	 * end of wake_up() does not prevent this store from moving. +	 * The waiting task can free the futex_q as soon as +	 * q->lock_ptr = NULL is written, without taking any locks. A +	 * memory barrier is required here to prevent the following +	 * store to lock_ptr from getting ahead of the plist_del.  	 
*/  	smp_wmb();  	q->lock_ptr = NULL; + +	wake_up_state(p, TASK_NORMAL); +	put_task_struct(p);  }  static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) @@ -677,7 +836,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)  	if (!bitset)  		return -EINVAL; -	ret = get_futex_key(uaddr, fshared, &key); +	ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);  	if (unlikely(ret != 0))  		goto out; @@ -687,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)  	plist_for_each_entry_safe(this, next, head, list) {  		if (match_futex (&this->key, &key)) { -			if (this->pi_state) { +			if (this->pi_state || this->rt_waiter) {  				ret = -EINVAL;  				break;  			} @@ -723,10 +882,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,  	int ret, op_ret;  retry: -	ret = get_futex_key(uaddr1, fshared, &key1); +	ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);  	if (unlikely(ret != 0))  		goto out; -	ret = get_futex_key(uaddr2, fshared, &key2); +	ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);  	if (unlikely(ret != 0))  		goto out_put_key1; @@ -800,24 +959,185 @@ out:  	return ret;  } -/* - * Requeue all waiters hashed on one physical page to another - * physical page. +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q:		the futex_q to requeue + * @hb1:	the source hash_bucket + * @hb2:	the target hash_bucket + * @key2:	the new key for the requeued futex_q + */ +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, +		   struct futex_hash_bucket *hb2, union futex_key *key2) +{ + +	/* +	 * If key1 and key2 hash to the same bucket, no need to +	 * requeue. +	 */ +	if (likely(&hb1->chain != &hb2->chain)) { +		plist_del(&q->list, &hb1->chain); +		plist_add(&q->list, &hb2->chain); +		q->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST +		q->list.plist.lock = &hb2->lock; +#endif +	} +	get_futex_key_refs(key2); +	q->key = *key2; +} + +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * q:	the futex_q + * key:	the key of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal.  Set the futex_q key + * to the requeue target futex so the waiter can detect the wakeup on the right + * futex, but remove it from the hb and NULL the rt_waiter so it can detect + * atomic lock acquisition.  Must be called with the q->lock_ptr held. + */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +{ +	drop_futex_key_refs(&q->key); +	get_futex_key_refs(key); +	q->key = *key; + +	WARN_ON(plist_node_empty(&q->list)); +	plist_del(&q->list, &q->list.plist); + +	WARN_ON(!q->rt_waiter); +	q->rt_waiter = NULL; + +	wake_up_state(q->task, TASK_NORMAL); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex:		the user address of the to futex + * @hb1:		the from futex hash bucket, must be locked by the caller + * @hb2:		the to futex hash bucket, must be locked by the caller + * @key1:		the from futex key + * @key2:		the to futex key + * @ps:			address to store the pi_state pointer + * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed.  
If the caller specified set_waiters, + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. + * + * Returns: + *  0 - failed to acquire the lock atomicly + *  1 - acquired the lock + * <0 - error + */ +static int futex_proxy_trylock_atomic(u32 __user *pifutex, +				 struct futex_hash_bucket *hb1, +				 struct futex_hash_bucket *hb2, +				 union futex_key *key1, union futex_key *key2, +				 struct futex_pi_state **ps, int set_waiters) +{ +	struct futex_q *top_waiter = NULL; +	u32 curval; +	int ret; + +	if (get_futex_value_locked(&curval, pifutex)) +		return -EFAULT; + +	/* +	 * Find the top_waiter and determine if there are additional waiters. +	 * If the caller intends to requeue more than 1 waiter to pifutex, +	 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, +	 * as we have means to handle the possible fault.  If not, don't set +	 * the bit unecessarily as it will force the subsequent unlock to enter +	 * the kernel. +	 */ +	top_waiter = futex_top_waiter(hb1, key1); + +	/* There are no waiters, nothing for us to do. */ +	if (!top_waiter) +		return 0; + +	/* +	 * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in +	 * the contended case or if set_waiters is 1.  The pi_state is returned +	 * in ps in contended cases. +	 */ +	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, +				   set_waiters); +	if (ret == 1) +		requeue_pi_wake_futex(top_waiter, key2); + +	return ret; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * uaddr1:	source futex user address + * uaddr2:	target futex user address + * nr_wake:	number of waiters to wake (must be 1 for requeue_pi) + * nr_requeue:	number of waiters to requeue (0-INT_MAX) + * requeue_pi:	if we are attempting to requeue from a non-pi futex to a + * 		pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. + * + * Returns: + * >=0 - on success, the number of tasks requeued or woken + *  <0 - on error   */  static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, -			 int nr_wake, int nr_requeue, u32 *cmpval) +			 int nr_wake, int nr_requeue, u32 *cmpval, +			 int requeue_pi)  {  	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; +	int drop_count = 0, task_count = 0, ret; +	struct futex_pi_state *pi_state = NULL;  	struct futex_hash_bucket *hb1, *hb2;  	struct plist_head *head1;  	struct futex_q *this, *next; -	int ret, drop_count = 0; +	u32 curval2; + +	if (requeue_pi) { +		/* +		 * requeue_pi requires a pi_state, try to allocate it now +		 * without any locks in case it fails. +		 */ +		if (refill_pi_state_cache()) +			return -ENOMEM; +		/* +		 * requeue_pi must wake as many tasks as it can, up to nr_wake +		 * + nr_requeue, since it acquires the rt_mutex prior to +		 * returning to userspace, so as to not leave the rt_mutex with +		 * waiters and no owner.  However, second and third wake-ups +		 * cannot be predicted as they involve race conditions with the +		 * first wake and a fault while looking up the pi_state.  Both +		 * pthread_cond_signal() and pthread_cond_broadcast() should +		 * use nr_wake=1. +		 */ +		if (nr_wake != 1) +			return -EINVAL; +	}  retry: -	ret = get_futex_key(uaddr1, fshared, &key1); +	if (pi_state != NULL) { +		/* +		 * We will have to lookup the pi_state again, so free this one +		 * to keep the accounting correct. 
+		 */ +		free_pi_state(pi_state); +		pi_state = NULL; +	} + +	ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);  	if (unlikely(ret != 0))  		goto out; -	ret = get_futex_key(uaddr2, fshared, &key2); +	ret = get_futex_key(uaddr2, fshared, &key2, +			    requeue_pi ? VERIFY_WRITE : VERIFY_READ);  	if (unlikely(ret != 0))  		goto out_put_key1; @@ -852,32 +1172,99 @@ retry_private:  		}  	} +	if (requeue_pi && (task_count - nr_wake < nr_requeue)) { +		/* +		 * Attempt to acquire uaddr2 and wake the top waiter. If we +		 * intend to requeue waiters, force setting the FUTEX_WAITERS +		 * bit.  We force this here where we are able to easily handle +		 * faults rather in the requeue loop below. +		 */ +		ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, +						 &key2, &pi_state, nr_requeue); + +		/* +		 * At this point the top_waiter has either taken uaddr2 or is +		 * waiting on it.  If the former, then the pi_state will not +		 * exist yet, look it up one more time to ensure we have a +		 * reference to it. +		 */ +		if (ret == 1) { +			WARN_ON(pi_state); +			task_count++; +			ret = get_futex_value_locked(&curval2, uaddr2); +			if (!ret) +				ret = lookup_pi_state(curval2, hb2, &key2, +						      &pi_state); +		} + +		switch (ret) { +		case 0: +			break; +		case -EFAULT: +			double_unlock_hb(hb1, hb2); +			put_futex_key(fshared, &key2); +			put_futex_key(fshared, &key1); +			ret = get_user(curval2, uaddr2); +			if (!ret) +				goto retry; +			goto out; +		case -EAGAIN: +			/* The owner was exiting, try again. */ +			double_unlock_hb(hb1, hb2); +			put_futex_key(fshared, &key2); +			put_futex_key(fshared, &key1); +			cond_resched(); +			goto retry; +		default: +			goto out_unlock; +		} +	} +  	head1 = &hb1->chain;  	plist_for_each_entry_safe(this, next, head1, list) { -		if (!match_futex (&this->key, &key1)) +		if (task_count - nr_wake >= nr_requeue) +			break; + +		if (!match_futex(&this->key, &key1))  			continue; -		if (++ret <= nr_wake) { + +		WARN_ON(!requeue_pi && this->rt_waiter); +		WARN_ON(requeue_pi && !this->rt_waiter); + +		/* +		 * Wake nr_wake waiters.  For requeue_pi, if we acquired the +		 * lock, we already woke the top_waiter.  If not, it will be +		 * woken by futex_unlock_pi(). +		 */ +		if (++task_count <= nr_wake && !requeue_pi) {  			wake_futex(this); -		} else { -			/* -			 * If key1 and key2 hash to the same bucket, no need to -			 * requeue. -			 */ -			if (likely(head1 != &hb2->chain)) { -				plist_del(&this->list, &hb1->chain); -				plist_add(&this->list, &hb2->chain); -				this->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST -				this->list.plist.lock = &hb2->lock; -#endif -			} -			this->key = key2; -			get_futex_key_refs(&key2); -			drop_count++; +			continue; +		} -			if (ret - nr_wake >= nr_requeue) -				break; +		/* +		 * Requeue nr_requeue waiters and possibly one more in the case +		 * of requeue_pi if we couldn't acquire the lock atomically. +		 */ +		if (requeue_pi) { +			/* Prepare the waiter to take the rt_mutex. */ +			atomic_inc(&pi_state->refcount); +			this->pi_state = pi_state; +			ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, +							this->rt_waiter, +							this->task, 1); +			if (ret == 1) { +				/* We got the lock. 
*/ +				requeue_pi_wake_futex(this, &key2); +				continue; +			} else if (ret) { +				/* -EDEADLK */ +				this->pi_state = NULL; +				free_pi_state(pi_state); +				goto out_unlock; +			}  		} +		requeue_futex(this, hb1, hb2, &key2); +		drop_count++;  	}  out_unlock: @@ -897,7 +1284,9 @@ out_put_keys:  out_put_key1:  	put_futex_key(fshared, &key1);  out: -	return ret; +	if (pi_state != NULL) +		free_pi_state(pi_state); +	return ret ? ret : task_count;  }  /* The key must be already stored in q->key. */ @@ -905,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)  {  	struct futex_hash_bucket *hb; -	init_waitqueue_head(&q->waiter); -  	get_futex_key_refs(&q->key);  	hb = hash_futex(&q->key);  	q->lock_ptr = &hb->lock; @@ -1117,35 +1504,149 @@ handle_fault:   */  #define FLAGS_SHARED		0x01  #define FLAGS_CLOCKRT		0x02 +#define FLAGS_HAS_TIMEOUT	0x04  static long futex_wait_restart(struct restart_block *restart); -static int futex_wait(u32 __user *uaddr, int fshared, -		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +/** + * fixup_owner() - Post lock pi_state and corner case management + * @uaddr:	user address of the futex + * @fshared:	whether the futex is shared (1) or not (0) + * @q:		futex_q (contains pi_state and access to the rt_mutex) + * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0) + * + * After attempting to lock an rt_mutex, this function is called to cleanup + * the pi_state owner as well as handle race conditions that may allow us to + * acquire the lock. Must be called with the hb lock held. + * + * Returns: + *  1 - success, lock taken + *  0 - success, lock not taken + * <0 - on error (-EFAULT) + */ +static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, +		       int locked)  { -	struct task_struct *curr = current; -	struct restart_block *restart; -	DECLARE_WAITQUEUE(wait, curr); -	struct futex_hash_bucket *hb; -	struct futex_q q; -	u32 uval; -	int ret; -	struct hrtimer_sleeper t; -	int rem = 0; +	struct task_struct *owner; +	int ret = 0; -	if (!bitset) -		return -EINVAL; +	if (locked) { +		/* +		 * Got the lock. We might not be the anticipated owner if we +		 * did a lock-steal - fix up the PI-state in that case: +		 */ +		if (q->pi_state->owner != current) +			ret = fixup_pi_state_owner(uaddr, q, current, fshared); +		goto out; +	} -	q.pi_state = NULL; -	q.bitset = bitset; -retry: -	q.key = FUTEX_KEY_INIT; -	ret = get_futex_key(uaddr, fshared, &q.key); -	if (unlikely(ret != 0)) +	/* +	 * Catch the rare case, where the lock was released when we were on the +	 * way back before we locked the hash bucket. +	 */ +	if (q->pi_state->owner == current) { +		/* +		 * Try to get the rt_mutex now. This might fail as some other +		 * task acquired the rt_mutex after we removed ourself from the +		 * rt_mutex waiters list. +		 */ +		if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { +			locked = 1; +			goto out; +		} + +		/* +		 * pi_state is incorrect, some other task did a lock steal and +		 * we returned due to timeout or signal without taking the +		 * rt_mutex. Too late. We can access the rt_mutex_owner without +		 * locking, as the other task is now blocked on the hash bucket +		 * lock. Fix the state up. +		 */ +		owner = rt_mutex_owner(&q->pi_state->pi_mutex); +		ret = fixup_pi_state_owner(uaddr, q, owner, fshared);  		goto out; +	} -retry_private: -	hb = queue_lock(&q); +	/* +	 * Paranoia check. 
If we did not take the lock, then we should not be +	 * the owner, nor the pending owner, of the rt_mutex. +	 */ +	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) +		printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " +				"pi-state %p\n", ret, +				q->pi_state->pi_mutex.owner, +				q->pi_state->owner); + +out: +	return ret ? ret : locked; +} + +/** + * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal + * @hb:		the futex hash bucket, must be locked by the caller + * @q:		the futex_q to queue up on + * @timeout:	the prepared hrtimer_sleeper, or null for no timeout + */ +static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, +				struct hrtimer_sleeper *timeout) +{ +	queue_me(q, hb); + +	/* +	 * There might have been scheduling since the queue_me(), as we +	 * cannot hold a spinlock across the get_user() in case it +	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when +	 * queueing ourselves into the futex hash. This code thus has to +	 * rely on the futex_wake() code removing us from hash when it +	 * wakes us up. +	 */ +	set_current_state(TASK_INTERRUPTIBLE); + +	/* Arm the timer */ +	if (timeout) { +		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); +		if (!hrtimer_active(&timeout->timer)) +			timeout->task = NULL; +	} + +	/* +	 * !plist_node_empty() is safe here without any lock. +	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. +	 */ +	if (likely(!plist_node_empty(&q->list))) { +		/* +		 * If the timer has already expired, current will already be +		 * flagged for rescheduling. Only call schedule if there +		 * is no timeout, or if it has yet to expire. +		 */ +		if (!timeout || timeout->task) +			schedule(); +	} +	__set_current_state(TASK_RUNNING); +} + +/** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr:	the futex userspace address + * @val:	the expected value + * @fshared:	whether the futex is shared (1) or not (0) + * @q:		the associated futex_q + * @hb:		storage for hash_bucket pointer to be returned to caller + * + * Setup the futex_q and locate the hash_bucket.  Get the futex value and + * compare it with the expected value.  Handle atomic faults internally. + * Return with the hb lock held and a q.key reference on success, and unlocked + * with no q.key reference on failure. + * + * Returns: + *  0 - uaddr contains val and hb has been locked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked + */ +static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, +			   struct futex_q *q, struct futex_hash_bucket **hb) +{ +	u32 uval; +	int ret;  	/*  	 * Access the page AFTER the hash-bucket is locked. @@ -1163,95 +1664,83 @@ retry_private:  	 * A consequence is that futex_wait() can return zero and absorb  	 * a wakeup when *uaddr != val on entry to the syscall.  This is  	 * rare, but normal. -	 * -	 * For shared futexes, we hold the mmap semaphore, so the mapping -	 * cannot have changed since we looked it up in get_futex_key.  	 
*/ +retry: +	q->key = FUTEX_KEY_INIT; +	ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); +	if (unlikely(ret != 0)) +		return ret; + +retry_private: +	*hb = queue_lock(q); +  	ret = get_futex_value_locked(&uval, uaddr); -	if (unlikely(ret)) { -		queue_unlock(&q, hb); +	if (ret) { +		queue_unlock(q, *hb);  		ret = get_user(uval, uaddr);  		if (ret) -			goto out_put_key; +			goto out;  		if (!fshared)  			goto retry_private; -		put_futex_key(fshared, &q.key); +		put_futex_key(fshared, &q->key);  		goto retry;  	} -	ret = -EWOULDBLOCK; -	if (unlikely(uval != val)) { -		queue_unlock(&q, hb); -		goto out_put_key; -	} - -	/* Only actually queue if *uaddr contained val.  */ -	queue_me(&q, hb); -	/* -	 * There might have been scheduling since the queue_me(), as we -	 * cannot hold a spinlock across the get_user() in case it -	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when -	 * queueing ourselves into the futex hash.  This code thus has to -	 * rely on the futex_wake() code removing us from hash when it -	 * wakes us up. -	 */ +	if (uval != val) { +		queue_unlock(q, *hb); +		ret = -EWOULDBLOCK; +	} -	/* add_wait_queue is the barrier after __set_current_state. */ -	__set_current_state(TASK_INTERRUPTIBLE); -	add_wait_queue(&q.waiter, &wait); -	/* -	 * !plist_node_empty() is safe here without any lock. -	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. -	 */ -	if (likely(!plist_node_empty(&q.list))) { -		if (!abs_time) -			schedule(); -		else { -			hrtimer_init_on_stack(&t.timer, -					      clockrt ? CLOCK_REALTIME : -					      CLOCK_MONOTONIC, -					      HRTIMER_MODE_ABS); -			hrtimer_init_sleeper(&t, current); -			hrtimer_set_expires_range_ns(&t.timer, *abs_time, -						     current->timer_slack_ns); +out: +	if (ret) +		put_futex_key(fshared, &q->key); +	return ret; +} -			hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); -			if (!hrtimer_active(&t.timer)) -				t.task = NULL; +static int futex_wait(u32 __user *uaddr, int fshared, +		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +{ +	struct hrtimer_sleeper timeout, *to = NULL; +	struct restart_block *restart; +	struct futex_hash_bucket *hb; +	struct futex_q q; +	int ret; -			/* -			 * the timer could have already expired, in which -			 * case current would be flagged for rescheduling. -			 * Don't bother calling schedule. -			 */ -			if (likely(t.task)) -				schedule(); +	if (!bitset) +		return -EINVAL; -			hrtimer_cancel(&t.timer); +	q.pi_state = NULL; +	q.bitset = bitset; +	q.rt_waiter = NULL; -			/* Flag if a timeout occured */ -			rem = (t.task == NULL); +	if (abs_time) { +		to = &timeout; -			destroy_hrtimer_on_stack(&t.timer); -		} +		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : +				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS); +		hrtimer_init_sleeper(to, current); +		hrtimer_set_expires_range_ns(&to->timer, *abs_time, +					     current->timer_slack_ns);  	} -	__set_current_state(TASK_RUNNING); -	/* -	 * NOTE: we don't remove ourselves from the waitqueue because -	 * we are the only user of it. -	 */ +	/* Prepare to wait on uaddr. */ +	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); +	if (ret) +		goto out; + +	/* queue_me and wait for wakeup, timeout, or a signal. */ +	futex_wait_queue_me(hb, &q, to);  	/* If we were woken (and unqueued), we succeeded, whatever. 
*/  	ret = 0;  	if (!unqueue_me(&q))  		goto out_put_key;  	ret = -ETIMEDOUT; -	if (rem) +	if (to && !to->task)  		goto out_put_key;  	/* @@ -1268,7 +1757,7 @@ retry_private:  	restart->futex.val = val;  	restart->futex.time = abs_time->tv64;  	restart->futex.bitset = bitset; -	restart->futex.flags = 0; +	restart->futex.flags = FLAGS_HAS_TIMEOUT;  	if (fshared)  		restart->futex.flags |= FLAGS_SHARED; @@ -1280,6 +1769,10 @@ retry_private:  out_put_key:  	put_futex_key(fshared, &q.key);  out: +	if (to) { +		hrtimer_cancel(&to->timer); +		destroy_hrtimer_on_stack(&to->timer); +	}  	return ret;  } @@ -1288,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart)  {  	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;  	int fshared = 0; -	ktime_t t; +	ktime_t t, *tp = NULL; -	t.tv64 = restart->futex.time; +	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { +		t.tv64 = restart->futex.time; +		tp = &t; +	}  	restart->fn = do_no_restart_syscall;  	if (restart->futex.flags & FLAGS_SHARED)  		fshared = 1; -	return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, +	return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,  				restart->futex.bitset,  				restart->futex.flags & FLAGS_CLOCKRT);  } @@ -1310,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,  			 int detect, ktime_t *time, int trylock)  {  	struct hrtimer_sleeper timeout, *to = NULL; -	struct task_struct *curr = current;  	struct futex_hash_bucket *hb; -	u32 uval, newval, curval; +	u32 uval;  	struct futex_q q; -	int ret, lock_taken, ownerdied = 0; +	int res, ret;  	if (refill_pi_state_cache())  		return -ENOMEM; @@ -1328,90 +1823,25 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,  	}  	q.pi_state = NULL; +	q.rt_waiter = NULL;  retry:  	q.key = FUTEX_KEY_INIT; -	ret = get_futex_key(uaddr, fshared, &q.key); +	ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);  	if (unlikely(ret != 0))  		goto out;  retry_private:  	hb = queue_lock(&q); -retry_locked: -	ret = lock_taken = 0; - -	/* -	 * To avoid races, we attempt to take the lock here again -	 * (by doing a 0 -> TID atomic cmpxchg), while holding all -	 * the locks. It will most likely not succeed. -	 */ -	newval = task_pid_vnr(current); - -	curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - -	if (unlikely(curval == -EFAULT)) -		goto uaddr_faulted; - -	/* -	 * Detect deadlocks. In case of REQUEUE_PI this is a valid -	 * situation and we return success to user space. -	 */ -	if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { -		ret = -EDEADLK; -		goto out_unlock_put_key; -	} - -	/* -	 * Surprise - we got the lock. Just return to userspace: -	 */ -	if (unlikely(!curval)) -		goto out_unlock_put_key; - -	uval = curval; - -	/* -	 * Set the WAITERS flag, so the owner will know it has someone -	 * to wake at next unlock -	 */ -	newval = curval | FUTEX_WAITERS; - -	/* -	 * There are two cases, where a futex might have no owner (the -	 * owner TID is 0): OWNER_DIED. We take over the futex in this -	 * case. We also do an unconditional take over, when the owner -	 * of the futex died. -	 * -	 * This is safe as we are protected by the hash bucket lock ! 
-	 */ -	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { -		/* Keep the OWNER_DIED bit */ -		newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); -		ownerdied = 0; -		lock_taken = 1; -	} - -	curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - -	if (unlikely(curval == -EFAULT)) -		goto uaddr_faulted; -	if (unlikely(curval != uval)) -		goto retry_locked; - -	/* -	 * We took the lock due to owner died take over. -	 */ -	if (unlikely(lock_taken)) -		goto out_unlock_put_key; - -	/* -	 * We dont have the lock. Look up the PI state (or create it if -	 * we are the first waiter): -	 */ -	ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); - +	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);  	if (unlikely(ret)) {  		switch (ret) { - +		case 1: +			/* We got the lock. */ +			ret = 0; +			goto out_unlock_put_key; +		case -EFAULT: +			goto uaddr_faulted;  		case -EAGAIN:  			/*  			 * Task is exiting and we just wait for the @@ -1421,25 +1851,6 @@ retry_locked:  			put_futex_key(fshared, &q.key);  			cond_resched();  			goto retry; - -		case -ESRCH: -			/* -			 * No owner found for this futex. Check if the -			 * OWNER_DIED bit is set to figure out whether -			 * this is a robust futex or not. -			 */ -			if (get_futex_value_locked(&curval, uaddr)) -				goto uaddr_faulted; - -			/* -			 * We simply start over in case of a robust -			 * futex. The code above will take the futex -			 * and return happy. -			 */ -			if (curval & FUTEX_OWNER_DIED) { -				ownerdied = 1; -				goto retry_locked; -			}  		default:  			goto out_unlock_put_key;  		} @@ -1463,71 +1874,21 @@ retry_locked:  	}  	spin_lock(q.lock_ptr); - -	if (!ret) { -		/* -		 * Got the lock. We might not be the anticipated owner -		 * if we did a lock-steal - fix up the PI-state in -		 * that case: -		 */ -		if (q.pi_state->owner != curr) -			ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); -	} else { -		/* -		 * Catch the rare case, where the lock was released -		 * when we were on the way back before we locked the -		 * hash bucket. -		 */ -		if (q.pi_state->owner == curr) { -			/* -			 * Try to get the rt_mutex now. This might -			 * fail as some other task acquired the -			 * rt_mutex after we removed ourself from the -			 * rt_mutex waiters list. -			 */ -			if (rt_mutex_trylock(&q.pi_state->pi_mutex)) -				ret = 0; -			else { -				/* -				 * pi_state is incorrect, some other -				 * task did a lock steal and we -				 * returned due to timeout or signal -				 * without taking the rt_mutex. Too -				 * late. We can access the -				 * rt_mutex_owner without locking, as -				 * the other task is now blocked on -				 * the hash bucket lock. Fix the state -				 * up. -				 */ -				struct task_struct *owner; -				int res; - -				owner = rt_mutex_owner(&q.pi_state->pi_mutex); -				res = fixup_pi_state_owner(uaddr, &q, owner, -							   fshared); - -				/* propagate -EFAULT, if the fixup failed */ -				if (res) -					ret = res; -			} -		} else { -			/* -			 * Paranoia check. If we did not take the lock -			 * in the trylock above, then we should not be -			 * the owner of the rtmutex, neither the real -			 * nor the pending one: -			 */ -			if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) -				printk(KERN_ERR "futex_lock_pi: ret = %d " -				       "pi-mutex: %p pi-state %p\n", ret, -				       q.pi_state->pi_mutex.owner, -				       q.pi_state->owner); -		} -	} +	/* +	 * Fixup the pi_state owner and possibly acquire the lock if we +	 * haven't already. 
+	 */ +	res = fixup_owner(uaddr, fshared, &q, !ret); +	/* +	 * If fixup_owner() returned an error, proprogate that.  If it acquired +	 * the lock, clear our -ETIMEDOUT or -EINTR. +	 */ +	if (res) +		ret = (res < 0) ? res : 0;  	/* -	 * If fixup_pi_state_owner() faulted and was unable to handle the -	 * fault, unlock it and return the fault to userspace. +	 * If fixup_owner() faulted and was unable to handle the fault, unlock +	 * it and return the fault to userspace.  	 */  	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))  		rt_mutex_unlock(&q.pi_state->pi_mutex); @@ -1535,9 +1896,7 @@ retry_locked:  	/* Unqueue and drop the lock */  	unqueue_me_pi(&q); -	if (to) -		destroy_hrtimer_on_stack(&to->timer); -	return ret != -EINTR ? ret : -ERESTARTNOINTR; +	goto out;  out_unlock_put_key:  	queue_unlock(&q, hb); @@ -1547,7 +1906,7 @@ out_put_key:  out:  	if (to)  		destroy_hrtimer_on_stack(&to->timer); -	return ret; +	return ret != -EINTR ? ret : -ERESTARTNOINTR;  uaddr_faulted:  	/* @@ -1570,7 +1929,6 @@ uaddr_faulted:  	goto retry;  } -  /*   * Userspace attempted a TID -> 0 atomic transition, and failed.   * This is the in-kernel slowpath: we look up the PI state (if any), @@ -1594,7 +1952,7 @@ retry:  	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))  		return -EPERM; -	ret = get_futex_key(uaddr, fshared, &key); +	ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);  	if (unlikely(ret != 0))  		goto out; @@ -1672,6 +2030,229 @@ pi_faulted:  	return ret;  } +/** + * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * @hb:		the hash_bucket futex_q was original enqueued on + * @q:		the futex_q woken while waiting to be requeued + * @key2:	the futex_key of the requeue target futex + * @timeout:	the timeout associated with the wait (NULL if none) + * + * Detect if the task was woken on the initial futex as opposed to the requeue + * target futex.  If so, determine if it was a timeout or a signal that caused + * the wakeup and return the appropriate error code to the caller.  Must be + * called with the hb lock held. + * + * Returns + *  0 - no early wakeup detected + * <0 - -ETIMEDOUT or -ERESTARTNOINTR + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, +				   struct futex_q *q, union futex_key *key2, +				   struct hrtimer_sleeper *timeout) +{ +	int ret = 0; + +	/* +	 * With the hb lock held, we avoid races while we process the wakeup. +	 * We only need to hold hb (and not hb2) to ensure atomicity as the +	 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. +	 * It can't be requeued from uaddr2 to something else since we don't +	 * support a PI aware source futex for requeue. +	 */ +	if (!match_futex(&q->key, key2)) { +		WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); +		/* +		 * We were woken prior to requeue by a timeout or a signal. +		 * Unqueue the futex_q and determine which it was. +		 */ +		plist_del(&q->list, &q->list.plist); +		drop_futex_key_refs(&q->key); + +		if (timeout && !timeout->task) +			ret = -ETIMEDOUT; +		else +			ret = -ERESTARTNOINTR; +	} +	return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr:	the futex we initialyl wait on (non-pi) + * @fshared:	whether the futexes are shared (1) or not (0).  They must be + * 		the same type, no requeueing from private to shared, etc. + * @val:	the expected value of uaddr + * @abs_time:	absolute timeout + * @bitset:	32 bit wakeup bitset set by userspace, defaults to all. 
+ * @clockrt:	whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) + * @uaddr2:	the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware.  Normal wakeup will wake on uaddr2 and + * complete the acquisition of the rt_mutex prior to returning to userspace. + * This ensures the rt_mutex maintains an owner when it has waiters; without + * one, the pi logic wouldn't know which task to boost/deboost, if there was a + * need to. + * + * We call schedule in futex_wait_queue_me() when we enqueue and return there + * via the following: + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue and subsequent unlock + * 3) signal (before or after requeue) + * 4) timeout (before or after requeue) + * + * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, we setup a restart_block with futex_lock_pi() as the function. + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Returns: + *  0 - On success + * <0 - On error + */ +static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, +				 u32 val, ktime_t *abs_time, u32 bitset, +				 int clockrt, u32 __user *uaddr2) +{ +	struct hrtimer_sleeper timeout, *to = NULL; +	struct rt_mutex_waiter rt_waiter; +	struct rt_mutex *pi_mutex = NULL; +	struct futex_hash_bucket *hb; +	union futex_key key2; +	struct futex_q q; +	int res, ret; + +	if (!bitset) +		return -EINVAL; + +	if (abs_time) { +		to = &timeout; +		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : +				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS); +		hrtimer_init_sleeper(to, current); +		hrtimer_set_expires_range_ns(&to->timer, *abs_time, +					     current->timer_slack_ns); +	} + +	/* +	 * The waiter is allocated on our stack, manipulated by the requeue +	 * code while we sleep on uaddr. +	 */ +	debug_rt_mutex_init_waiter(&rt_waiter); +	rt_waiter.task = NULL; + +	q.pi_state = NULL; +	q.bitset = bitset; +	q.rt_waiter = &rt_waiter; + +	key2 = FUTEX_KEY_INIT; +	ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); +	if (unlikely(ret != 0)) +		goto out; + +	/* Prepare to wait on uaddr. */ +	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); +	if (ret) +		goto out_key2; + +	/* Queue the futex_q, drop the hb lock, wait for wakeup. */ +	futex_wait_queue_me(hb, &q, to); + +	spin_lock(&hb->lock); +	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); +	spin_unlock(&hb->lock); +	if (ret) +		goto out_put_keys; + +	/* +	 * In order for us to be here, we know our q.key == key2, and since +	 * we took the hb->lock above, we also know that futex_requeue() has +	 * completed and we no longer have to concern ourselves with a wakeup +	 * race with the atomic proxy lock acquition by the requeue code. +	 */ + +	/* Check if the requeue code acquired the second futex for us. */ +	if (!q.rt_waiter) { +		/* +		 * Got the lock. We might not be the anticipated owner if we +		 * did a lock-steal - fix up the PI-state in that case. 
+		 */ +		if (q.pi_state && (q.pi_state->owner != current)) { +			spin_lock(q.lock_ptr); +			ret = fixup_pi_state_owner(uaddr2, &q, current, +						   fshared); +			spin_unlock(q.lock_ptr); +		} +	} else { +		/* +		 * We have been woken up by futex_unlock_pi(), a timeout, or a +		 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor +		 * the pi_state. +		 */ +		WARN_ON(!&q.pi_state); +		pi_mutex = &q.pi_state->pi_mutex; +		ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); +		debug_rt_mutex_free_waiter(&rt_waiter); + +		spin_lock(q.lock_ptr); +		/* +		 * Fixup the pi_state owner and possibly acquire the lock if we +		 * haven't already. +		 */ +		res = fixup_owner(uaddr2, fshared, &q, !ret); +		/* +		 * If fixup_owner() returned an error, proprogate that.  If it +		 * acquired the lock, clear our -ETIMEDOUT or -EINTR. +		 */ +		if (res) +			ret = (res < 0) ? res : 0; + +		/* Unqueue and drop the lock. */ +		unqueue_me_pi(&q); +	} + +	/* +	 * If fixup_pi_state_owner() faulted and was unable to handle the +	 * fault, unlock the rt_mutex and return the fault to userspace. +	 */ +	if (ret == -EFAULT) { +		if (rt_mutex_owner(pi_mutex) == current) +			rt_mutex_unlock(pi_mutex); +	} else if (ret == -EINTR) { +		/* +		 * We've already been requeued, but we have no way to +		 * restart by calling futex_lock_pi() directly. We +		 * could restart the syscall, but that will look at +		 * the user space value and return right away. So we +		 * drop back with EWOULDBLOCK to tell user space that +		 * "val" has been changed. That's the same what the +		 * restart of the syscall would do in +		 * futex_wait_setup(). +		 */ +		ret = -EWOULDBLOCK; +	} + +out_put_keys: +	put_futex_key(fshared, &q.key); +out_key2: +	put_futex_key(fshared, &key2); + +out: +	if (to) { +		hrtimer_cancel(&to->timer); +		destroy_hrtimer_on_stack(&to->timer); +	} +	return ret; +} +  /*   * Support for robust futexes: the kernel cleans up held futexes at   * thread exit time. 
@@ -1894,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,  		fshared = 1;  	clockrt = op & FUTEX_CLOCK_REALTIME; -	if (clockrt && cmd != FUTEX_WAIT_BITSET) +	if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)  		return -ENOSYS;  	switch (cmd) { @@ -1909,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,  		ret = futex_wake(uaddr, fshared, val, val3);  		break;  	case FUTEX_REQUEUE: -		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); +		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);  		break;  	case FUTEX_CMP_REQUEUE: -		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); +		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, +				    0);  		break;  	case FUTEX_WAKE_OP:  		ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); @@ -1929,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,  		if (futex_cmpxchg_enabled)  			ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);  		break; +	case FUTEX_WAIT_REQUEUE_PI: +		val3 = FUTEX_BITSET_MATCH_ANY; +		ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, +					    clockrt, uaddr2); +		break; +	case FUTEX_CMP_REQUEUE_PI: +		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, +				    1); +		break;  	default:  		ret = -ENOSYS;  	} @@ -1946,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,  	int cmd = op & FUTEX_CMD_MASK;  	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || -		      cmd == FUTEX_WAIT_BITSET)) { +		      cmd == FUTEX_WAIT_BITSET || +		      cmd == FUTEX_WAIT_REQUEUE_PI)) {  		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)  			return -EFAULT;  		if (!timespec_valid(&ts)) @@ -1958,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,  		tp = &t;  	}  	/* -	 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. +	 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.  	 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.  	 
*/  	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || -	    cmd == FUTEX_WAKE_OP) +	    cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)  		val2 = (u32) (unsigned long) utime;  	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 3394f8f5296..7d047808419 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o  obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o  obj-$(CONFIG_PROC_FS) += proc.o  obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o +obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o  obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c687ba4363f..13c68e71b72 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)  	spin_lock(&desc->lock);  	mask_ack_irq(desc, irq); -	desc = irq_remap_to_desc(irq, desc);  	if (unlikely(desc->status & IRQ_INPROGRESS))  		goto out_unlock; @@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)  	desc->status &= ~IRQ_INPROGRESS;  out:  	desc->chip->eoi(irq); -	desc = irq_remap_to_desc(irq, desc);  	spin_unlock(&desc->lock);  } @@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)  		    !desc->action)) {  		desc->status |= (IRQ_PENDING | IRQ_MASKED);  		mask_ack_irq(desc, irq); -		desc = irq_remap_to_desc(irq, desc);  		goto out_unlock;  	}  	kstat_incr_irqs_this_cpu(irq, desc); @@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)  	/* Start handling the irq */  	if (desc->chip->ack)  		desc->chip->ack(irq); -	desc = irq_remap_to_desc(irq, desc);  	/* Mark the IRQ currently in progress.*/  	desc->status |= IRQ_INPROGRESS; @@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)  	if (!noirqdebug)  		note_interrupt(irq, desc, action_ret); -	if (desc->chip->eoi) { +	if (desc->chip->eoi)  		desc->chip->eoi(irq); -		desc = irq_remap_to_desc(irq, desc); -	}  }  void @@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,  	/* Uninstall? 
*/  	if (handle == handle_bad_irq) { -		if (desc->chip != &no_irq_chip) { +		if (desc->chip != &no_irq_chip)  			mask_ack_irq(desc, irq); -			desc = irq_remap_to_desc(irq, desc); -		}  		desc->status |= IRQ_DISABLED;  		desc->depth = 1;  	} diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 26e08754744..065205bdd92 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,14 +11,15 @@   */  #include <linux/irq.h> +#include <linux/slab.h>  #include <linux/module.h>  #include <linux/random.h>  #include <linux/interrupt.h>  #include <linux/kernel_stat.h>  #include <linux/rculist.h>  #include <linux/hash.h> -#include <trace/irq.h>  #include <linux/bootmem.h> +#include <trace/events/irq.h>  #include "internals.h" @@ -44,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)  #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)  static void __init init_irq_default_affinity(void)  { -	alloc_bootmem_cpumask_var(&irq_default_affinity); +	alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);  	cpumask_setall(irq_default_affinity);  }  #else @@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = {  	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),  }; -void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) +void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)  { -	int node;  	void *ptr; -	node = cpu_to_node(cpu); -	ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); +	if (slab_is_available()) +		ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), +				   GFP_ATOMIC, node); +	else +		ptr = alloc_bootmem_node(NODE_DATA(node), +				nr * sizeof(*desc->kstat_irqs));  	/*  	 * don't overwite if can not get new one  	 * init_copy_kstat_irqs() could still use old one  	 */  	if (ptr) { -		printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", -			 cpu, node); +		printk(KERN_DEBUG "  alloc kstat_irqs on node %d\n", node);  		desc->kstat_irqs = ptr;  	}  } -static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) +static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)  {  	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));  	spin_lock_init(&desc->lock);  	desc->irq = irq;  #ifdef CONFIG_SMP -	desc->cpu = cpu; +	desc->node = node;  #endif  	lockdep_set_class(&desc->lock, &irq_desc_lock_class); -	init_kstat_irqs(desc, cpu, nr_cpu_ids); +	init_kstat_irqs(desc, node, nr_cpu_ids);  	if (!desc->kstat_irqs) {  		printk(KERN_ERR "can not alloc kstat_irqs\n");  		BUG_ON(1);  	} -	if (!init_alloc_desc_masks(desc, cpu, false)) { +	if (!alloc_desc_masks(desc, node, false)) {  		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");  		BUG_ON(1);  	} -	arch_init_chip_data(desc, cpu); +	init_desc_masks(desc); +	arch_init_chip_data(desc, node);  }  /* @@ -146,6 +150,7 @@ int __init early_irq_init(void)  {  	struct irq_desc *desc;  	int legacy_count; +	int node;  	int i;  	init_irq_default_affinity(); @@ -156,20 +161,21 @@ int __init early_irq_init(void)  	desc = irq_desc_legacy;  	legacy_count = ARRAY_SIZE(irq_desc_legacy); + 	node = first_online_node;  	/* allocate irq_desc_ptrs array based on nr_irqs */ -	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); +	irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);  	/* allocate based on nr_cpu_ids */ -	/* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ -	kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * -					  sizeof(int)); +	kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * +					  
sizeof(int), GFP_NOWAIT, node);  	for (i = 0; i < legacy_count; i++) {  		desc[i].irq = i;  		desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;  		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); -		init_alloc_desc_masks(&desc[i], 0, true); +		alloc_desc_masks(&desc[i], node, true); +		init_desc_masks(&desc[i]);  		irq_desc_ptrs[i] = desc + i;  	} @@ -187,11 +193,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)  	return NULL;  } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)  {  	struct irq_desc *desc;  	unsigned long flags; -	int node;  	if (irq >= nr_irqs) {  		WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", @@ -210,15 +215,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)  	if (desc)  		goto out_unlock; -	node = cpu_to_node(cpu); -	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); -	printk(KERN_DEBUG "  alloc irq_desc for %d on cpu %d node %d\n", -		 irq, cpu, node); +	if (slab_is_available()) +		desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); +	else +		desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); + +	printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);  	if (!desc) {  		printk(KERN_ERR "can not alloc irq_desc\n");  		BUG_ON(1);  	} -	init_one_irq_desc(irq, desc, cpu); +	init_one_irq_desc(irq, desc, node);  	irq_desc_ptrs[irq] = desc; @@ -256,7 +263,8 @@ int __init early_irq_init(void)  	for (i = 0; i < count; i++) {  		desc[i].irq = i; -		init_alloc_desc_masks(&desc[i], 0, true); +		alloc_desc_masks(&desc[i], 0, true); +		init_desc_masks(&desc[i]);  		desc[i].kstat_irqs = kstat_irqs_all[i];  	}  	return arch_early_irq_init(); @@ -267,7 +275,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)  	return (irq < NR_IRQS) ? 
irq_desc + irq : NULL;  } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)  {  	return irq_to_desc(irq);  } @@ -348,9 +356,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)  	       "but no thread function available.", irq, action->name);  } -DEFINE_TRACE(irq_handler_entry); -DEFINE_TRACE(irq_handler_exit); -  /**   * handle_IRQ_event - irq action chain handler   * @irq:	the interrupt number @@ -453,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq)  		/*  		 * No locking required for CPU-local interrupts:  		 */ -		if (desc->chip->ack) { +		if (desc->chip->ack)  			desc->chip->ack(irq); -			/* get new one */ -			desc = irq_remap_to_desc(irq, desc); -		}  		if (likely(!(desc->status & IRQ_DISABLED))) {  			action_ret = handle_IRQ_event(irq, desc->action);  			if (!noirqdebug) @@ -468,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq)  	}  	spin_lock(&desc->lock); -	if (desc->chip->ack) { +	if (desc->chip->ack)  		desc->chip->ack(irq); -		desc = irq_remap_to_desc(irq, desc); -	}  	/*  	 * REPLAY is when Linux resends an IRQ that was dropped earlier  	 * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 01ce20eab38..73468253143 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);  extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);  extern struct lock_class_key irq_desc_lock_class; -extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);  extern void clear_kstat_irqs(struct irq_desc *desc);  extern spinlock_t sparse_irq_lock; @@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq,  extern int irq_select_affinity_usr(unsigned int irq); +extern void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); +  /*   * Debugging printout:   */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2734eca5924..aaf5c9d0577 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq)  	return 1;  } -static void +void  irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)  {  	struct irqaction *action = desc->action; @@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)  	spin_lock_irqsave(&desc->lock, flags);  #ifdef CONFIG_GENERIC_PENDING_IRQ -	if (desc->status & IRQ_MOVE_PCNTXT) -		desc->chip->set_affinity(irq, cpumask); +	if (desc->status & IRQ_MOVE_PCNTXT) { +		if (!desc->chip->set_affinity(irq, cpumask)) { +			cpumask_copy(desc->affinity, cpumask); +			irq_set_thread_affinity(desc, cpumask); +		} +	}  	else {  		desc->status |= IRQ_MOVE_PENDING;  		cpumask_copy(desc->pending_mask, cpumask);  	}  #else -	cpumask_copy(desc->affinity, cpumask); -	desc->chip->set_affinity(irq, cpumask); +	if (!desc->chip->set_affinity(irq, cpumask)) { +		cpumask_copy(desc->affinity, cpumask); +		irq_set_thread_affinity(desc, cpumask); +	}  #endif -	irq_set_thread_affinity(desc, cpumask);  	desc->status |= IRQ_AFFINITY_SET;  	spin_unlock_irqrestore(&desc->lock, flags);  	return 0; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index e05ad9be43b..cfe767ca154 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ 
-1,5 +1,8 @@  #include <linux/irq.h> +#include <linux/interrupt.h> + +#include "internals.h"  void move_masked_irq(int irq)  { @@ -39,11 +42,12 @@ void move_masked_irq(int irq)  	 * masking the irqs.  	 */  	if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) -		   < nr_cpu_ids)) { -		cpumask_and(desc->affinity, -			    desc->pending_mask, cpu_online_mask); -		desc->chip->set_affinity(irq, desc->affinity); -	} +		   < nr_cpu_ids)) +		if (!desc->chip->set_affinity(irq, desc->pending_mask)) { +			cpumask_copy(desc->affinity, desc->pending_mask); +			irq_set_thread_affinity(desc, desc->pending_mask); +		} +  	cpumask_clear(desc->pending_mask);  } diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 44bbdcbaf8d..2f69bee57bf 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -15,9 +15,9 @@  static void init_copy_kstat_irqs(struct irq_desc *old_desc,  				 struct irq_desc *desc, -				 int cpu, int nr) +				 int node, int nr)  { -	init_kstat_irqs(desc, cpu, nr); +	init_kstat_irqs(desc, node, nr);  	if (desc->kstat_irqs != old_desc->kstat_irqs)  		memcpy(desc->kstat_irqs, old_desc->kstat_irqs, @@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)  }  static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, -		 struct irq_desc *desc, int cpu) +		 struct irq_desc *desc, int node)  {  	memcpy(desc, old_desc, sizeof(struct irq_desc)); -	if (!init_alloc_desc_masks(desc, cpu, false)) { +	if (!alloc_desc_masks(desc, node, false)) {  		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "  				"for migration.\n", irq);  		return false;  	}  	spin_lock_init(&desc->lock); -	desc->cpu = cpu; +	desc->node = node;  	lockdep_set_class(&desc->lock, &irq_desc_lock_class); -	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); +	init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);  	init_copy_desc_masks(old_desc, desc); -	arch_init_copy_chip_data(old_desc, desc, cpu); +	arch_init_copy_chip_data(old_desc, desc, node);  	return true;  } @@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)  }  static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, -						int cpu) +						int node)  {  	struct irq_desc *desc;  	unsigned int irq;  	unsigned long flags; -	int node;  	irq = old_desc->irq; @@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,  	if (desc && old_desc != desc)  		goto out_unlock; -	node = cpu_to_node(cpu);  	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);  	if (!desc) {  		printk(KERN_ERR "irq %d: can not get new irq_desc " @@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,  		desc = old_desc;  		goto out_unlock;  	} -	if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { +	if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {  		/* still use old one */  		kfree(desc);  		desc = old_desc; @@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,  	/* free the old one */  	free_one_irq_desc(old_desc, desc); -	spin_unlock(&old_desc->lock);  	kfree(old_desc); -	spin_lock(&desc->lock);  	return desc; @@ -109,24 +105,14 @@ out_unlock:  	return desc;  } -struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) +struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)  { -	int old_cpu; -	int node, old_node; -  	/* those all static, do move them */  	if (desc->irq < NR_IRQS_LEGACY)  		return desc; -	
old_cpu = desc->cpu; -	if (old_cpu != cpu) { -		node = cpu_to_node(cpu); -		old_node = cpu_to_node(old_cpu); -		if (old_node != node) -			desc = __real_move_irq_desc(desc, cpu); -		else -			desc->cpu = cpu; -	} +	if (desc->node != node) +		desc = __real_move_irq_desc(desc, node);  	return desc;  } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 374faf9bfdc..3a29dbe7898 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -30,12 +30,16 @@  #define all_var 0  #endif -/* These will be re-linked against their real values during the second link stage */ +/* + * These will be re-linked against their real values + * during the second link stage. + */  extern const unsigned long kallsyms_addresses[] __attribute__((weak));  extern const u8 kallsyms_names[] __attribute__((weak)); -/* tell the compiler that the count isn't in the small data section if the arch - * has one (eg: FRV) +/* + * Tell the compiler that the count isn't in the small data section if the arch + * has one (eg: FRV).   */  extern const unsigned long kallsyms_num_syms  __attribute__((weak, section(".rodata"))); @@ -75,31 +79,37 @@ static int is_ksym_addr(unsigned long addr)  	return is_kernel_text(addr) || is_kernel_inittext(addr);  } -/* expand a compressed symbol data into the resulting uncompressed string, -   given the offset to where the symbol is in the compressed stream */ +/* + * Expand a compressed symbol data into the resulting uncompressed string, + * given the offset to where the symbol is in the compressed stream. + */  static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)  {  	int len, skipped_first = 0;  	const u8 *tptr, *data; -	/* get the compressed symbol length from the first symbol byte */ +	/* Get the compressed symbol length from the first symbol byte. */  	data = &kallsyms_names[off];  	len = *data;  	data++; -	/* update the offset to return the offset for the next symbol on -	 * the compressed stream */ +	/* +	 * Update the offset to return the offset for the next symbol on +	 * the compressed stream. +	 */  	off += len + 1; -	/* for every byte on the compressed symbol data, copy the table -	   entry for that byte */ -	while(len) { -		tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ]; +	/* +	 * For every byte on the compressed symbol data, copy the table +	 * entry for that byte. +	 */ +	while (len) { +		tptr = &kallsyms_token_table[kallsyms_token_index[*data]];  		data++;  		len--;  		while (*tptr) { -			if(skipped_first) { +			if (skipped_first) {  				*result = *tptr;  				result++;  			} else @@ -110,36 +120,46 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)  	*result = '\0'; -	/* return to offset to the next symbol */ +	/* Return to offset to the next symbol. */  	return off;  } -/* get symbol type information. This is encoded as a single char at the - * begining of the symbol name */ +/* + * Get symbol type information. This is encoded as a single char at the + * beginning of the symbol name. + */  static char kallsyms_get_symbol_type(unsigned int off)  { -	/* get just the first code, look it up in the token table, and return the -	 * first char from this token */ -	return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ]; +	/* +	 * Get just the first code, look it up in the token table, +	 * and return the first char from this token. 
+	 */ +	return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];  } -/* find the offset on the compressed stream given and index in the - * kallsyms array */ +/* + * Find the offset on the compressed stream given and index in the + * kallsyms array. + */  static unsigned int get_symbol_offset(unsigned long pos)  {  	const u8 *name;  	int i; -	/* use the closest marker we have. We have markers every 256 positions, -	 * so that should be close enough */ -	name = &kallsyms_names[ kallsyms_markers[pos>>8] ]; +	/* +	 * Use the closest marker we have. We have markers every 256 positions, +	 * so that should be close enough. +	 */ +	name = &kallsyms_names[kallsyms_markers[pos >> 8]]; -	/* sequentially scan all the symbols up to the point we're searching for. -	 * Every symbol is stored in a [<len>][<len> bytes of data] format, so we -	 * just need to add the len to the current pointer for every symbol we -	 * wish to skip */ -	for(i = 0; i < (pos&0xFF); i++) +	/* +	 * Sequentially scan all the symbols up to the point we're searching +	 * for. Every symbol is stored in a [<len>][<len> bytes of data] format, +	 * so we just need to add the len to the current pointer for every +	 * symbol we wish to skip. +	 */ +	for (i = 0; i < (pos & 0xFF); i++)  		name = name + (*name) + 1;  	return name - kallsyms_names; @@ -190,7 +210,7 @@ static unsigned long get_symbol_pos(unsigned long addr,  	/* This kernel should never had been booted. */  	BUG_ON(!kallsyms_addresses); -	/* do a binary search on the sorted kallsyms_addresses array */ +	/* Do a binary search on the sorted kallsyms_addresses array. */  	low = 0;  	high = kallsyms_num_syms; @@ -203,15 +223,15 @@ static unsigned long get_symbol_pos(unsigned long addr,  	}  	/* -	 * search for the first aliased symbol. Aliased -	 * symbols are symbols with the same address +	 * Search for the first aliased symbol. Aliased +	 * symbols are symbols with the same address.  	 */  	while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])  		--low;  	symbol_start = kallsyms_addresses[low]; -	/* Search for next non-aliased symbol */ +	/* Search for next non-aliased symbol. */  	for (i = low + 1; i < kallsyms_num_syms; i++) {  		if (kallsyms_addresses[i] > symbol_start) {  			symbol_end = kallsyms_addresses[i]; @@ -219,7 +239,7 @@ static unsigned long get_symbol_pos(unsigned long addr,  		}  	} -	/* if we found no next symbol, we use the end of the section */ +	/* If we found no next symbol, we use the end of the section. */  	if (!symbol_end) {  		if (is_kernel_inittext(addr))  			symbol_end = (unsigned long)_einittext; @@ -252,10 +272,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,  /*   * Lookup an address - * - modname is set to NULL if it's in the kernel - * - we guarantee that the returned name is valid until we reschedule even if - *   it resides in a module - * - we also guarantee that modname will be valid until rescheduled + * - modname is set to NULL if it's in the kernel. + * - We guarantee that the returned name is valid until we reschedule even if. + *   It resides in a module. + * - We also guarantee that modname will be valid until rescheduled.   */  const char *kallsyms_lookup(unsigned long addr,  			    unsigned long *symbolsize, @@ -276,7 +296,7 @@ const char *kallsyms_lookup(unsigned long addr,  		return namebuf;  	} -	/* see if it's in a module */ +	/* See if it's in a module. 
*/  	return module_address_lookup(addr, symbolsize, offset, modname,  				     namebuf);  } @@ -294,7 +314,7 @@ int lookup_symbol_name(unsigned long addr, char *symname)  		kallsyms_expand_symbol(get_symbol_offset(pos), symname);  		return 0;  	} -	/* see if it's in a module */ +	/* See if it's in a module. */  	return lookup_module_symbol_name(addr, symname);  } @@ -313,7 +333,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,  		modname[0] = '\0';  		return 0;  	} -	/* see if it's in a module */ +	/* See if it's in a module. */  	return lookup_module_symbol_attrs(addr, size, offset, modname, name);  } @@ -342,6 +362,7 @@ int sprint_symbol(char *buffer, unsigned long address)  	return len;  } +EXPORT_SYMBOL_GPL(sprint_symbol);  /* Look up a kernel symbol and print it to the kernel messages. */  void __print_symbol(const char *fmt, unsigned long address) @@ -352,13 +373,13 @@ void __print_symbol(const char *fmt, unsigned long address)  	printk(fmt, buffer);  } +EXPORT_SYMBOL(__print_symbol);  /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ -struct kallsym_iter -{ +struct kallsym_iter {  	loff_t pos;  	unsigned long value; -	unsigned int nameoff; /* If iterating in core kernel symbols */ +	unsigned int nameoff; /* If iterating in core kernel symbols. */  	char type;  	char name[KSYM_NAME_LEN];  	char module_name[MODULE_NAME_LEN]; @@ -404,7 +425,7 @@ static int update_iter(struct kallsym_iter *iter, loff_t pos)  		iter->pos = pos;  		return get_ksymbol_mod(iter);  	} -	 +  	/* If we're not on the desired position, reset to new position. */  	if (pos != iter->pos)  		reset_iter(iter, pos); @@ -439,23 +460,25 @@ static int s_show(struct seq_file *m, void *p)  {  	struct kallsym_iter *iter = m->private; -	/* Some debugging symbols have no name.  Ignore them. */  +	/* Some debugging symbols have no name.  Ignore them. */  	if (!iter->name[0])  		return 0;  	if (iter->module_name[0]) {  		char type; -		/* Label it "global" if it is exported, -		 * "local" if not exported. */ +		/* +		 * Label it "global" if it is exported, +		 * "local" if not exported. +		 */  		type = iter->exported ? toupper(iter->type) :  					tolower(iter->type);  		seq_printf(m, "%0*lx %c %s\t[%s]\n", -			   (int)(2*sizeof(void*)), +			   (int)(2 * sizeof(void *)),  			   iter->value, type, iter->name, iter->module_name);  	} else  		seq_printf(m, "%0*lx %c %s\n", -			   (int)(2*sizeof(void*)), +			   (int)(2 * sizeof(void *)),  			   iter->value, iter->type, iter->name);  	return 0;  } @@ -469,9 +492,11 @@ static const struct seq_operations kallsyms_op = {  static int kallsyms_open(struct inode *inode, struct file *file)  { -	/* We keep iterator in m->private, since normal case is to +	/* +	 * We keep iterator in m->private, since normal case is to  	 * s_start from where we left off, so we avoid doing -	 * using get_symbol_offset for every symbol */ +	 * using get_symbol_offset for every symbol. 
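Besides the comment-style cleanups, the kallsyms hunks move EXPORT_SYMBOL_GPL(sprint_symbol) and EXPORT_SYMBOL(__print_symbol) from the bottom of the file to sit next to their definitions. A minimal sketch of module code using the exported helper; the function, buffer and message below are illustrative only:

#include <linux/kallsyms.h>
#include <linux/kernel.h>

static void report_caller(void)
{
	char buf[KSYM_SYMBOL_LEN];

	/* Resolve a text address into "symbol+offset/size [module]" form. */
	sprint_symbol(buf, (unsigned long)__builtin_return_address(0));
	printk(KERN_INFO "called from %s\n", buf);
}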
+	 */  	struct kallsym_iter *iter;  	int ret; @@ -500,7 +525,4 @@ static int __init kallsyms_init(void)  	proc_create("kallsyms", 0444, NULL, &kallsyms_operations);  	return 0;  } -__initcall(kallsyms_init); - -EXPORT_SYMBOL(__print_symbol); -EXPORT_SYMBOL_GPL(sprint_symbol); +device_initcall(kallsyms_init); diff --git a/kernel/kexec.c b/kernel/kexec.c index 5a758c6e495..ae1c35201cc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1448,18 +1448,17 @@ int kernel_kexec(void)  			goto Restore_console;  		}  		suspend_console(); -		error = device_suspend(PMSG_FREEZE); +		error = dpm_suspend_start(PMSG_FREEZE);  		if (error)  			goto Resume_console; -		device_pm_lock(); -		/* At this point, device_suspend() has been called, -		 * but *not* device_power_down(). We *must* -		 * device_power_down() now.  Otherwise, drivers for +		/* At this point, dpm_suspend_start() has been called, +		 * but *not* dpm_suspend_noirq(). We *must* call +		 * dpm_suspend_noirq() now.  Otherwise, drivers for  		 * some devices (e.g. interrupt controllers) become  		 * desynchronized with the actual state of the  		 * hardware at resume time, and evil weirdness ensues.  		 */ -		error = device_power_down(PMSG_FREEZE); +		error = dpm_suspend_noirq(PMSG_FREEZE);  		if (error)  			goto Resume_devices;  		error = disable_nonboot_cpus(); @@ -1487,10 +1486,9 @@ int kernel_kexec(void)  		local_irq_enable();   Enable_cpus:  		enable_nonboot_cpus(); -		device_power_up(PMSG_RESTORE); +		dpm_resume_noirq(PMSG_RESTORE);   Resume_devices: -		device_pm_unlock(); -		device_resume(PMSG_RESTORE); +		dpm_resume_end(PMSG_RESTORE);   Resume_console:  		resume_console();  		thaw_processes(); diff --git a/kernel/kmod.c b/kernel/kmod.c index b750675251e..7e95bedb2bf 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -370,8 +370,10 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,  	sub_info->argv = argv;  	sub_info->envp = envp;  	sub_info->cred = prepare_usermodehelper_creds(); -	if (!sub_info->cred) +	if (!sub_info->cred) { +		kfree(sub_info);  		return NULL; +	}    out:  	return sub_info; diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ebaf8519ab..41c88fe4050 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,7 +13,7 @@  #include <linux/file.h>  #include <linux/module.h>  #include <linux/mutex.h> -#include <trace/sched.h> +#include <trace/events/sched.h>  #define KTHREAD_NICE_LEVEL (-5) @@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock);  static LIST_HEAD(kthread_create_list);  struct task_struct *kthreadd_task; -DEFINE_TRACE(sched_kthread_stop); -DEFINE_TRACE(sched_kthread_stop_ret); -  struct kthread_create_info  {  	/* Information passed to kthread() from kthreadd. 
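kthread.c above, like lockdep.c and irq/handle.c elsewhere in this series, drops its open-coded DEFINE_TRACE() lines and switches from the old <trace/*.h> headers to the generated <trace/events/*.h> ones; exactly one compilation unit per event header defines the tracepoints by setting CREATE_TRACE_POINTS before the include. A hedged sketch of that pattern for a hypothetical "widget" event header (nothing named widget exists in the tree):

/*
 * The single .c file that should emit the tracepoint definitions
 * (compare the CREATE_TRACE_POINTS block added to kernel/lockdep.c):
 */
#define CREATE_TRACE_POINTS
#include <trace/events/widget.h>

void widget_do_work(int id)
{
	/* trace_<event>() probes are generated from TRACE_EVENT(<event>, ...). */
	trace_widget_work_start(id);
	/* ... the actual work ... */
	trace_widget_work_end(id);
}

Every other file simply includes <trace/events/widget.h> without the define and calls the same trace_*() probes.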
*/ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index accb40cdb12..8bbeef996c7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,12 +42,14 @@  #include <linux/hash.h>  #include <linux/ftrace.h>  #include <linux/stringify.h> -#include <trace/lockdep.h>  #include <asm/sections.h>  #include "lockdep_internals.h" +#define CREATE_TRACE_POINTS +#include <trace/events/lockdep.h> +  #ifdef CONFIG_PROVE_LOCKING  int prove_locking = 1;  module_param(prove_locking, int, 0644); @@ -2935,8 +2937,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name,  }  EXPORT_SYMBOL_GPL(lock_set_class); -DEFINE_TRACE(lock_acquire); -  /*   * We are not always called with irqs disabled - do that here,   * and also avoid lockdep recursion: @@ -2963,8 +2963,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,  }  EXPORT_SYMBOL_GPL(lock_acquire); -DEFINE_TRACE(lock_release); -  void lock_release(struct lockdep_map *lock, int nested,  			  unsigned long ip)  { @@ -3105,6 +3103,8 @@ found_it:  		hlock->holdtime_stamp = now;  	} +	trace_lock_acquired(lock, ip, waittime); +  	stats = get_lock_stats(hlock_class(hlock));  	if (waittime) {  		if (hlock->read) @@ -3120,8 +3120,6 @@ found_it:  	lock->ip = ip;  } -DEFINE_TRACE(lock_contended); -  void lock_contended(struct lockdep_map *lock, unsigned long ip)  {  	unsigned long flags; @@ -3143,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)  }  EXPORT_SYMBOL_GPL(lock_contended); -DEFINE_TRACE(lock_acquired); -  void lock_acquired(struct lockdep_map *lock, unsigned long ip)  {  	unsigned long flags; -	trace_lock_acquired(lock, ip); -  	if (unlikely(!lock_stat))  		return; diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index a2cc7e9a6e8..699a2ac3a0d 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -54,9 +54,9 @@ enum {   * table (if it's not there yet), and we check it for lock order   * conflicts and deadlocks.   */ -#define MAX_LOCKDEP_ENTRIES	8192UL +#define MAX_LOCKDEP_ENTRIES	16384UL -#define MAX_LOCKDEP_CHAINS_BITS	14 +#define MAX_LOCKDEP_CHAINS_BITS	15  #define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS)  #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) diff --git a/kernel/module.c b/kernel/module.c index e797812a4d9..e4ab36ce767 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,6 +18,7 @@  */  #include <linux/module.h>  #include <linux/moduleloader.h> +#include <linux/ftrace_event.h>  #include <linux/init.h>  #include <linux/kallsyms.h>  #include <linux/fs.h> @@ -52,6 +53,7 @@  #include <linux/ftrace.h>  #include <linux/async.h>  #include <linux/percpu.h> +#include <linux/kmemleak.h>  #if 0  #define DEBUGP printk @@ -72,6 +74,9 @@ DEFINE_MUTEX(module_mutex);  EXPORT_SYMBOL_GPL(module_mutex);  static LIST_HEAD(modules); +/* Block module loading/unloading? */ +int modules_disabled = 0; +  /* Waiting for a module to finish initializing? 
*/  static DECLARE_WAIT_QUEUE_HEAD(module_wq); @@ -429,6 +434,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,  	unsigned long extra;  	unsigned int i;  	void *ptr; +	int cpu;  	if (align > PAGE_SIZE) {  		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", @@ -458,6 +464,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,  			if (!split_block(i, size))  				return NULL; +		/* add the per-cpu scanning areas */ +		for_each_possible_cpu(cpu) +			kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0, +				       GFP_KERNEL); +  		/* Mark allocated */  		pcpu_size[i] = -pcpu_size[i];  		return ptr; @@ -472,6 +483,7 @@ static void percpu_modfree(void *freeme)  {  	unsigned int i;  	void *ptr = __per_cpu_start + block_size(pcpu_size[0]); +	int cpu;  	/* First entry is core kernel percpu data. */  	for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { @@ -483,6 +495,10 @@ static void percpu_modfree(void *freeme)  	BUG();   free: +	/* remove the per-cpu scanning areas */ +	for_each_possible_cpu(cpu) +		kmemleak_free(freeme + per_cpu_offset(cpu)); +  	/* Merge with previous? */  	if (pcpu_size[i-1] >= 0) {  		pcpu_size[i-1] += pcpu_size[i]; @@ -777,7 +793,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,  	char name[MODULE_NAME_LEN];  	int ret, forced = 0; -	if (!capable(CAP_SYS_MODULE)) +	if (!capable(CAP_SYS_MODULE) || modules_disabled)  		return -EPERM;  	if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) @@ -1489,9 +1505,6 @@ static void free_module(struct module *mod)  	/* Free any allocated parameters. */  	destroy_params(mod->kp, mod->num_kp); -	/* release any pointers to mcount in this module */ -	ftrace_release(mod->module_core, mod->core_size); -  	/* This may be NULL, but that's OK */  	module_free(mod, mod->module_init);  	kfree(mod->args); @@ -1878,6 +1891,36 @@ static void *module_alloc_update_bounds(unsigned long size)  	return ret;  } +#ifdef CONFIG_DEBUG_KMEMLEAK +static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, +				 Elf_Shdr *sechdrs, char *secstrings) +{ +	unsigned int i; + +	/* only scan the sections containing data */ +	kmemleak_scan_area(mod->module_core, (unsigned long)mod - +			   (unsigned long)mod->module_core, +			   sizeof(struct module), GFP_KERNEL); + +	for (i = 1; i < hdr->e_shnum; i++) { +		if (!(sechdrs[i].sh_flags & SHF_ALLOC)) +			continue; +		if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 +		    && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) +			continue; + +		kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - +				   (unsigned long)mod->module_core, +				   sechdrs[i].sh_size, GFP_KERNEL); +	} +} +#else +static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, +					Elf_Shdr *sechdrs, char *secstrings) +{ +} +#endif +  /* Allocate and load the module: note that size of section 0 is always     zero, and we rely on this for optional sections. 
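The module loader hunks above annotate their allocations for kmemleak: per-cpu areas are registered with kmemleak_alloc()/kmemleak_free(), the data and bss sections become explicit scan areas, the core block is marked with kmemleak_not_leak() because its only reference is stored inside the block itself, and the init block is marked with kmemleak_ignore(). The not-a-leak case generalises to any allocation whose only pointer lives inside the object; a minimal sketch, with the blob structure and helper being illustrative only:

#include <linux/slab.h>
#include <linux/kmemleak.h>

struct blob {
	struct blob *self;	/* the only pointer to the allocation */
	char payload[64];
};

static struct blob *blob_create(void)
{
	struct blob *b = kmalloc(sizeof(*b), GFP_KERNEL);

	if (!b)
		return NULL;
	/*
	 * kmemleak would otherwise report this block as unreferenced,
	 * since the only pointer to it is stored within the block,
	 * just as load_module() marks mod->module_core above.
	 */
	kmemleak_not_leak(b);
	b->self = b;
	return b;
}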
*/  static noinline struct module *load_module(void __user *umod, @@ -1892,11 +1935,9 @@ static noinline struct module *load_module(void __user *umod,  	unsigned int symindex = 0;  	unsigned int strindex = 0;  	unsigned int modindex, versindex, infoindex, pcpuindex; -	unsigned int num_mcount;  	struct module *mod;  	long err = 0;  	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ -	unsigned long *mseg;  	mm_segment_t old_fs;  	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2050,6 +2091,12 @@ static noinline struct module *load_module(void __user *umod,  	/* Do the allocs. */  	ptr = module_alloc_update_bounds(mod->core_size); +	/* +	 * The pointer to this block is stored in the module structure +	 * which is inside the block. Just mark it as not being a +	 * leak. +	 */ +	kmemleak_not_leak(ptr);  	if (!ptr) {  		err = -ENOMEM;  		goto free_percpu; @@ -2058,6 +2105,13 @@ static noinline struct module *load_module(void __user *umod,  	mod->module_core = ptr;  	ptr = module_alloc_update_bounds(mod->init_size); +	/* +	 * The pointer to this block is stored in the module structure +	 * which is inside the block. This block doesn't need to be +	 * scanned as it contains data and code that will be freed +	 * after the module is initialized. +	 */ +	kmemleak_ignore(ptr);  	if (!ptr && mod->init_size) {  		err = -ENOMEM;  		goto free_core; @@ -2088,6 +2142,7 @@ static noinline struct module *load_module(void __user *umod,  	}  	/* Module has been moved. */  	mod = (void *)sechdrs[modindex].sh_addr; +	kmemleak_load_module(mod, hdr, sechdrs, secstrings);  #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)  	mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), @@ -2172,7 +2227,19 @@ static noinline struct module *load_module(void __user *umod,  					sizeof(*mod->tracepoints),  					&mod->num_tracepoints);  #endif - +#ifdef CONFIG_EVENT_TRACING +	mod->trace_events = section_objs(hdr, sechdrs, secstrings, +					 "_ftrace_events", +					 sizeof(*mod->trace_events), +					 &mod->num_trace_events); +#endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +	/* sechdrs[0].sh_size is always zero */ +	mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, +					     "__mcount_loc", +					     sizeof(*mod->ftrace_callsites), +					     &mod->num_ftrace_callsites); +#endif  #ifdef CONFIG_MODVERSIONS  	if ((mod->num_syms && !mod->crcs)  	    || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2237,11 +2304,6 @@ static noinline struct module *load_module(void __user *umod,  			dynamic_debug_setup(debug, num_debug);  	} -	/* sechdrs[0].sh_size is always zero */ -	mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", -			    sizeof(*mseg), &num_mcount); -	ftrace_init_module(mod, mseg, mseg + num_mcount); -  	err = module_finalize(hdr, sechdrs, mod);  	if (err < 0)  		goto cleanup; @@ -2302,7 +2364,6 @@ static noinline struct module *load_module(void __user *umod,   cleanup:  	kobject_del(&mod->mkobj.kobj);  	kobject_put(&mod->mkobj.kobj); -	ftrace_release(mod->module_core, mod->core_size);   free_unload:  	module_unload_free(mod);  #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) @@ -2336,7 +2397,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,  	int ret = 0;  	/* Must have permission */ -	if (!capable(CAP_SYS_MODULE)) +	if (!capable(CAP_SYS_MODULE) || modules_disabled)  		return -EPERM;  	/* Only one module load at a time, please */ @@ -2394,6 +2455,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,  	mutex_lock(&module_mutex);  	/* Drop initial 
reference. */  	module_put(mod); +	trim_init_extable(mod);  	module_free(mod, mod->module_init);  	mod->module_init = NULL;  	mod->init_size = 0; diff --git a/kernel/mutex.c b/kernel/mutex.c index 507cf2b5e9f..947b3ad551f 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);   *   * This function is similar to (but not equivalent to) down().   */ -void inline __sched mutex_lock(struct mutex *lock) +void __sched mutex_lock(struct mutex *lock)  {  	might_sleep();  	/* @@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		/* didnt get the lock, go to sleep: */  		spin_unlock_mutex(&lock->wait_lock, flags); -		__schedule(); +		preempt_enable_no_resched(); +		schedule(); +		preempt_disable();  		spin_lock_mutex(&lock->wait_lock, flags);  	} @@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock)  	return ret;  } -  EXPORT_SYMBOL(mutex_trylock); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ +	/* dec if we can't possibly hit 0 */ +	if (atomic_add_unless(cnt, -1, 1)) +		return 0; +	/* we might hit 0, so take the lock */ +	mutex_lock(lock); +	if (!atomic_dec_and_test(cnt)) { +		/* when we actually did the dec, we didn't hit 0 */ +		mutex_unlock(lock); +		return 0; +	} +	/* we hit 0, and we hold the lock */ +	return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/params.c b/kernel/params.c index de273ec85bd..7f6912ced2b 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,9 +24,6 @@  #include <linux/err.h>  #include <linux/slab.h> -/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */ -#define KPARAM_KMALLOCED	0x80000000 -  #if 0  #define DEBUGP printk  #else @@ -220,13 +217,13 @@ int param_set_charp(const char *val, struct kernel_param *kp)  		return -ENOSPC;  	} -	if (kp->perm & KPARAM_KMALLOCED) +	if (kp->flags & KPARAM_KMALLOCED)  		kfree(*(char **)kp->arg);  	/* This is a hack.  We can't need to strdup in early boot, and we  	 * don't need to; this mangled commandline is preserved. */  	if (slab_is_available()) { -		kp->perm |= KPARAM_KMALLOCED; +		kp->flags |= KPARAM_KMALLOCED;  		*(char **)kp->arg = kstrdup(val, GFP_KERNEL);  		if (!kp->arg)  			return -ENOMEM; @@ -241,44 +238,63 @@ int param_get_charp(char *buffer, struct kernel_param *kp)  	return sprintf(buffer, "%s", *((char **)kp->arg));  } +/* Actually could be a bool or an int, for historical reasons. */  int param_set_bool(const char *val, struct kernel_param *kp)  { +	bool v; +  	/* No equals means "set"... */  	if (!val) val = "1";  	/* One of =[yYnN01] */  	switch (val[0]) {  	case 'y': case 'Y': case '1': -		*(int *)kp->arg = 1; -		return 0; +		v = true; +		break;  	case 'n': case 'N': case '0': -		*(int *)kp->arg = 0; -		return 0; +		v = false; +		break; +	default: +		return -EINVAL;  	} -	return -EINVAL; + +	if (kp->flags & KPARAM_ISBOOL) +		*(bool *)kp->arg = v; +	else +		*(int *)kp->arg = v; +	return 0;  }  int param_get_bool(char *buffer, struct kernel_param *kp)  { +	bool val; +	if (kp->flags & KPARAM_ISBOOL) +		val = *(bool *)kp->arg; +	else +		val = *(int *)kp->arg; +  	/* Y and N chosen as being relatively non-coder friendly */ -	return sprintf(buffer, "%c", (*(int *)kp->arg) ? 
'Y' : 'N'); +	return sprintf(buffer, "%c", val ? 'Y' : 'N');  } +/* This one must be bool. */  int param_set_invbool(const char *val, struct kernel_param *kp)  { -	int boolval, ret; +	int ret; +	bool boolval;  	struct kernel_param dummy;  	dummy.arg = &boolval; +	dummy.flags = KPARAM_ISBOOL;  	ret = param_set_bool(val, &dummy);  	if (ret == 0) -		*(int *)kp->arg = !boolval; +		*(bool *)kp->arg = !boolval;  	return ret;  }  int param_get_invbool(char *buffer, struct kernel_param *kp)  { -	return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y'); +	return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');  }  /* We break the rule and mangle the string. */ @@ -591,7 +607,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)  	unsigned int i;  	for (i = 0; i < num; i++) -		if (params[i].perm & KPARAM_KMALLOCED) +		if (params[i].flags & KPARAM_KMALLOCED)  			kfree(*(char **)params[i].arg);  } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c new file mode 100644 index 00000000000..29b685f551a --- /dev/null +++ b/kernel/perf_counter.c @@ -0,0 +1,4339 @@ +/* + * Performance counter core code + * + *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + *  For licensing details see kernel-base/COPYING + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/file.h> +#include <linux/poll.h> +#include <linux/sysfs.h> +#include <linux/dcache.h> +#include <linux/percpu.h> +#include <linux/ptrace.h> +#include <linux/vmstat.h> +#include <linux/hardirq.h> +#include <linux/rculist.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/anon_inodes.h> +#include <linux/kernel_stat.h> +#include <linux/perf_counter.h> + +#include <asm/irq_regs.h> + +/* + * Each CPU has a list of per CPU counters: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_counters __read_mostly = 1; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +static atomic_t nr_counters __read_mostly; +static atomic_t nr_mmap_counters __read_mostly; +static atomic_t nr_comm_counters __read_mostly; + +/* + * perf counter paranoia level: + *  0 - not paranoid + *  1 - disallow cpu counters to unpriv + *  2 - disallow kernel profiling to unpriv + */ +int sysctl_perf_counter_paranoid __read_mostly; + +static inline bool perf_paranoid_cpu(void) +{ +	return sysctl_perf_counter_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ +	return sysctl_perf_counter_paranoid > 1; +} + +int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ + +/* + * max perf counter sample rate + */ +int sysctl_perf_counter_sample_rate __read_mostly = 100000; + +static atomic64_t perf_counter_id; + +/* + * Lock for (sysadmin-configurable) counter reservations: + */ +static DEFINE_SPINLOCK(perf_resource_lock); + +/* + * Architecture provided APIs - weak aliases: + */ +extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +{ +	return NULL; +} + +void __weak hw_perf_disable(void)		{ barrier(); } +void __weak hw_perf_enable(void)		{ barrier(); } + +void __weak hw_perf_counter_setup(int cpu)	{ barrier(); } + +int __weak +hw_perf_group_sched_in(struct perf_counter *group_leader, +	       struct perf_cpu_context 
*cpuctx, +	       struct perf_counter_context *ctx, int cpu) +{ +	return 0; +} + +void __weak perf_counter_print_debug(void)	{ } + +static DEFINE_PER_CPU(int, disable_count); + +void __perf_disable(void) +{ +	__get_cpu_var(disable_count)++; +} + +bool __perf_enable(void) +{ +	return !--__get_cpu_var(disable_count); +} + +void perf_disable(void) +{ +	__perf_disable(); +	hw_perf_disable(); +} + +void perf_enable(void) +{ +	if (__perf_enable()) +		hw_perf_enable(); +} + +static void get_ctx(struct perf_counter_context *ctx) +{ +	atomic_inc(&ctx->refcount); +} + +static void free_ctx(struct rcu_head *head) +{ +	struct perf_counter_context *ctx; + +	ctx = container_of(head, struct perf_counter_context, rcu_head); +	kfree(ctx); +} + +static void put_ctx(struct perf_counter_context *ctx) +{ +	if (atomic_dec_and_test(&ctx->refcount)) { +		if (ctx->parent_ctx) +			put_ctx(ctx->parent_ctx); +		if (ctx->task) +			put_task_struct(ctx->task); +		call_rcu(&ctx->rcu_head, free_ctx); +	} +} + +/* + * Get the perf_counter_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_counter_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) +{ +	struct perf_counter_context *ctx; + +	rcu_read_lock(); + retry: +	ctx = rcu_dereference(task->perf_counter_ctxp); +	if (ctx) { +		/* +		 * If this context is a clone of another, it might +		 * get swapped for another underneath us by +		 * perf_counter_task_sched_out, though the +		 * rcu_read_lock() protects us from any context +		 * getting freed.  Lock the context and check if it +		 * got swapped before we could get the lock, and retry +		 * if so.  If we locked the right context, then it +		 * can't get swapped on us any more. +		 */ +		spin_lock_irqsave(&ctx->lock, *flags); +		if (ctx != rcu_dereference(task->perf_counter_ctxp)) { +			spin_unlock_irqrestore(&ctx->lock, *flags); +			goto retry; +		} +	} +	rcu_read_unlock(); +	return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task.  This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) +{ +	struct perf_counter_context *ctx; +	unsigned long flags; + +	ctx = perf_lock_task_context(task, &flags); +	if (ctx) { +		++ctx->pin_count; +		get_ctx(ctx); +		spin_unlock_irqrestore(&ctx->lock, flags); +	} +	return ctx; +} + +static void perf_unpin_context(struct perf_counter_context *ctx) +{ +	unsigned long flags; + +	spin_lock_irqsave(&ctx->lock, flags); +	--ctx->pin_count; +	spin_unlock_irqrestore(&ctx->lock, flags); +	put_ctx(ctx); +} + +/* + * Add a counter from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. 
+ */ +static void +list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ +	struct perf_counter *group_leader = counter->group_leader; + +	/* +	 * Depending on whether it is a standalone or sibling counter, +	 * add it straight to the context's counter list, or to the group +	 * leader's sibling list: +	 */ +	if (group_leader == counter) +		list_add_tail(&counter->list_entry, &ctx->counter_list); +	else { +		list_add_tail(&counter->list_entry, &group_leader->sibling_list); +		group_leader->nr_siblings++; +	} + +	list_add_rcu(&counter->event_entry, &ctx->event_list); +	ctx->nr_counters++; +} + +/* + * Remove a counter from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ +	struct perf_counter *sibling, *tmp; + +	if (list_empty(&counter->list_entry)) +		return; +	ctx->nr_counters--; + +	list_del_init(&counter->list_entry); +	list_del_rcu(&counter->event_entry); + +	if (counter->group_leader != counter) +		counter->group_leader->nr_siblings--; + +	/* +	 * If this was a group counter with sibling counters then +	 * upgrade the siblings to singleton counters by adding them +	 * to the context list directly: +	 */ +	list_for_each_entry_safe(sibling, tmp, +				 &counter->sibling_list, list_entry) { + +		list_move_tail(&sibling->list_entry, &ctx->counter_list); +		sibling->group_leader = sibling; +	} +} + +static void +counter_sched_out(struct perf_counter *counter, +		  struct perf_cpu_context *cpuctx, +		  struct perf_counter_context *ctx) +{ +	if (counter->state != PERF_COUNTER_STATE_ACTIVE) +		return; + +	counter->state = PERF_COUNTER_STATE_INACTIVE; +	counter->tstamp_stopped = ctx->time; +	counter->pmu->disable(counter); +	counter->oncpu = -1; + +	if (!is_software_counter(counter)) +		cpuctx->active_oncpu--; +	ctx->nr_active--; +	if (counter->attr.exclusive || !cpuctx->active_oncpu) +		cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_counter *group_counter, +		struct perf_cpu_context *cpuctx, +		struct perf_counter_context *ctx) +{ +	struct perf_counter *counter; + +	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) +		return; + +	counter_sched_out(group_counter, cpuctx, ctx); + +	/* +	 * Schedule out siblings (if any): +	 */ +	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) +		counter_sched_out(counter, cpuctx, ctx); + +	if (group_counter->attr.exclusive) +		cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance counter + * + * We disable the counter on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_counter_remove_from_context(void *info) +{ +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); +	struct perf_counter *counter = info; +	struct perf_counter_context *ctx = counter->ctx; + +	/* +	 * If this is a task context, we need to check whether it is +	 * the current task context of this cpu. If not it has been +	 * scheduled out before the smp call arrived. +	 */ +	if (ctx->task && cpuctx->task_ctx != ctx) +		return; + +	spin_lock(&ctx->lock); +	/* +	 * Protect the list operation against NMI by disabling the +	 * counters on a global level. 
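perf_disable() and perf_enable(), defined earlier in this new file, keep a per-cpu disable_count so the calls nest: hw_perf_disable() is invoked on every perf_disable(), while hw_perf_enable() only runs once the outermost perf_enable() brings the count back to zero. A small sketch of that nesting with a hypothetical caller:

static void touch_two_contexts(void)
{
	perf_disable();		/* disable_count 0 -> 1, hardware disabled     */
	/* ... update the first context ... */
	perf_disable();		/* disable_count 1 -> 2, hardware already off  */
	/* ... update the second context ... */
	perf_enable();		/* disable_count 2 -> 1, hardware stays off    */
	perf_enable();		/* disable_count 1 -> 0, hw_perf_enable() runs */
}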
+	 */ +	perf_disable(); + +	counter_sched_out(counter, cpuctx, ctx); + +	list_del_counter(counter, ctx); + +	if (!ctx->task) { +		/* +		 * Allow more per task counters with respect to the +		 * reservation: +		 */ +		cpuctx->max_pertask = +			min(perf_max_counters - ctx->nr_counters, +			    perf_max_counters - perf_reserved_percpu); +	} + +	perf_enable(); +	spin_unlock(&ctx->lock); +} + + +/* + * Remove the counter from a task's (or a CPU's) list of counters. + * + * Must be called with ctx->mutex held. + * + * CPU counters are removed with a smp call. For task counters we only + * call when the task is on a CPU. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid.  This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_counter_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_counter_remove_from_context(struct perf_counter *counter) +{ +	struct perf_counter_context *ctx = counter->ctx; +	struct task_struct *task = ctx->task; + +	if (!task) { +		/* +		 * Per cpu counters are removed via an smp call and +		 * the removal is always sucessful. +		 */ +		smp_call_function_single(counter->cpu, +					 __perf_counter_remove_from_context, +					 counter, 1); +		return; +	} + +retry: +	task_oncpu_function_call(task, __perf_counter_remove_from_context, +				 counter); + +	spin_lock_irq(&ctx->lock); +	/* +	 * If the context is active we need to retry the smp call. +	 */ +	if (ctx->nr_active && !list_empty(&counter->list_entry)) { +		spin_unlock_irq(&ctx->lock); +		goto retry; +	} + +	/* +	 * The lock prevents that this context is scheduled in so we +	 * can remove the counter safely, if the call above did not +	 * succeed. +	 */ +	if (!list_empty(&counter->list_entry)) { +		list_del_counter(counter, ctx); +	} +	spin_unlock_irq(&ctx->lock); +} + +static inline u64 perf_clock(void) +{ +	return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_counter_context *ctx) +{ +	u64 now = perf_clock(); + +	ctx->time += now - ctx->timestamp; +	ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a counter. + */ +static void update_counter_times(struct perf_counter *counter) +{ +	struct perf_counter_context *ctx = counter->ctx; +	u64 run_end; + +	if (counter->state < PERF_COUNTER_STATE_INACTIVE) +		return; + +	counter->total_time_enabled = ctx->time - counter->tstamp_enabled; + +	if (counter->state == PERF_COUNTER_STATE_INACTIVE) +		run_end = counter->tstamp_stopped; +	else +		run_end = ctx->time; + +	counter->total_time_running = run_end - counter->tstamp_running; +} + +/* + * Update total_time_enabled and total_time_running for all counters in a group. 
+ */ +static void update_group_times(struct perf_counter *leader) +{ +	struct perf_counter *counter; + +	update_counter_times(leader); +	list_for_each_entry(counter, &leader->sibling_list, list_entry) +		update_counter_times(counter); +} + +/* + * Cross CPU call to disable a performance counter + */ +static void __perf_counter_disable(void *info) +{ +	struct perf_counter *counter = info; +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); +	struct perf_counter_context *ctx = counter->ctx; + +	/* +	 * If this is a per-task counter, need to check whether this +	 * counter's task is the current task on this cpu. +	 */ +	if (ctx->task && cpuctx->task_ctx != ctx) +		return; + +	spin_lock(&ctx->lock); + +	/* +	 * If the counter is on, turn it off. +	 * If it is in error state, leave it in error state. +	 */ +	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { +		update_context_time(ctx); +		update_counter_times(counter); +		if (counter == counter->group_leader) +			group_sched_out(counter, cpuctx, ctx); +		else +			counter_sched_out(counter, cpuctx, ctx); +		counter->state = PERF_COUNTER_STATE_OFF; +	} + +	spin_unlock(&ctx->lock); +} + +/* + * Disable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid.  This condition is satisifed when called through + * perf_counter_for_each_child or perf_counter_for_each because they + * hold the top-level counter's child_mutex, so any descendant that + * goes to exit will block in sync_child_counter. + * When called from perf_pending_counter it's OK because counter->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_counter_task_sched_out for this context. + */ +static void perf_counter_disable(struct perf_counter *counter) +{ +	struct perf_counter_context *ctx = counter->ctx; +	struct task_struct *task = ctx->task; + +	if (!task) { +		/* +		 * Disable the counter on the cpu that it's on +		 */ +		smp_call_function_single(counter->cpu, __perf_counter_disable, +					 counter, 1); +		return; +	} + + retry: +	task_oncpu_function_call(task, __perf_counter_disable, counter); + +	spin_lock_irq(&ctx->lock); +	/* +	 * If the counter is still active, we need to retry the cross-call. +	 */ +	if (counter->state == PERF_COUNTER_STATE_ACTIVE) { +		spin_unlock_irq(&ctx->lock); +		goto retry; +	} + +	/* +	 * Since we have the lock this context can't be scheduled +	 * in, so we can change the state safely. 
+	 */ +	if (counter->state == PERF_COUNTER_STATE_INACTIVE) { +		update_counter_times(counter); +		counter->state = PERF_COUNTER_STATE_OFF; +	} + +	spin_unlock_irq(&ctx->lock); +} + +static int +counter_sched_in(struct perf_counter *counter, +		 struct perf_cpu_context *cpuctx, +		 struct perf_counter_context *ctx, +		 int cpu) +{ +	if (counter->state <= PERF_COUNTER_STATE_OFF) +		return 0; + +	counter->state = PERF_COUNTER_STATE_ACTIVE; +	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */ +	/* +	 * The new state must be visible before we turn it on in the hardware: +	 */ +	smp_wmb(); + +	if (counter->pmu->enable(counter)) { +		counter->state = PERF_COUNTER_STATE_INACTIVE; +		counter->oncpu = -1; +		return -EAGAIN; +	} + +	counter->tstamp_running += ctx->time - counter->tstamp_stopped; + +	if (!is_software_counter(counter)) +		cpuctx->active_oncpu++; +	ctx->nr_active++; + +	if (counter->attr.exclusive) +		cpuctx->exclusive = 1; + +	return 0; +} + +static int +group_sched_in(struct perf_counter *group_counter, +	       struct perf_cpu_context *cpuctx, +	       struct perf_counter_context *ctx, +	       int cpu) +{ +	struct perf_counter *counter, *partial_group; +	int ret; + +	if (group_counter->state == PERF_COUNTER_STATE_OFF) +		return 0; + +	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); +	if (ret) +		return ret < 0 ? ret : 0; + +	if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) +		return -EAGAIN; + +	/* +	 * Schedule in siblings as one group (if any): +	 */ +	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { +		if (counter_sched_in(counter, cpuctx, ctx, cpu)) { +			partial_group = counter; +			goto group_error; +		} +	} + +	return 0; + +group_error: +	/* +	 * Groups can be scheduled in as one unit only, so undo any +	 * partial group before returning: +	 */ +	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { +		if (counter == partial_group) +			break; +		counter_sched_out(counter, cpuctx, ctx); +	} +	counter_sched_out(group_counter, cpuctx, ctx); + +	return -EAGAIN; +} + +/* + * Return 1 for a group consisting entirely of software counters, + * 0 if the group contains any hardware counters. + */ +static int is_software_only_group(struct perf_counter *leader) +{ +	struct perf_counter *counter; + +	if (!is_software_counter(leader)) +		return 0; + +	list_for_each_entry(counter, &leader->sibling_list, list_entry) +		if (!is_software_counter(counter)) +			return 0; + +	return 1; +} + +/* + * Work out whether we can put this counter group on the CPU now. + */ +static int group_can_go_on(struct perf_counter *counter, +			   struct perf_cpu_context *cpuctx, +			   int can_add_hw) +{ +	/* +	 * Groups consisting entirely of software counters can always go on. +	 */ +	if (is_software_only_group(counter)) +		return 1; +	/* +	 * If an exclusive group is already on, no other hardware +	 * counters can go on. +	 */ +	if (cpuctx->exclusive) +		return 0; +	/* +	 * If this group is exclusive and there are already +	 * counters on the CPU, it can't go on. +	 */ +	if (counter->attr.exclusive && cpuctx->active_oncpu) +		return 0; +	/* +	 * Otherwise, try to add it if all previous groups were able +	 * to go on. 
+	 */ +	return can_add_hw; +} + +static void add_counter_to_ctx(struct perf_counter *counter, +			       struct perf_counter_context *ctx) +{ +	list_add_counter(counter, ctx); +	counter->tstamp_enabled = ctx->time; +	counter->tstamp_running = ctx->time; +	counter->tstamp_stopped = ctx->time; +} + +/* + * Cross CPU call to install and enable a performance counter + * + * Must be called with ctx->mutex held + */ +static void __perf_install_in_context(void *info) +{ +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); +	struct perf_counter *counter = info; +	struct perf_counter_context *ctx = counter->ctx; +	struct perf_counter *leader = counter->group_leader; +	int cpu = smp_processor_id(); +	int err; + +	/* +	 * If this is a task context, we need to check whether it is +	 * the current task context of this cpu. If not it has been +	 * scheduled out before the smp call arrived. +	 * Or possibly this is the right context but it isn't +	 * on this cpu because it had no counters. +	 */ +	if (ctx->task && cpuctx->task_ctx != ctx) { +		if (cpuctx->task_ctx || ctx->task != current) +			return; +		cpuctx->task_ctx = ctx; +	} + +	spin_lock(&ctx->lock); +	ctx->is_active = 1; +	update_context_time(ctx); + +	/* +	 * Protect the list operation against NMI by disabling the +	 * counters on a global level. NOP for non NMI based counters. +	 */ +	perf_disable(); + +	add_counter_to_ctx(counter, ctx); + +	/* +	 * Don't put the counter on if it is disabled or if +	 * it is in a group and the group isn't on. +	 */ +	if (counter->state != PERF_COUNTER_STATE_INACTIVE || +	    (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) +		goto unlock; + +	/* +	 * An exclusive counter can't go on if there are already active +	 * hardware counters, and no hardware counter can go on if there +	 * is already an exclusive counter on. +	 */ +	if (!group_can_go_on(counter, cpuctx, 1)) +		err = -EEXIST; +	else +		err = counter_sched_in(counter, cpuctx, ctx, cpu); + +	if (err) { +		/* +		 * This counter couldn't go on.  If it is in a group +		 * then we have to pull the whole group off. +		 * If the counter group is pinned then put it in error state. +		 */ +		if (leader != counter) +			group_sched_out(leader, cpuctx, ctx); +		if (leader->attr.pinned) { +			update_group_times(leader); +			leader->state = PERF_COUNTER_STATE_ERROR; +		} +	} + +	if (!err && !ctx->task && cpuctx->max_pertask) +		cpuctx->max_pertask--; + + unlock: +	perf_enable(); + +	spin_unlock(&ctx->lock); +} + +/* + * Attach a performance counter to a context + * + * First we add the counter to the list with the hardware enable bit + * in counter->hw_config cleared. + * + * If the counter is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. + */ +static void +perf_install_in_context(struct perf_counter_context *ctx, +			struct perf_counter *counter, +			int cpu) +{ +	struct task_struct *task = ctx->task; + +	if (!task) { +		/* +		 * Per cpu counters are installed via an smp call and +		 * the install is always sucessful. +		 */ +		smp_call_function_single(cpu, __perf_install_in_context, +					 counter, 1); +		return; +	} + +retry: +	task_oncpu_function_call(task, __perf_install_in_context, +				 counter); + +	spin_lock_irq(&ctx->lock); +	/* +	 * we need to retry the smp call. 
+	 */ +	if (ctx->is_active && list_empty(&counter->list_entry)) { +		spin_unlock_irq(&ctx->lock); +		goto retry; +	} + +	/* +	 * The lock prevents that this context is scheduled in so we +	 * can add the counter safely, if it the call above did not +	 * succeed. +	 */ +	if (list_empty(&counter->list_entry)) +		add_counter_to_ctx(counter, ctx); +	spin_unlock_irq(&ctx->lock); +} + +/* + * Cross CPU call to enable a performance counter + */ +static void __perf_counter_enable(void *info) +{ +	struct perf_counter *counter = info; +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); +	struct perf_counter_context *ctx = counter->ctx; +	struct perf_counter *leader = counter->group_leader; +	int err; + +	/* +	 * If this is a per-task counter, need to check whether this +	 * counter's task is the current task on this cpu. +	 */ +	if (ctx->task && cpuctx->task_ctx != ctx) { +		if (cpuctx->task_ctx || ctx->task != current) +			return; +		cpuctx->task_ctx = ctx; +	} + +	spin_lock(&ctx->lock); +	ctx->is_active = 1; +	update_context_time(ctx); + +	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) +		goto unlock; +	counter->state = PERF_COUNTER_STATE_INACTIVE; +	counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + +	/* +	 * If the counter is in a group and isn't the group leader, +	 * then don't put it on unless the group is on. +	 */ +	if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) +		goto unlock; + +	if (!group_can_go_on(counter, cpuctx, 1)) { +		err = -EEXIST; +	} else { +		perf_disable(); +		if (counter == leader) +			err = group_sched_in(counter, cpuctx, ctx, +					     smp_processor_id()); +		else +			err = counter_sched_in(counter, cpuctx, ctx, +					       smp_processor_id()); +		perf_enable(); +	} + +	if (err) { +		/* +		 * If this counter can't go on and it's part of a +		 * group, then the whole group has to come off. +		 */ +		if (leader != counter) +			group_sched_out(leader, cpuctx, ctx); +		if (leader->attr.pinned) { +			update_group_times(leader); +			leader->state = PERF_COUNTER_STATE_ERROR; +		} +	} + + unlock: +	spin_unlock(&ctx->lock); +} + +/* + * Enable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid.  This condition is satisfied when called through + * perf_counter_for_each_child or perf_counter_for_each as described + * for perf_counter_disable. + */ +static void perf_counter_enable(struct perf_counter *counter) +{ +	struct perf_counter_context *ctx = counter->ctx; +	struct task_struct *task = ctx->task; + +	if (!task) { +		/* +		 * Enable the counter on the cpu that it's on +		 */ +		smp_call_function_single(counter->cpu, __perf_counter_enable, +					 counter, 1); +		return; +	} + +	spin_lock_irq(&ctx->lock); +	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) +		goto out; + +	/* +	 * If the counter is in error state, clear that first. +	 * That way, if we see the counter in error state below, we +	 * know that it has gone back into error state, as distinct +	 * from the task having been scheduled away before the +	 * cross-call arrived. +	 */ +	if (counter->state == PERF_COUNTER_STATE_ERROR) +		counter->state = PERF_COUNTER_STATE_OFF; + + retry: +	spin_unlock_irq(&ctx->lock); +	task_oncpu_function_call(task, __perf_counter_enable, counter); + +	spin_lock_irq(&ctx->lock); + +	/* +	 * If the context is active and the counter is still off, +	 * we need to retry the cross-call. 
+	 */ +	if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) +		goto retry; + +	/* +	 * Since we have the lock this context can't be scheduled +	 * in, so we can change the state safely. +	 */ +	if (counter->state == PERF_COUNTER_STATE_OFF) { +		counter->state = PERF_COUNTER_STATE_INACTIVE; +		counter->tstamp_enabled = +			ctx->time - counter->total_time_enabled; +	} + out: +	spin_unlock_irq(&ctx->lock); +} + +static int perf_counter_refresh(struct perf_counter *counter, int refresh) +{ +	/* +	 * not supported on inherited counters +	 */ +	if (counter->attr.inherit) +		return -EINVAL; + +	atomic_add(refresh, &counter->event_limit); +	perf_counter_enable(counter); + +	return 0; +} + +void __perf_counter_sched_out(struct perf_counter_context *ctx, +			      struct perf_cpu_context *cpuctx) +{ +	struct perf_counter *counter; + +	spin_lock(&ctx->lock); +	ctx->is_active = 0; +	if (likely(!ctx->nr_counters)) +		goto out; +	update_context_time(ctx); + +	perf_disable(); +	if (ctx->nr_active) { +		list_for_each_entry(counter, &ctx->counter_list, list_entry) { +			if (counter != counter->group_leader) +				counter_sched_out(counter, cpuctx, ctx); +			else +				group_sched_out(counter, cpuctx, ctx); +		} +	} +	perf_enable(); + out: +	spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled counters. + * If the number of enabled counters is the same, then the set + * of enabled counters should be the same, because these are both + * inherited contexts, therefore we can't access individual counters + * in them directly with an fd; we can only enable/disable all + * counters via prctl, or enable/disable all counters in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_counter_context *ctx1, +			 struct perf_counter_context *ctx2) +{ +	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx +		&& ctx1->parent_gen == ctx2->parent_gen +		&& !ctx1->pin_count && !ctx2->pin_count; +} + +/* + * Called from scheduler to remove the counters of the current task, + * with interrupts disabled. + * + * We stop each counter and update the counter value in counter->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * not restart the counter. + */ +void perf_counter_task_sched_out(struct task_struct *task, +				 struct task_struct *next, int cpu) +{ +	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); +	struct perf_counter_context *ctx = task->perf_counter_ctxp; +	struct perf_counter_context *next_ctx; +	struct perf_counter_context *parent; +	struct pt_regs *regs; +	int do_switch = 1; + +	regs = task_pt_regs(task); +	perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + +	if (likely(!ctx || !cpuctx->task_ctx)) +		return; + +	update_context_time(ctx); + +	rcu_read_lock(); +	parent = rcu_dereference(ctx->parent_ctx); +	next_ctx = next->perf_counter_ctxp; +	if (parent && next_ctx && +	    rcu_dereference(next_ctx->parent_ctx) == parent) { +		/* +		 * Looks like the two contexts are clones, so we might be +		 * able to optimize the context switch.  
We lock both +		 * contexts and check that they are clones under the +		 * lock (including re-checking that neither has been +		 * uncloned in the meantime).  It doesn't matter which +		 * order we take the locks because no other cpu could +		 * be trying to lock both of these tasks. +		 */ +		spin_lock(&ctx->lock); +		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); +		if (context_equiv(ctx, next_ctx)) { +			/* +			 * XXX do we need a memory barrier of sorts +			 * wrt to rcu_dereference() of perf_counter_ctxp +			 */ +			task->perf_counter_ctxp = next_ctx; +			next->perf_counter_ctxp = ctx; +			ctx->task = next; +			next_ctx->task = task; +			do_switch = 0; +		} +		spin_unlock(&next_ctx->lock); +		spin_unlock(&ctx->lock); +	} +	rcu_read_unlock(); + +	if (do_switch) { +		__perf_counter_sched_out(ctx, cpuctx); +		cpuctx->task_ctx = NULL; +	} +} + +/* + * Called with IRQs disabled + */ +static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) +{ +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + +	if (!cpuctx->task_ctx) +		return; + +	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) +		return; + +	__perf_counter_sched_out(ctx, cpuctx); +	cpuctx->task_ctx = NULL; +} + +/* + * Called with IRQs disabled + */ +static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) +{ +	__perf_counter_sched_out(&cpuctx->ctx, cpuctx); +} + +static void +__perf_counter_sched_in(struct perf_counter_context *ctx, +			struct perf_cpu_context *cpuctx, int cpu) +{ +	struct perf_counter *counter; +	int can_add_hw = 1; + +	spin_lock(&ctx->lock); +	ctx->is_active = 1; +	if (likely(!ctx->nr_counters)) +		goto out; + +	ctx->timestamp = perf_clock(); + +	perf_disable(); + +	/* +	 * First go through the list and put on any pinned groups +	 * in order to give them the best chance of going on. +	 */ +	list_for_each_entry(counter, &ctx->counter_list, list_entry) { +		if (counter->state <= PERF_COUNTER_STATE_OFF || +		    !counter->attr.pinned) +			continue; +		if (counter->cpu != -1 && counter->cpu != cpu) +			continue; + +		if (counter != counter->group_leader) +			counter_sched_in(counter, cpuctx, ctx, cpu); +		else { +			if (group_can_go_on(counter, cpuctx, 1)) +				group_sched_in(counter, cpuctx, ctx, cpu); +		} + +		/* +		 * If this pinned group hasn't been scheduled, +		 * put it in error state. +		 */ +		if (counter->state == PERF_COUNTER_STATE_INACTIVE) { +			update_group_times(counter); +			counter->state = PERF_COUNTER_STATE_ERROR; +		} +	} + +	list_for_each_entry(counter, &ctx->counter_list, list_entry) { +		/* +		 * Ignore counters in OFF or ERROR state, and +		 * ignore pinned counters since we did them already. +		 */ +		if (counter->state <= PERF_COUNTER_STATE_OFF || +		    counter->attr.pinned) +			continue; + +		/* +		 * Listen to the 'cpu' scheduling filter constraint +		 * of counters: +		 */ +		if (counter->cpu != -1 && counter->cpu != cpu) +			continue; + +		if (counter != counter->group_leader) { +			if (counter_sched_in(counter, cpuctx, ctx, cpu)) +				can_add_hw = 0; +		} else { +			if (group_can_go_on(counter, cpuctx, can_add_hw)) { +				if (group_sched_in(counter, cpuctx, ctx, cpu)) +					can_add_hw = 0; +			} +		} +	} +	perf_enable(); + out: +	spin_unlock(&ctx->lock); +} + +/* + * Called from scheduler to add the counters of the current task + * with interrupts disabled. + * + * We restore the counter value and then enable it. 
+ * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * keep the counter running. + */ +void perf_counter_task_sched_in(struct task_struct *task, int cpu) +{ +	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); +	struct perf_counter_context *ctx = task->perf_counter_ctxp; + +	if (likely(!ctx)) +		return; +	if (cpuctx->task_ctx == ctx) +		return; +	__perf_counter_sched_in(ctx, cpuctx, cpu); +	cpuctx->task_ctx = ctx; +} + +static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +{ +	struct perf_counter_context *ctx = &cpuctx->ctx; + +	__perf_counter_sched_in(ctx, cpuctx, cpu); +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_counter *counter, int enable); +static void perf_log_period(struct perf_counter *counter, u64 period); + +static void perf_adjust_period(struct perf_counter *counter, u64 events) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	u64 period, sample_period; +	s64 delta; + +	events *= hwc->sample_period; +	period = div64_u64(events, counter->attr.sample_freq); + +	delta = (s64)(period - hwc->sample_period); +	delta = (delta + 7) / 8; /* low pass filter */ + +	sample_period = hwc->sample_period + delta; + +	if (!sample_period) +		sample_period = 1; + +	perf_log_period(counter, sample_period); + +	hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) +{ +	struct perf_counter *counter; +	struct hw_perf_counter *hwc; +	u64 interrupts, freq; + +	spin_lock(&ctx->lock); +	list_for_each_entry(counter, &ctx->counter_list, list_entry) { +		if (counter->state != PERF_COUNTER_STATE_ACTIVE) +			continue; + +		hwc = &counter->hw; + +		interrupts = hwc->interrupts; +		hwc->interrupts = 0; + +		/* +		 * unthrottle counters on the tick +		 */ +		if (interrupts == MAX_INTERRUPTS) { +			perf_log_throttle(counter, 1); +			counter->pmu->unthrottle(counter); +			interrupts = 2*sysctl_perf_counter_sample_rate/HZ; +		} + +		if (!counter->attr.freq || !counter->attr.sample_freq) +			continue; + +		/* +		 * if the specified freq < HZ then we need to skip ticks +		 */ +		if (counter->attr.sample_freq < HZ) { +			freq = counter->attr.sample_freq; + +			hwc->freq_count += freq; +			hwc->freq_interrupts += interrupts; + +			if (hwc->freq_count < HZ) +				continue; + +			interrupts = hwc->freq_interrupts; +			hwc->freq_interrupts = 0; +			hwc->freq_count -= HZ; +		} else +			freq = HZ; + +		perf_adjust_period(counter, freq * interrupts); + +		/* +		 * In order to avoid being stalled by an (accidental) huge +		 * sample period, force reset the sample period if we didn't +		 * get any events in this freq period. 
+		 */ +		if (!interrupts) { +			perf_disable(); +			counter->pmu->disable(counter); +			atomic_set(&hwc->period_left, 0); +			counter->pmu->enable(counter); +			perf_enable(); +		} +	} +	spin_unlock(&ctx->lock); +} + +/* + * Round-robin a context's counters: + */ +static void rotate_ctx(struct perf_counter_context *ctx) +{ +	struct perf_counter *counter; + +	if (!ctx->nr_counters) +		return; + +	spin_lock(&ctx->lock); +	/* +	 * Rotate the first entry last (works just fine for group counters too): +	 */ +	perf_disable(); +	list_for_each_entry(counter, &ctx->counter_list, list_entry) { +		list_move_tail(&counter->list_entry, &ctx->counter_list); +		break; +	} +	perf_enable(); + +	spin_unlock(&ctx->lock); +} + +void perf_counter_task_tick(struct task_struct *curr, int cpu) +{ +	struct perf_cpu_context *cpuctx; +	struct perf_counter_context *ctx; + +	if (!atomic_read(&nr_counters)) +		return; + +	cpuctx = &per_cpu(perf_cpu_context, cpu); +	ctx = curr->perf_counter_ctxp; + +	perf_ctx_adjust_freq(&cpuctx->ctx); +	if (ctx) +		perf_ctx_adjust_freq(ctx); + +	perf_counter_cpu_sched_out(cpuctx); +	if (ctx) +		__perf_counter_task_sched_out(ctx); + +	rotate_ctx(&cpuctx->ctx); +	if (ctx) +		rotate_ctx(ctx); + +	perf_counter_cpu_sched_in(cpuctx, cpu); +	if (ctx) +		perf_counter_task_sched_in(curr, cpu); +} + +/* + * Cross CPU call to read the hardware counter + */ +static void __read(void *info) +{ +	struct perf_counter *counter = info; +	struct perf_counter_context *ctx = counter->ctx; +	unsigned long flags; + +	local_irq_save(flags); +	if (ctx->is_active) +		update_context_time(ctx); +	counter->pmu->read(counter); +	update_counter_times(counter); +	local_irq_restore(flags); +} + +static u64 perf_counter_read(struct perf_counter *counter) +{ +	/* +	 * If counter is enabled and currently active on a CPU, update the +	 * value in the counter structure: +	 */ +	if (counter->state == PERF_COUNTER_STATE_ACTIVE) { +		smp_call_function_single(counter->oncpu, +					 __read, counter, 1); +	} else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { +		update_counter_times(counter); +	} + +	return atomic64_read(&counter->count); +} + +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, +			    struct task_struct *task) +{ +	memset(ctx, 0, sizeof(*ctx)); +	spin_lock_init(&ctx->lock); +	mutex_init(&ctx->mutex); +	INIT_LIST_HEAD(&ctx->counter_list); +	INIT_LIST_HEAD(&ctx->event_list); +	atomic_set(&ctx->refcount, 1); +	ctx->task = task; +} + +static struct perf_counter_context *find_get_context(pid_t pid, int cpu) +{ +	struct perf_counter_context *parent_ctx; +	struct perf_counter_context *ctx; +	struct perf_cpu_context *cpuctx; +	struct task_struct *task; +	unsigned long flags; +	int err; + +	/* +	 * If cpu is not a wildcard then this is a percpu counter: +	 */ +	if (cpu != -1) { +		/* Must be root to operate on a CPU counter: */ +		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) +			return ERR_PTR(-EACCES); + +		if (cpu < 0 || cpu > num_possible_cpus()) +			return ERR_PTR(-EINVAL); + +		/* +		 * We could be clever and allow to attach a counter to an +		 * offline CPU and activate it when the CPU comes up, but +		 * that's for later. 
+		 */ +		if (!cpu_isset(cpu, cpu_online_map)) +			return ERR_PTR(-ENODEV); + +		cpuctx = &per_cpu(perf_cpu_context, cpu); +		ctx = &cpuctx->ctx; +		get_ctx(ctx); + +		return ctx; +	} + +	rcu_read_lock(); +	if (!pid) +		task = current; +	else +		task = find_task_by_vpid(pid); +	if (task) +		get_task_struct(task); +	rcu_read_unlock(); + +	if (!task) +		return ERR_PTR(-ESRCH); + +	/* +	 * Can't attach counters to a dying task. +	 */ +	err = -ESRCH; +	if (task->flags & PF_EXITING) +		goto errout; + +	/* Reuse ptrace permission checks for now. */ +	err = -EACCES; +	if (!ptrace_may_access(task, PTRACE_MODE_READ)) +		goto errout; + + retry: +	ctx = perf_lock_task_context(task, &flags); +	if (ctx) { +		parent_ctx = ctx->parent_ctx; +		if (parent_ctx) { +			put_ctx(parent_ctx); +			ctx->parent_ctx = NULL;		/* no longer a clone */ +		} +		/* +		 * Get an extra reference before dropping the lock so that +		 * this context won't get freed if the task exits. +		 */ +		get_ctx(ctx); +		spin_unlock_irqrestore(&ctx->lock, flags); +	} + +	if (!ctx) { +		ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); +		err = -ENOMEM; +		if (!ctx) +			goto errout; +		__perf_counter_init_context(ctx, task); +		get_ctx(ctx); +		if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { +			/* +			 * We raced with some other task; use +			 * the context they set. +			 */ +			kfree(ctx); +			goto retry; +		} +		get_task_struct(task); +	} + +	put_task_struct(task); +	return ctx; + + errout: +	put_task_struct(task); +	return ERR_PTR(err); +} + +static void free_counter_rcu(struct rcu_head *head) +{ +	struct perf_counter *counter; + +	counter = container_of(head, struct perf_counter, rcu_head); +	if (counter->ns) +		put_pid_ns(counter->ns); +	kfree(counter); +} + +static void perf_pending_sync(struct perf_counter *counter); + +static void free_counter(struct perf_counter *counter) +{ +	perf_pending_sync(counter); + +	atomic_dec(&nr_counters); +	if (counter->attr.mmap) +		atomic_dec(&nr_mmap_counters); +	if (counter->attr.comm) +		atomic_dec(&nr_comm_counters); + +	if (counter->destroy) +		counter->destroy(counter); + +	put_ctx(counter->ctx); +	call_rcu(&counter->rcu_head, free_counter_rcu); +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ +	struct perf_counter *counter = file->private_data; +	struct perf_counter_context *ctx = counter->ctx; + +	file->private_data = NULL; + +	WARN_ON_ONCE(ctx->parent_ctx); +	mutex_lock(&ctx->mutex); +	perf_counter_remove_from_context(counter); +	mutex_unlock(&ctx->mutex); + +	mutex_lock(&counter->owner->perf_counter_mutex); +	list_del_init(&counter->owner_entry); +	mutex_unlock(&counter->owner->perf_counter_mutex); +	put_task_struct(counter->owner); + +	free_counter(counter); + +	return 0; +} + +/* + * Read the performance counter - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) +{ +	u64 values[3]; +	int n; + +	/* +	 * Return end-of-file for a read on a counter that is in +	 * error state (i.e. because it was pinned but it couldn't be +	 * scheduled on to the CPU at some point). 
+	 */ +	if (counter->state == PERF_COUNTER_STATE_ERROR) +		return 0; + +	WARN_ON_ONCE(counter->ctx->parent_ctx); +	mutex_lock(&counter->child_mutex); +	values[0] = perf_counter_read(counter); +	n = 1; +	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) +		values[n++] = counter->total_time_enabled + +			atomic64_read(&counter->child_total_time_enabled); +	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) +		values[n++] = counter->total_time_running + +			atomic64_read(&counter->child_total_time_running); +	if (counter->attr.read_format & PERF_FORMAT_ID) +		values[n++] = counter->id; +	mutex_unlock(&counter->child_mutex); + +	if (count < n * sizeof(u64)) +		return -EINVAL; +	count = n * sizeof(u64); + +	if (copy_to_user(buf, values, count)) +		return -EFAULT; + +	return count; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ +	struct perf_counter *counter = file->private_data; + +	return perf_read_hw(counter, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ +	struct perf_counter *counter = file->private_data; +	struct perf_mmap_data *data; +	unsigned int events = POLL_HUP; + +	rcu_read_lock(); +	data = rcu_dereference(counter->data); +	if (data) +		events = atomic_xchg(&data->poll, 0); +	rcu_read_unlock(); + +	poll_wait(file, &counter->waitq, wait); + +	return events; +} + +static void perf_counter_reset(struct perf_counter *counter) +{ +	(void)perf_counter_read(counter); +	atomic64_set(&counter->count, 0); +	perf_counter_update_userpage(counter); +} + +static void perf_counter_for_each_sibling(struct perf_counter *counter, +					  void (*func)(struct perf_counter *)) +{ +	struct perf_counter_context *ctx = counter->ctx; +	struct perf_counter *sibling; + +	WARN_ON_ONCE(ctx->parent_ctx); +	mutex_lock(&ctx->mutex); +	counter = counter->group_leader; + +	func(counter); +	list_for_each_entry(sibling, &counter->sibling_list, list_entry) +		func(sibling); +	mutex_unlock(&ctx->mutex); +} + +/* + * Holding the top-level counter's child_mutex means that any + * descendant process that has inherited this counter will block + * in sync_child_counter if it goes to exit, thus satisfying the + * task existence requirements of perf_counter_enable/disable. 
+ */
+static void perf_counter_for_each_child(struct perf_counter *counter,
+					void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
+	mutex_lock(&counter->child_mutex);
+	func(counter);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		func(child);
+	mutex_unlock(&counter->child_mutex);
+}
+
+static void perf_counter_for_each(struct perf_counter *counter,
+				  void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
+	mutex_lock(&counter->child_mutex);
+	perf_counter_for_each_sibling(counter, func);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		perf_counter_for_each_sibling(child, func);
+	mutex_unlock(&counter->child_mutex);
+}
+
+static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	unsigned long size;
+	int ret = 0;
+	u64 value;
+
+	if (!counter->attr.sample_period)
+		return -EINVAL;
+
+	size = copy_from_user(&value, arg, sizeof(value));
+	if (size != sizeof(value))
+		return -EFAULT;
+
+	if (!value)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->lock);
+	if (counter->attr.freq) {
+		if (value > sysctl_perf_counter_sample_rate) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+
+		counter->attr.sample_freq = value;
+	} else {
+		perf_log_period(counter, value);
+
+		counter->attr.sample_period = value;
+		counter->hw.sample_period = value;
+	}
+unlock:
+	spin_unlock_irq(&ctx->lock);
+
+	return ret;
+}
+
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct perf_counter *counter = file->private_data;
+	void (*func)(struct perf_counter *);
+	u32 flags = arg;
+
+	switch (cmd) {
+	case PERF_COUNTER_IOC_ENABLE:
+		func = perf_counter_enable;
+		break;
+	case PERF_COUNTER_IOC_DISABLE:
+		func = perf_counter_disable;
+		break;
+	case PERF_COUNTER_IOC_RESET:
+		func = perf_counter_reset;
+		break;
+
+	case PERF_COUNTER_IOC_REFRESH:
+		return perf_counter_refresh(counter, arg);
+
+	case PERF_COUNTER_IOC_PERIOD:
+		return perf_counter_period(counter, (u64 __user *)arg);
+
+	default:
+		return -ENOTTY;
+	}
+
+	if (flags & PERF_IOC_FLAG_GROUP)
+		perf_counter_for_each(counter, func);
+	else
+		perf_counter_for_each_child(counter, func);
+
+	return 0;
+}
+
+int perf_counter_task_enable(void)
+{
+	struct perf_counter *counter;
+
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_for_each_child(counter, perf_counter_enable);
+	mutex_unlock(&current->perf_counter_mutex);
+
+	return 0;
+}
+
+int perf_counter_task_disable(void)
+{
+	struct perf_counter *counter;
+
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_for_each_child(counter, perf_counter_disable);
+	mutex_unlock(&current->perf_counter_mutex);
+
+	return 0;
+}
+
+/*
+ * Callers need to ensure there can be no nesting of this function, otherwise
+ * the seqlock logic goes bad. We can not serialize this because the arch
+ * code calls this from NMI context.
+ */
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+	struct perf_counter_mmap_page *userpg;
+	struct perf_mmap_data *data;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto unlock;
+
+	userpg = data->user_page;
+
+	/*
+	 * Disable preemption so as to not let the corresponding user-space
+	 * spin too long if we get preempted. 
+	 */ +	preempt_disable(); +	++userpg->lock; +	barrier(); +	userpg->index = counter->hw.idx; +	userpg->offset = atomic64_read(&counter->count); +	if (counter->state == PERF_COUNTER_STATE_ACTIVE) +		userpg->offset -= atomic64_read(&counter->hw.prev_count); + +	barrier(); +	++userpg->lock; +	preempt_enable(); +unlock: +	rcu_read_unlock(); +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +	struct perf_counter *counter = vma->vm_file->private_data; +	struct perf_mmap_data *data; +	int ret = VM_FAULT_SIGBUS; + +	rcu_read_lock(); +	data = rcu_dereference(counter->data); +	if (!data) +		goto unlock; + +	if (vmf->pgoff == 0) { +		vmf->page = virt_to_page(data->user_page); +	} else { +		int nr = vmf->pgoff - 1; + +		if ((unsigned)nr > data->nr_pages) +			goto unlock; + +		vmf->page = virt_to_page(data->data_pages[nr]); +	} +	get_page(vmf->page); +	ret = 0; +unlock: +	rcu_read_unlock(); + +	return ret; +} + +static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) +{ +	struct perf_mmap_data *data; +	unsigned long size; +	int i; + +	WARN_ON(atomic_read(&counter->mmap_count)); + +	size = sizeof(struct perf_mmap_data); +	size += nr_pages * sizeof(void *); + +	data = kzalloc(size, GFP_KERNEL); +	if (!data) +		goto fail; + +	data->user_page = (void *)get_zeroed_page(GFP_KERNEL); +	if (!data->user_page) +		goto fail_user_page; + +	for (i = 0; i < nr_pages; i++) { +		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); +		if (!data->data_pages[i]) +			goto fail_data_pages; +	} + +	data->nr_pages = nr_pages; +	atomic_set(&data->lock, -1); + +	rcu_assign_pointer(counter->data, data); + +	return 0; + +fail_data_pages: +	for (i--; i >= 0; i--) +		free_page((unsigned long)data->data_pages[i]); + +	free_page((unsigned long)data->user_page); + +fail_user_page: +	kfree(data); + +fail: +	return -ENOMEM; +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ +	struct perf_mmap_data *data; +	int i; + +	data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + +	free_page((unsigned long)data->user_page); +	for (i = 0; i < data->nr_pages; i++) +		free_page((unsigned long)data->data_pages[i]); +	kfree(data); +} + +static void perf_mmap_data_free(struct perf_counter *counter) +{ +	struct perf_mmap_data *data = counter->data; + +	WARN_ON(atomic_read(&counter->mmap_count)); + +	rcu_assign_pointer(counter->data, NULL); +	call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ +	struct perf_counter *counter = vma->vm_file->private_data; + +	atomic_inc(&counter->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ +	struct perf_counter *counter = vma->vm_file->private_data; + +	WARN_ON_ONCE(counter->ctx->parent_ctx); +	if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { +		struct user_struct *user = current_user(); + +		atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); +		vma->vm_mm->locked_vm -= counter->data->nr_locked; +		perf_mmap_data_free(counter); +		mutex_unlock(&counter->mmap_mutex); +	} +} + +static struct vm_operations_struct perf_mmap_vmops = { +	.open  = perf_mmap_open, +	.close = perf_mmap_close, +	.fault = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ +	struct perf_counter *counter = file->private_data; +	unsigned long user_locked, user_lock_limit; +	struct user_struct *user = current_user(); +	unsigned long locked, lock_limit; +	unsigned long vma_size; +	
unsigned long nr_pages; +	long user_extra, extra; +	int ret = 0; + +	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) +		return -EINVAL; + +	vma_size = vma->vm_end - vma->vm_start; +	nr_pages = (vma_size / PAGE_SIZE) - 1; + +	/* +	 * If we have data pages ensure they're a power-of-two number, so we +	 * can do bitmasks instead of modulo. +	 */ +	if (nr_pages != 0 && !is_power_of_2(nr_pages)) +		return -EINVAL; + +	if (vma_size != PAGE_SIZE * (1 + nr_pages)) +		return -EINVAL; + +	if (vma->vm_pgoff != 0) +		return -EINVAL; + +	WARN_ON_ONCE(counter->ctx->parent_ctx); +	mutex_lock(&counter->mmap_mutex); +	if (atomic_inc_not_zero(&counter->mmap_count)) { +		if (nr_pages != counter->data->nr_pages) +			ret = -EINVAL; +		goto unlock; +	} + +	user_extra = nr_pages + 1; +	user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + +	/* +	 * Increase the limit linearly with more CPUs: +	 */ +	user_lock_limit *= num_online_cpus(); + +	user_locked = atomic_long_read(&user->locked_vm) + user_extra; + +	extra = 0; +	if (user_locked > user_lock_limit) +		extra = user_locked - user_lock_limit; + +	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; +	lock_limit >>= PAGE_SHIFT; +	locked = vma->vm_mm->locked_vm + extra; + +	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { +		ret = -EPERM; +		goto unlock; +	} + +	WARN_ON(counter->data); +	ret = perf_mmap_data_alloc(counter, nr_pages); +	if (ret) +		goto unlock; + +	atomic_set(&counter->mmap_count, 1); +	atomic_long_add(user_extra, &user->locked_vm); +	vma->vm_mm->locked_vm += extra; +	counter->data->nr_locked = extra; +unlock: +	mutex_unlock(&counter->mmap_mutex); + +	vma->vm_flags &= ~VM_MAYWRITE; +	vma->vm_flags |= VM_RESERVED; +	vma->vm_ops = &perf_mmap_vmops; + +	return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ +	struct inode *inode = filp->f_path.dentry->d_inode; +	struct perf_counter *counter = filp->private_data; +	int retval; + +	mutex_lock(&inode->i_mutex); +	retval = fasync_helper(fd, filp, on, &counter->fasync); +	mutex_unlock(&inode->i_mutex); + +	if (retval < 0) +		return retval; + +	return 0; +} + +static const struct file_operations perf_fops = { +	.release		= perf_release, +	.read			= perf_read, +	.poll			= perf_poll, +	.unlocked_ioctl		= perf_ioctl, +	.compat_ioctl		= perf_ioctl, +	.mmap			= perf_mmap, +	.fasync			= perf_fasync, +}; + +/* + * Perf counter wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_counter_wakeup(struct perf_counter *counter) +{ +	wake_up_all(&counter->waitq); + +	if (counter->pending_kill) { +		kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); +		counter->pending_kill = 0; +	} +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. 
+ */ + +static void perf_pending_counter(struct perf_pending_entry *entry) +{ +	struct perf_counter *counter = container_of(entry, +			struct perf_counter, pending); + +	if (counter->pending_disable) { +		counter->pending_disable = 0; +		perf_counter_disable(counter); +	} + +	if (counter->pending_wakeup) { +		counter->pending_wakeup = 0; +		perf_counter_wakeup(counter); +	} +} + +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { +	PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_pending_entry *entry, +			       void (*func)(struct perf_pending_entry *)) +{ +	struct perf_pending_entry **head; + +	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) +		return; + +	entry->func = func; + +	head = &get_cpu_var(perf_pending_head); + +	do { +		entry->next = *head; +	} while (cmpxchg(head, entry->next, entry) != entry->next); + +	set_perf_counter_pending(); + +	put_cpu_var(perf_pending_head); +} + +static int __perf_pending_run(void) +{ +	struct perf_pending_entry *list; +	int nr = 0; + +	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); +	while (list != PENDING_TAIL) { +		void (*func)(struct perf_pending_entry *); +		struct perf_pending_entry *entry = list; + +		list = list->next; + +		func = entry->func; +		entry->next = NULL; +		/* +		 * Ensure we observe the unqueue before we issue the wakeup, +		 * so that we won't be waiting forever. +		 * -- see perf_not_pending(). +		 */ +		smp_wmb(); + +		func(entry); +		nr++; +	} + +	return nr; +} + +static inline int perf_not_pending(struct perf_counter *counter) +{ +	/* +	 * If we flush on whatever cpu we run, there is a chance we don't +	 * need to wait. +	 */ +	get_cpu(); +	__perf_pending_run(); +	put_cpu(); + +	/* +	 * Ensure we see the proper queue state before going to sleep +	 * so that we do not miss the wakeup. -- see perf_pending_handle() +	 */ +	smp_rmb(); +	return counter->pending.next == NULL; +} + +static void perf_pending_sync(struct perf_counter *counter) +{ +	wait_event(counter->waitq, perf_not_pending(counter)); +} + +void perf_counter_do_pending(void) +{ +	__perf_pending_run(); +} + +/* + * Callchain support -- arch specific + */ + +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ +	return NULL; +} + +/* + * Output + */ + +struct perf_output_handle { +	struct perf_counter	*counter; +	struct perf_mmap_data	*data; +	unsigned long		head; +	unsigned long		offset; +	int			nmi; +	int			overflow; +	int			locked; +	unsigned long		flags; +}; + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ +	atomic_set(&handle->data->poll, POLL_IN); + +	if (handle->nmi) { +		handle->counter->pending_wakeup = 1; +		perf_pending_queue(&handle->counter->pending, +				   perf_pending_counter); +	} else +		perf_counter_wakeup(handle->counter); +} + +/* + * Curious locking construct. + * + * We need to ensure a later event doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. 
+ */ +static void perf_output_lock(struct perf_output_handle *handle) +{ +	struct perf_mmap_data *data = handle->data; +	int cpu; + +	handle->locked = 0; + +	local_irq_save(handle->flags); +	cpu = smp_processor_id(); + +	if (in_nmi() && atomic_read(&data->lock) == cpu) +		return; + +	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) +		cpu_relax(); + +	handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ +	struct perf_mmap_data *data = handle->data; +	unsigned long head; +	int cpu; + +	data->done_head = data->head; + +	if (!handle->locked) +		goto out; + +again: +	/* +	 * The xchg implies a full barrier that ensures all writes are done +	 * before we publish the new head, matched by a rmb() in userspace when +	 * reading this position. +	 */ +	while ((head = atomic_long_xchg(&data->done_head, 0))) +		data->user_page->data_head = head; + +	/* +	 * NMI can happen here, which means we can miss a done_head update. +	 */ + +	cpu = atomic_xchg(&data->lock, -1); +	WARN_ON_ONCE(cpu != smp_processor_id()); + +	/* +	 * Therefore we have to validate we did not indeed do so. +	 */ +	if (unlikely(atomic_long_read(&data->done_head))) { +		/* +		 * Since we had it locked, we can lock it again. +		 */ +		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) +			cpu_relax(); + +		goto again; +	} + +	if (atomic_xchg(&data->wakeup, 0)) +		perf_output_wakeup(handle); +out: +	local_irq_restore(handle->flags); +} + +static int perf_output_begin(struct perf_output_handle *handle, +			     struct perf_counter *counter, unsigned int size, +			     int nmi, int overflow) +{ +	struct perf_mmap_data *data; +	unsigned int offset, head; + +	/* +	 * For inherited counters we send all the output towards the parent. +	 */ +	if (counter->parent) +		counter = counter->parent; + +	rcu_read_lock(); +	data = rcu_dereference(counter->data); +	if (!data) +		goto out; + +	handle->data	 = data; +	handle->counter	 = counter; +	handle->nmi	 = nmi; +	handle->overflow = overflow; + +	if (!data->nr_pages) +		goto fail; + +	perf_output_lock(handle); + +	do { +		offset = head = atomic_long_read(&data->head); +		head += size; +	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + +	handle->offset	= offset; +	handle->head	= head; + +	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) +		atomic_set(&data->wakeup, 1); + +	return 0; + +fail: +	perf_output_wakeup(handle); +out: +	rcu_read_unlock(); + +	return -ENOSPC; +} + +static void perf_output_copy(struct perf_output_handle *handle, +			     const void *buf, unsigned int len) +{ +	unsigned int pages_mask; +	unsigned int offset; +	unsigned int size; +	void **pages; + +	offset		= handle->offset; +	pages_mask	= handle->data->nr_pages - 1; +	pages		= handle->data->data_pages; + +	do { +		unsigned int page_offset; +		int nr; + +		nr	    = (offset >> PAGE_SHIFT) & pages_mask; +		page_offset = offset & (PAGE_SIZE - 1); +		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len); + +		memcpy(pages[nr] + page_offset, buf, size); + +		len	    -= size; +		buf	    += size; +		offset	    += size; +	} while (len); + +	handle->offset = offset; + +	/* +	 * Check we didn't copy past our reservation window, taking the +	 * possible unsigned int wrap into account. 
+	 */ +	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +#define perf_output_put(handle, x) \ +	perf_output_copy((handle), &(x), sizeof(x)) + +static void perf_output_end(struct perf_output_handle *handle) +{ +	struct perf_counter *counter = handle->counter; +	struct perf_mmap_data *data = handle->data; + +	int wakeup_events = counter->attr.wakeup_events; + +	if (handle->overflow && wakeup_events) { +		int events = atomic_inc_return(&data->events); +		if (events >= wakeup_events) { +			atomic_sub(wakeup_events, &data->events); +			atomic_set(&data->wakeup, 1); +		} +	} + +	perf_output_unlock(handle); +	rcu_read_unlock(); +} + +static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) +{ +	/* +	 * only top level counters have the pid namespace they were created in +	 */ +	if (counter->parent) +		counter = counter->parent; + +	return task_tgid_nr_ns(p, counter->ns); +} + +static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) +{ +	/* +	 * only top level counters have the pid namespace they were created in +	 */ +	if (counter->parent) +		counter = counter->parent; + +	return task_pid_nr_ns(p, counter->ns); +} + +static void perf_counter_output(struct perf_counter *counter, int nmi, +				struct perf_sample_data *data) +{ +	int ret; +	u64 sample_type = counter->attr.sample_type; +	struct perf_output_handle handle; +	struct perf_event_header header; +	u64 ip; +	struct { +		u32 pid, tid; +	} tid_entry; +	struct { +		u64 id; +		u64 counter; +	} group_entry; +	struct perf_callchain_entry *callchain = NULL; +	int callchain_size = 0; +	u64 time; +	struct { +		u32 cpu, reserved; +	} cpu_entry; + +	header.type = 0; +	header.size = sizeof(header); + +	header.misc = PERF_EVENT_MISC_OVERFLOW; +	header.misc |= perf_misc_flags(data->regs); + +	if (sample_type & PERF_SAMPLE_IP) { +		ip = perf_instruction_pointer(data->regs); +		header.type |= PERF_SAMPLE_IP; +		header.size += sizeof(ip); +	} + +	if (sample_type & PERF_SAMPLE_TID) { +		/* namespace issues */ +		tid_entry.pid = perf_counter_pid(counter, current); +		tid_entry.tid = perf_counter_tid(counter, current); + +		header.type |= PERF_SAMPLE_TID; +		header.size += sizeof(tid_entry); +	} + +	if (sample_type & PERF_SAMPLE_TIME) { +		/* +		 * Maybe do better on x86 and provide cpu_clock_nmi() +		 */ +		time = sched_clock(); + +		header.type |= PERF_SAMPLE_TIME; +		header.size += sizeof(u64); +	} + +	if (sample_type & PERF_SAMPLE_ADDR) { +		header.type |= PERF_SAMPLE_ADDR; +		header.size += sizeof(u64); +	} + +	if (sample_type & PERF_SAMPLE_ID) { +		header.type |= PERF_SAMPLE_ID; +		header.size += sizeof(u64); +	} + +	if (sample_type & PERF_SAMPLE_CPU) { +		header.type |= PERF_SAMPLE_CPU; +		header.size += sizeof(cpu_entry); + +		cpu_entry.cpu = raw_smp_processor_id(); +	} + +	if (sample_type & PERF_SAMPLE_PERIOD) { +		header.type |= PERF_SAMPLE_PERIOD; +		header.size += sizeof(u64); +	} + +	if (sample_type & PERF_SAMPLE_GROUP) { +		header.type |= PERF_SAMPLE_GROUP; +		header.size += sizeof(u64) + +			counter->nr_siblings * sizeof(group_entry); +	} + +	if (sample_type & PERF_SAMPLE_CALLCHAIN) { +		callchain = perf_callchain(data->regs); + +		if (callchain) { +			callchain_size = (1 + callchain->nr) * sizeof(u64); + +			header.type |= PERF_SAMPLE_CALLCHAIN; +			header.size += callchain_size; +		} +	} + +	ret = perf_output_begin(&handle, counter, header.size, nmi, 1); +	if (ret) +		return; + +	perf_output_put(&handle, header); + +	if (sample_type & PERF_SAMPLE_IP) +		perf_output_put(&handle, 
ip); + +	if (sample_type & PERF_SAMPLE_TID) +		perf_output_put(&handle, tid_entry); + +	if (sample_type & PERF_SAMPLE_TIME) +		perf_output_put(&handle, time); + +	if (sample_type & PERF_SAMPLE_ADDR) +		perf_output_put(&handle, data->addr); + +	if (sample_type & PERF_SAMPLE_ID) +		perf_output_put(&handle, counter->id); + +	if (sample_type & PERF_SAMPLE_CPU) +		perf_output_put(&handle, cpu_entry); + +	if (sample_type & PERF_SAMPLE_PERIOD) +		perf_output_put(&handle, data->period); + +	/* +	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. +	 */ +	if (sample_type & PERF_SAMPLE_GROUP) { +		struct perf_counter *leader, *sub; +		u64 nr = counter->nr_siblings; + +		perf_output_put(&handle, nr); + +		leader = counter->group_leader; +		list_for_each_entry(sub, &leader->sibling_list, list_entry) { +			if (sub != counter) +				sub->pmu->read(sub); + +			group_entry.id = sub->id; +			group_entry.counter = atomic64_read(&sub->count); + +			perf_output_put(&handle, group_entry); +		} +	} + +	if (callchain) +		perf_output_copy(&handle, callchain, callchain_size); + +	perf_output_end(&handle); +} + +/* + * fork tracking + */ + +struct perf_fork_event { +	struct task_struct	*task; + +	struct { +		struct perf_event_header	header; + +		u32				pid; +		u32				ppid; +	} event; +}; + +static void perf_counter_fork_output(struct perf_counter *counter, +				     struct perf_fork_event *fork_event) +{ +	struct perf_output_handle handle; +	int size = fork_event->event.header.size; +	struct task_struct *task = fork_event->task; +	int ret = perf_output_begin(&handle, counter, size, 0, 0); + +	if (ret) +		return; + +	fork_event->event.pid = perf_counter_pid(counter, task); +	fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + +	perf_output_put(&handle, fork_event->event); +	perf_output_end(&handle); +} + +static int perf_counter_fork_match(struct perf_counter *counter) +{ +	if (counter->attr.comm || counter->attr.mmap) +		return 1; + +	return 0; +} + +static void perf_counter_fork_ctx(struct perf_counter_context *ctx, +				  struct perf_fork_event *fork_event) +{ +	struct perf_counter *counter; + +	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) +		return; + +	rcu_read_lock(); +	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { +		if (perf_counter_fork_match(counter)) +			perf_counter_fork_output(counter, fork_event); +	} +	rcu_read_unlock(); +} + +static void perf_counter_fork_event(struct perf_fork_event *fork_event) +{ +	struct perf_cpu_context *cpuctx; +	struct perf_counter_context *ctx; + +	cpuctx = &get_cpu_var(perf_cpu_context); +	perf_counter_fork_ctx(&cpuctx->ctx, fork_event); +	put_cpu_var(perf_cpu_context); + +	rcu_read_lock(); +	/* +	 * doesn't really matter which of the child contexts the +	 * events ends up in. 
+	 */ +	ctx = rcu_dereference(current->perf_counter_ctxp); +	if (ctx) +		perf_counter_fork_ctx(ctx, fork_event); +	rcu_read_unlock(); +} + +void perf_counter_fork(struct task_struct *task) +{ +	struct perf_fork_event fork_event; + +	if (!atomic_read(&nr_comm_counters) && +	    !atomic_read(&nr_mmap_counters)) +		return; + +	fork_event = (struct perf_fork_event){ +		.task	= task, +		.event  = { +			.header = { +				.type = PERF_EVENT_FORK, +				.size = sizeof(fork_event.event), +			}, +		}, +	}; + +	perf_counter_fork_event(&fork_event); +} + +/* + * comm tracking + */ + +struct perf_comm_event { +	struct task_struct	*task; +	char			*comm; +	int			comm_size; + +	struct { +		struct perf_event_header	header; + +		u32				pid; +		u32				tid; +	} event; +}; + +static void perf_counter_comm_output(struct perf_counter *counter, +				     struct perf_comm_event *comm_event) +{ +	struct perf_output_handle handle; +	int size = comm_event->event.header.size; +	int ret = perf_output_begin(&handle, counter, size, 0, 0); + +	if (ret) +		return; + +	comm_event->event.pid = perf_counter_pid(counter, comm_event->task); +	comm_event->event.tid = perf_counter_tid(counter, comm_event->task); + +	perf_output_put(&handle, comm_event->event); +	perf_output_copy(&handle, comm_event->comm, +				   comm_event->comm_size); +	perf_output_end(&handle); +} + +static int perf_counter_comm_match(struct perf_counter *counter) +{ +	if (counter->attr.comm) +		return 1; + +	return 0; +} + +static void perf_counter_comm_ctx(struct perf_counter_context *ctx, +				  struct perf_comm_event *comm_event) +{ +	struct perf_counter *counter; + +	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) +		return; + +	rcu_read_lock(); +	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { +		if (perf_counter_comm_match(counter)) +			perf_counter_comm_output(counter, comm_event); +	} +	rcu_read_unlock(); +} + +static void perf_counter_comm_event(struct perf_comm_event *comm_event) +{ +	struct perf_cpu_context *cpuctx; +	struct perf_counter_context *ctx; +	unsigned int size; +	char *comm = comm_event->task->comm; + +	size = ALIGN(strlen(comm)+1, sizeof(u64)); + +	comm_event->comm = comm; +	comm_event->comm_size = size; + +	comm_event->event.header.size = sizeof(comm_event->event) + size; + +	cpuctx = &get_cpu_var(perf_cpu_context); +	perf_counter_comm_ctx(&cpuctx->ctx, comm_event); +	put_cpu_var(perf_cpu_context); + +	rcu_read_lock(); +	/* +	 * doesn't really matter which of the child contexts the +	 * events ends up in. 
+	 */ +	ctx = rcu_dereference(current->perf_counter_ctxp); +	if (ctx) +		perf_counter_comm_ctx(ctx, comm_event); +	rcu_read_unlock(); +} + +void perf_counter_comm(struct task_struct *task) +{ +	struct perf_comm_event comm_event; + +	if (!atomic_read(&nr_comm_counters)) +		return; + +	comm_event = (struct perf_comm_event){ +		.task	= task, +		.event  = { +			.header = { .type = PERF_EVENT_COMM, }, +		}, +	}; + +	perf_counter_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { +	struct vm_area_struct	*vma; + +	const char		*file_name; +	int			file_size; + +	struct { +		struct perf_event_header	header; + +		u32				pid; +		u32				tid; +		u64				start; +		u64				len; +		u64				pgoff; +	} event; +}; + +static void perf_counter_mmap_output(struct perf_counter *counter, +				     struct perf_mmap_event *mmap_event) +{ +	struct perf_output_handle handle; +	int size = mmap_event->event.header.size; +	int ret = perf_output_begin(&handle, counter, size, 0, 0); + +	if (ret) +		return; + +	mmap_event->event.pid = perf_counter_pid(counter, current); +	mmap_event->event.tid = perf_counter_tid(counter, current); + +	perf_output_put(&handle, mmap_event->event); +	perf_output_copy(&handle, mmap_event->file_name, +				   mmap_event->file_size); +	perf_output_end(&handle); +} + +static int perf_counter_mmap_match(struct perf_counter *counter, +				   struct perf_mmap_event *mmap_event) +{ +	if (counter->attr.mmap) +		return 1; + +	return 0; +} + +static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, +				  struct perf_mmap_event *mmap_event) +{ +	struct perf_counter *counter; + +	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) +		return; + +	rcu_read_lock(); +	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { +		if (perf_counter_mmap_match(counter, mmap_event)) +			perf_counter_mmap_output(counter, mmap_event); +	} +	rcu_read_unlock(); +} + +static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) +{ +	struct perf_cpu_context *cpuctx; +	struct perf_counter_context *ctx; +	struct vm_area_struct *vma = mmap_event->vma; +	struct file *file = vma->vm_file; +	unsigned int size; +	char tmp[16]; +	char *buf = NULL; +	const char *name; + +	if (file) { +		buf = kzalloc(PATH_MAX, GFP_KERNEL); +		if (!buf) { +			name = strncpy(tmp, "//enomem", sizeof(tmp)); +			goto got_name; +		} +		name = d_path(&file->f_path, buf, PATH_MAX); +		if (IS_ERR(name)) { +			name = strncpy(tmp, "//toolong", sizeof(tmp)); +			goto got_name; +		} +	} else { +		name = arch_vma_name(mmap_event->vma); +		if (name) +			goto got_name; + +		if (!vma->vm_mm) { +			name = strncpy(tmp, "[vdso]", sizeof(tmp)); +			goto got_name; +		} + +		name = strncpy(tmp, "//anon", sizeof(tmp)); +		goto got_name; +	} + +got_name: +	size = ALIGN(strlen(name)+1, sizeof(u64)); + +	mmap_event->file_name = name; +	mmap_event->file_size = size; + +	mmap_event->event.header.size = sizeof(mmap_event->event) + size; + +	cpuctx = &get_cpu_var(perf_cpu_context); +	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); +	put_cpu_var(perf_cpu_context); + +	rcu_read_lock(); +	/* +	 * doesn't really matter which of the child contexts the +	 * events ends up in. 
+	 */ +	ctx = rcu_dereference(current->perf_counter_ctxp); +	if (ctx) +		perf_counter_mmap_ctx(ctx, mmap_event); +	rcu_read_unlock(); + +	kfree(buf); +} + +void __perf_counter_mmap(struct vm_area_struct *vma) +{ +	struct perf_mmap_event mmap_event; + +	if (!atomic_read(&nr_mmap_counters)) +		return; + +	mmap_event = (struct perf_mmap_event){ +		.vma	= vma, +		.event  = { +			.header = { .type = PERF_EVENT_MMAP, }, +			.start  = vma->vm_start, +			.len    = vma->vm_end - vma->vm_start, +			.pgoff  = vma->vm_pgoff, +		}, +	}; + +	perf_counter_mmap_event(&mmap_event); +} + +/* + * Log sample_period changes so that analyzing tools can re-normalize the + * event flow. + */ + +struct freq_event { +	struct perf_event_header	header; +	u64				time; +	u64				id; +	u64				period; +}; + +static void perf_log_period(struct perf_counter *counter, u64 period) +{ +	struct perf_output_handle handle; +	struct freq_event event; +	int ret; + +	if (counter->hw.sample_period == period) +		return; + +	if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) +		return; + +	event = (struct freq_event) { +		.header = { +			.type = PERF_EVENT_PERIOD, +			.misc = 0, +			.size = sizeof(event), +		}, +		.time = sched_clock(), +		.id = counter->id, +		.period = period, +	}; + +	ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); +	if (ret) +		return; + +	perf_output_put(&handle, event); +	perf_output_end(&handle); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_counter *counter, int enable) +{ +	struct perf_output_handle handle; +	int ret; + +	struct { +		struct perf_event_header	header; +		u64				time; +		u64				id; +	} throttle_event = { +		.header = { +			.type = PERF_EVENT_THROTTLE + 1, +			.misc = 0, +			.size = sizeof(throttle_event), +		}, +		.time	= sched_clock(), +		.id	= counter->id, +	}; + +	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); +	if (ret) +		return; + +	perf_output_put(&handle, throttle_event); +	perf_output_end(&handle); +} + +/* + * Generic counter overflow handling. 
+ */ + +int perf_counter_overflow(struct perf_counter *counter, int nmi, +			  struct perf_sample_data *data) +{ +	int events = atomic_read(&counter->event_limit); +	int throttle = counter->pmu->unthrottle != NULL; +	struct hw_perf_counter *hwc = &counter->hw; +	int ret = 0; + +	if (!throttle) { +		hwc->interrupts++; +	} else { +		if (hwc->interrupts != MAX_INTERRUPTS) { +			hwc->interrupts++; +			if (HZ * hwc->interrupts > +					(u64)sysctl_perf_counter_sample_rate) { +				hwc->interrupts = MAX_INTERRUPTS; +				perf_log_throttle(counter, 0); +				ret = 1; +			} +		} else { +			/* +			 * Keep re-disabling counters even though on the previous +			 * pass we disabled it - just in case we raced with a +			 * sched-in and the counter got enabled again: +			 */ +			ret = 1; +		} +	} + +	if (counter->attr.freq) { +		u64 now = sched_clock(); +		s64 delta = now - hwc->freq_stamp; + +		hwc->freq_stamp = now; + +		if (delta > 0 && delta < TICK_NSEC) +			perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); +	} + +	/* +	 * XXX event_limit might not quite work as expected on inherited +	 * counters +	 */ + +	counter->pending_kill = POLL_IN; +	if (events && atomic_dec_and_test(&counter->event_limit)) { +		ret = 1; +		counter->pending_kill = POLL_HUP; +		if (nmi) { +			counter->pending_disable = 1; +			perf_pending_queue(&counter->pending, +					   perf_pending_counter); +		} else +			perf_counter_disable(counter); +	} + +	perf_counter_output(counter, nmi, data); +	return ret; +} + +/* + * Generic software counter infrastructure + */ + +static void perf_swcounter_update(struct perf_counter *counter) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	u64 prev, now; +	s64 delta; + +again: +	prev = atomic64_read(&hwc->prev_count); +	now = atomic64_read(&hwc->count); +	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) +		goto again; + +	delta = now - prev; + +	atomic64_add(delta, &counter->count); +	atomic64_sub(delta, &hwc->period_left); +} + +static void perf_swcounter_set_period(struct perf_counter *counter) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	s64 left = atomic64_read(&hwc->period_left); +	s64 period = hwc->sample_period; + +	if (unlikely(left <= -period)) { +		left = period; +		atomic64_set(&hwc->period_left, left); +		hwc->last_period = period; +	} + +	if (unlikely(left <= 0)) { +		left += period; +		atomic64_add(period, &hwc->period_left); +		hwc->last_period = period; +	} + +	atomic64_set(&hwc->prev_count, -left); +	atomic64_set(&hwc->count, -left); +} + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ +	enum hrtimer_restart ret = HRTIMER_RESTART; +	struct perf_sample_data data; +	struct perf_counter *counter; +	u64 period; + +	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer); +	counter->pmu->read(counter); + +	data.addr = 0; +	data.regs = get_irq_regs(); +	/* +	 * In case we exclude kernel IPs or are somehow not in interrupt +	 * context, provide the next best thing, the user IP. 
+	 */ +	if ((counter->attr.exclude_kernel || !data.regs) && +			!counter->attr.exclude_user) +		data.regs = task_pt_regs(current); + +	if (data.regs) { +		if (perf_counter_overflow(counter, 0, &data)) +			ret = HRTIMER_NORESTART; +	} + +	period = max_t(u64, 10000, counter->hw.sample_period); +	hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + +	return ret; +} + +static void perf_swcounter_overflow(struct perf_counter *counter, +				    int nmi, struct pt_regs *regs, u64 addr) +{ +	struct perf_sample_data data = { +		.regs	= regs, +		.addr	= addr, +		.period	= counter->hw.last_period, +	}; + +	perf_swcounter_update(counter); +	perf_swcounter_set_period(counter); +	if (perf_counter_overflow(counter, nmi, &data)) +		/* soft-disable the counter */ +		; + +} + +static int perf_swcounter_is_counting(struct perf_counter *counter) +{ +	struct perf_counter_context *ctx; +	unsigned long flags; +	int count; + +	if (counter->state == PERF_COUNTER_STATE_ACTIVE) +		return 1; + +	if (counter->state != PERF_COUNTER_STATE_INACTIVE) +		return 0; + +	/* +	 * If the counter is inactive, it could be just because +	 * its task is scheduled out, or because it's in a group +	 * which could not go on the PMU.  We want to count in +	 * the first case but not the second.  If the context is +	 * currently active then an inactive software counter must +	 * be the second case.  If it's not currently active then +	 * we need to know whether the counter was active when the +	 * context was last active, which we can determine by +	 * comparing counter->tstamp_stopped with ctx->time. +	 * +	 * We are within an RCU read-side critical section, +	 * which protects the existence of *ctx. +	 */ +	ctx = counter->ctx; +	spin_lock_irqsave(&ctx->lock, flags); +	count = 1; +	/* Re-check state now we have the lock */ +	if (counter->state < PERF_COUNTER_STATE_INACTIVE || +	    counter->ctx->is_active || +	    counter->tstamp_stopped < ctx->time) +		count = 0; +	spin_unlock_irqrestore(&ctx->lock, flags); +	return count; +} + +static int perf_swcounter_match(struct perf_counter *counter, +				enum perf_type_id type, +				u32 event, struct pt_regs *regs) +{ +	if (!perf_swcounter_is_counting(counter)) +		return 0; + +	if (counter->attr.type != type) +		return 0; +	if (counter->attr.config != event) +		return 0; + +	if (regs) { +		if (counter->attr.exclude_user && user_mode(regs)) +			return 0; + +		if (counter->attr.exclude_kernel && !user_mode(regs)) +			return 0; +	} + +	return 1; +} + +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, +			       int nmi, struct pt_regs *regs, u64 addr) +{ +	int neg = atomic64_add_negative(nr, &counter->hw.count); + +	if (counter->hw.sample_period && !neg && regs) +		perf_swcounter_overflow(counter, nmi, regs, addr); +} + +static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, +				     enum perf_type_id type, u32 event, +				     u64 nr, int nmi, struct pt_regs *regs, +				     u64 addr) +{ +	struct perf_counter *counter; + +	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) +		return; + +	rcu_read_lock(); +	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { +		if (perf_swcounter_match(counter, type, event, regs)) +			perf_swcounter_add(counter, nr, nmi, regs, addr); +	} +	rcu_read_unlock(); +} + +static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) +{ +	if (in_nmi()) +		return &cpuctx->recursion[3]; + +	if (in_irq()) +		return &cpuctx->recursion[2]; + +	if (in_softirq()) +		return &cpuctx->recursion[1]; + +	
return &cpuctx->recursion[0]; +} + +static void __perf_swcounter_event(enum perf_type_id type, u32 event, +				   u64 nr, int nmi, struct pt_regs *regs, +				   u64 addr) +{ +	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); +	int *recursion = perf_swcounter_recursion_context(cpuctx); +	struct perf_counter_context *ctx; + +	if (*recursion) +		goto out; + +	(*recursion)++; +	barrier(); + +	perf_swcounter_ctx_event(&cpuctx->ctx, type, event, +				 nr, nmi, regs, addr); +	rcu_read_lock(); +	/* +	 * doesn't really matter which of the child contexts the +	 * events ends up in. +	 */ +	ctx = rcu_dereference(current->perf_counter_ctxp); +	if (ctx) +		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); +	rcu_read_unlock(); + +	barrier(); +	(*recursion)--; + +out: +	put_cpu_var(perf_cpu_context); +} + +void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +{ +	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); +} + +static void perf_swcounter_read(struct perf_counter *counter) +{ +	perf_swcounter_update(counter); +} + +static int perf_swcounter_enable(struct perf_counter *counter) +{ +	perf_swcounter_set_period(counter); +	return 0; +} + +static void perf_swcounter_disable(struct perf_counter *counter) +{ +	perf_swcounter_update(counter); +} + +static const struct pmu perf_ops_generic = { +	.enable		= perf_swcounter_enable, +	.disable	= perf_swcounter_disable, +	.read		= perf_swcounter_read, +}; + +/* + * Software counter: cpu wall time clock + */ + +static void cpu_clock_perf_counter_update(struct perf_counter *counter) +{ +	int cpu = raw_smp_processor_id(); +	s64 prev; +	u64 now; + +	now = cpu_clock(cpu); +	prev = atomic64_read(&counter->hw.prev_count); +	atomic64_set(&counter->hw.prev_count, now); +	atomic64_add(now - prev, &counter->count); +} + +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	int cpu = raw_smp_processor_id(); + +	atomic64_set(&hwc->prev_count, cpu_clock(cpu)); +	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hwc->hrtimer.function = perf_swcounter_hrtimer; +	if (hwc->sample_period) { +		u64 period = max_t(u64, 10000, hwc->sample_period); +		__hrtimer_start_range_ns(&hwc->hrtimer, +				ns_to_ktime(period), 0, +				HRTIMER_MODE_REL, 0); +	} + +	return 0; +} + +static void cpu_clock_perf_counter_disable(struct perf_counter *counter) +{ +	if (counter->hw.sample_period) +		hrtimer_cancel(&counter->hw.hrtimer); +	cpu_clock_perf_counter_update(counter); +} + +static void cpu_clock_perf_counter_read(struct perf_counter *counter) +{ +	cpu_clock_perf_counter_update(counter); +} + +static const struct pmu perf_ops_cpu_clock = { +	.enable		= cpu_clock_perf_counter_enable, +	.disable	= cpu_clock_perf_counter_disable, +	.read		= cpu_clock_perf_counter_read, +}; + +/* + * Software counter: task time clock + */ + +static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +{ +	u64 prev; +	s64 delta; + +	prev = atomic64_xchg(&counter->hw.prev_count, now); +	delta = now - prev; +	atomic64_add(delta, &counter->count); +} + +static int task_clock_perf_counter_enable(struct perf_counter *counter) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	u64 now; + +	now = counter->ctx->time; + +	atomic64_set(&hwc->prev_count, now); +	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hwc->hrtimer.function = perf_swcounter_hrtimer; +	if (hwc->sample_period) { +		u64 period = max_t(u64, 10000, 
hwc->sample_period); +		__hrtimer_start_range_ns(&hwc->hrtimer, +				ns_to_ktime(period), 0, +				HRTIMER_MODE_REL, 0); +	} + +	return 0; +} + +static void task_clock_perf_counter_disable(struct perf_counter *counter) +{ +	if (counter->hw.sample_period) +		hrtimer_cancel(&counter->hw.hrtimer); +	task_clock_perf_counter_update(counter, counter->ctx->time); + +} + +static void task_clock_perf_counter_read(struct perf_counter *counter) +{ +	u64 time; + +	if (!in_nmi()) { +		update_context_time(counter->ctx); +		time = counter->ctx->time; +	} else { +		u64 now = perf_clock(); +		u64 delta = now - counter->ctx->timestamp; +		time = counter->ctx->time + delta; +	} + +	task_clock_perf_counter_update(counter, time); +} + +static const struct pmu perf_ops_task_clock = { +	.enable		= task_clock_perf_counter_enable, +	.disable	= task_clock_perf_counter_disable, +	.read		= task_clock_perf_counter_read, +}; + +/* + * Software counter: cpu migrations + */ +void perf_counter_task_migration(struct task_struct *task, int cpu) +{ +	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); +	struct perf_counter_context *ctx; + +	perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, +				 PERF_COUNT_SW_CPU_MIGRATIONS, +				 1, 1, NULL, 0); + +	ctx = perf_pin_task_context(task); +	if (ctx) { +		perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, +					 PERF_COUNT_SW_CPU_MIGRATIONS, +					 1, 1, NULL, 0); +		perf_unpin_context(ctx); +	} +} + +#ifdef CONFIG_EVENT_PROFILE +void perf_tpcounter_event(int event_id) +{ +	struct pt_regs *regs = get_irq_regs(); + +	if (!regs) +		regs = task_pt_regs(current); + +	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); +} +EXPORT_SYMBOL_GPL(perf_tpcounter_event); + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_counter_destroy(struct perf_counter *counter) +{ +	ftrace_profile_disable(perf_event_id(&counter->attr)); +} + +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +{ +	int event_id = perf_event_id(&counter->attr); +	int ret; + +	ret = ftrace_profile_enable(event_id); +	if (ret) +		return NULL; + +	counter->destroy = tp_perf_counter_destroy; + +	return &perf_ops_generic; +} +#else +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +{ +	return NULL; +} +#endif + +static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) +{ +	const struct pmu *pmu = NULL; + +	/* +	 * Software counters (currently) can't in general distinguish +	 * between user, kernel and hypervisor events. +	 * However, context switches and cpu migrations are considered +	 * to be kernel events, and page faults are never hypervisor +	 * events. +	 */ +	switch (counter->attr.config) { +	case PERF_COUNT_SW_CPU_CLOCK: +		pmu = &perf_ops_cpu_clock; + +		break; +	case PERF_COUNT_SW_TASK_CLOCK: +		/* +		 * If the user instantiates this as a per-cpu counter, +		 * use the cpu_clock counter instead. 
+		 */ +		if (counter->ctx->task) +			pmu = &perf_ops_task_clock; +		else +			pmu = &perf_ops_cpu_clock; + +		break; +	case PERF_COUNT_SW_PAGE_FAULTS: +	case PERF_COUNT_SW_PAGE_FAULTS_MIN: +	case PERF_COUNT_SW_PAGE_FAULTS_MAJ: +	case PERF_COUNT_SW_CONTEXT_SWITCHES: +	case PERF_COUNT_SW_CPU_MIGRATIONS: +		pmu = &perf_ops_generic; +		break; +	} + +	return pmu; +} + +/* + * Allocate and initialize a counter structure + */ +static struct perf_counter * +perf_counter_alloc(struct perf_counter_attr *attr, +		   int cpu, +		   struct perf_counter_context *ctx, +		   struct perf_counter *group_leader, +		   gfp_t gfpflags) +{ +	const struct pmu *pmu; +	struct perf_counter *counter; +	struct hw_perf_counter *hwc; +	long err; + +	counter = kzalloc(sizeof(*counter), gfpflags); +	if (!counter) +		return ERR_PTR(-ENOMEM); + +	/* +	 * Single counters are their own group leaders, with an +	 * empty sibling list: +	 */ +	if (!group_leader) +		group_leader = counter; + +	mutex_init(&counter->child_mutex); +	INIT_LIST_HEAD(&counter->child_list); + +	INIT_LIST_HEAD(&counter->list_entry); +	INIT_LIST_HEAD(&counter->event_entry); +	INIT_LIST_HEAD(&counter->sibling_list); +	init_waitqueue_head(&counter->waitq); + +	mutex_init(&counter->mmap_mutex); + +	counter->cpu		= cpu; +	counter->attr		= *attr; +	counter->group_leader	= group_leader; +	counter->pmu		= NULL; +	counter->ctx		= ctx; +	counter->oncpu		= -1; + +	counter->ns		= get_pid_ns(current->nsproxy->pid_ns); +	counter->id		= atomic64_inc_return(&perf_counter_id); + +	counter->state		= PERF_COUNTER_STATE_INACTIVE; + +	if (attr->disabled) +		counter->state = PERF_COUNTER_STATE_OFF; + +	pmu = NULL; + +	hwc = &counter->hw; +	hwc->sample_period = attr->sample_period; +	if (attr->freq && attr->sample_freq) +		hwc->sample_period = 1; + +	atomic64_set(&hwc->period_left, hwc->sample_period); + +	/* +	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters +	 */ +	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) +		goto done; + +	switch (attr->type) { +	case PERF_TYPE_RAW: +	case PERF_TYPE_HARDWARE: +	case PERF_TYPE_HW_CACHE: +		pmu = hw_perf_counter_init(counter); +		break; + +	case PERF_TYPE_SOFTWARE: +		pmu = sw_perf_counter_init(counter); +		break; + +	case PERF_TYPE_TRACEPOINT: +		pmu = tp_perf_counter_init(counter); +		break; + +	default: +		break; +	} +done: +	err = 0; +	if (!pmu) +		err = -EINVAL; +	else if (IS_ERR(pmu)) +		err = PTR_ERR(pmu); + +	if (err) { +		if (counter->ns) +			put_pid_ns(counter->ns); +		kfree(counter); +		return ERR_PTR(err); +	} + +	counter->pmu = pmu; + +	atomic_inc(&nr_counters); +	if (counter->attr.mmap) +		atomic_inc(&nr_mmap_counters); +	if (counter->attr.comm) +		atomic_inc(&nr_comm_counters); + +	return counter; +} + +static int perf_copy_attr(struct perf_counter_attr __user *uattr, +			  struct perf_counter_attr *attr) +{ +	int ret; +	u32 size; + +	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) +		return -EFAULT; + +	/* +	 * zero the full structure, so that a short copy will be nice. +	 */ +	memset(attr, 0, sizeof(*attr)); + +	ret = get_user(size, &uattr->size); +	if (ret) +		return ret; + +	if (size > PAGE_SIZE)	/* silly large */ +		goto err_size; + +	if (!size)		/* abi compat */ +		size = PERF_ATTR_SIZE_VER0; + +	if (size < PERF_ATTR_SIZE_VER0) +		goto err_size; + +	/* +	 * If we're handed a bigger struct than we know of, +	 * ensure all the unknown bits are 0. 
+	 */ +	if (size > sizeof(*attr)) { +		unsigned long val; +		unsigned long __user *addr; +		unsigned long __user *end; + +		addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr), +				sizeof(unsigned long)); +		end  = PTR_ALIGN((void __user *)uattr + size, +				sizeof(unsigned long)); + +		for (; addr < end; addr += sizeof(unsigned long)) { +			ret = get_user(val, addr); +			if (ret) +				return ret; +			if (val) +				goto err_size; +		} +	} + +	ret = copy_from_user(attr, uattr, size); +	if (ret) +		return -EFAULT; + +	/* +	 * If the type exists, the corresponding creation will verify +	 * the attr->config. +	 */ +	if (attr->type >= PERF_TYPE_MAX) +		return -EINVAL; + +	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) +		return -EINVAL; + +	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) +		return -EINVAL; + +	if (attr->read_format & ~(PERF_FORMAT_MAX-1)) +		return -EINVAL; + +out: +	return ret; + +err_size: +	put_user(sizeof(*attr), &uattr->size); +	ret = -E2BIG; +	goto out; +} + +/** + * sys_perf_counter_open - open a performance counter, associate it to a task/cpu + * + * @attr_uptr:	event type attributes for monitoring/sampling + * @pid:		target pid + * @cpu:		target cpu + * @group_fd:		group leader counter fd + */ +SYSCALL_DEFINE5(perf_counter_open, +		struct perf_counter_attr __user *, attr_uptr, +		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ +	struct perf_counter *counter, *group_leader; +	struct perf_counter_attr attr; +	struct perf_counter_context *ctx; +	struct file *counter_file = NULL; +	struct file *group_file = NULL; +	int fput_needed = 0; +	int fput_needed2 = 0; +	int ret; + +	/* for future expandability... */ +	if (flags) +		return -EINVAL; + +	ret = perf_copy_attr(attr_uptr, &attr); +	if (ret) +		return ret; + +	if (!attr.exclude_kernel) { +		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) +			return -EACCES; +	} + +	if (attr.freq) { +		if (attr.sample_freq > sysctl_perf_counter_sample_rate) +			return -EINVAL; +	} + +	/* +	 * Get the target context (task or percpu): +	 */ +	ctx = find_get_context(pid, cpu); +	if (IS_ERR(ctx)) +		return PTR_ERR(ctx); + +	/* +	 * Look up the group leader (we will attach this counter to it): +	 */ +	group_leader = NULL; +	if (group_fd != -1) { +		ret = -EINVAL; +		group_file = fget_light(group_fd, &fput_needed); +		if (!group_file) +			goto err_put_context; +		if (group_file->f_op != &perf_fops) +			goto err_put_context; + +		group_leader = group_file->private_data; +		/* +		 * Do not allow a recursive hierarchy (this new sibling +		 * becoming part of another group-sibling): +		 */ +		if (group_leader->group_leader != group_leader) +			goto err_put_context; +		/* +		 * Do not allow to attach to a group in a different +		 * task or CPU context: +		 */ +		if (group_leader->ctx != ctx) +			goto err_put_context; +		/* +		 * Only a group leader can be exclusive or pinned +		 */ +		if (attr.exclusive || attr.pinned) +			goto err_put_context; +	} + +	counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, +				     GFP_KERNEL); +	ret = PTR_ERR(counter); +	if (IS_ERR(counter)) +		goto err_put_context; + +	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); +	if (ret < 0) +		goto err_free_put_context; + +	counter_file = fget_light(ret, &fput_needed2); +	if (!counter_file) +		goto err_free_put_context; + +	counter->filp = counter_file; +	WARN_ON_ONCE(ctx->parent_ctx); +	mutex_lock(&ctx->mutex); +	perf_install_in_context(ctx, counter, cpu); +	++ctx->generation; +	mutex_unlock(&ctx->mutex); + 
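/*
 * Illustrative aside, not part of this patch: a minimal sketch of how a
 * user-space program might drive the syscall installed above.  The header
 * path and the __NR_perf_counter_open number are assumptions (they depend
 * on the architecture and the installed ABI headers), but the argument
 * order follows sys_perf_counter_open() and perf_copy_attr() as shown in
 * this file:
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/perf_counter.h>		// assumed uapi header location
 *
 *	struct perf_counter_attr attr = {
 *		.type	= PERF_TYPE_SOFTWARE,
 *		.config	= PERF_COUNT_SW_TASK_CLOCK,
 *		.size	= sizeof(attr),		// versioned size, see perf_copy_attr()
 *	};
 *	unsigned long long count;
 *
 *	// pid == 0: current task, cpu == -1: any cpu, no group leader, flags == 0
 *	int fd = syscall(__NR_perf_counter_open, &attr, 0, -1, -1, 0);
 *	if (fd < 0)
 *		return -1;
 *	do_workload();				// hypothetical workload to measure
 *	read(fd, &count, sizeof(count));	// task time, in nanoseconds
 *	close(fd);
 */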
+	counter->owner = current; +	get_task_struct(current); +	mutex_lock(¤t->perf_counter_mutex); +	list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); +	mutex_unlock(¤t->perf_counter_mutex); + +	fput_light(counter_file, fput_needed2); + +out_fput: +	fput_light(group_file, fput_needed); + +	return ret; + +err_free_put_context: +	kfree(counter); + +err_put_context: +	put_ctx(ctx); + +	goto out_fput; +} + +/* + * inherit a counter from parent task to child task: + */ +static struct perf_counter * +inherit_counter(struct perf_counter *parent_counter, +	      struct task_struct *parent, +	      struct perf_counter_context *parent_ctx, +	      struct task_struct *child, +	      struct perf_counter *group_leader, +	      struct perf_counter_context *child_ctx) +{ +	struct perf_counter *child_counter; + +	/* +	 * Instead of creating recursive hierarchies of counters, +	 * we link inherited counters back to the original parent, +	 * which has a filp for sure, which we use as the reference +	 * count: +	 */ +	if (parent_counter->parent) +		parent_counter = parent_counter->parent; + +	child_counter = perf_counter_alloc(&parent_counter->attr, +					   parent_counter->cpu, child_ctx, +					   group_leader, GFP_KERNEL); +	if (IS_ERR(child_counter)) +		return child_counter; +	get_ctx(child_ctx); + +	/* +	 * Make the child state follow the state of the parent counter, +	 * not its attr.disabled bit.  We hold the parent's mutex, +	 * so we won't race with perf_counter_{en, dis}able_family. +	 */ +	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) +		child_counter->state = PERF_COUNTER_STATE_INACTIVE; +	else +		child_counter->state = PERF_COUNTER_STATE_OFF; + +	if (parent_counter->attr.freq) +		child_counter->hw.sample_period = parent_counter->hw.sample_period; + +	/* +	 * Link it up in the child's context: +	 */ +	add_counter_to_ctx(child_counter, child_ctx); + +	child_counter->parent = parent_counter; +	/* +	 * inherit into child's child as well: +	 */ +	child_counter->attr.inherit = 1; + +	/* +	 * Get a reference to the parent filp - we will fput it +	 * when the child counter exits. 
This is safe to do because +	 * we are in the parent and we know that the filp still +	 * exists and has a nonzero count: +	 */ +	atomic_long_inc(&parent_counter->filp->f_count); + +	/* +	 * Link this into the parent counter's child list +	 */ +	WARN_ON_ONCE(parent_counter->ctx->parent_ctx); +	mutex_lock(&parent_counter->child_mutex); +	list_add_tail(&child_counter->child_list, &parent_counter->child_list); +	mutex_unlock(&parent_counter->child_mutex); + +	return child_counter; +} + +static int inherit_group(struct perf_counter *parent_counter, +	      struct task_struct *parent, +	      struct perf_counter_context *parent_ctx, +	      struct task_struct *child, +	      struct perf_counter_context *child_ctx) +{ +	struct perf_counter *leader; +	struct perf_counter *sub; +	struct perf_counter *child_ctr; + +	leader = inherit_counter(parent_counter, parent, parent_ctx, +				 child, NULL, child_ctx); +	if (IS_ERR(leader)) +		return PTR_ERR(leader); +	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { +		child_ctr = inherit_counter(sub, parent, parent_ctx, +					    child, leader, child_ctx); +		if (IS_ERR(child_ctr)) +			return PTR_ERR(child_ctr); +	} +	return 0; +} + +static void sync_child_counter(struct perf_counter *child_counter, +			       struct perf_counter *parent_counter) +{ +	u64 child_val; + +	child_val = atomic64_read(&child_counter->count); + +	/* +	 * Add back the child's count to the parent's count: +	 */ +	atomic64_add(child_val, &parent_counter->count); +	atomic64_add(child_counter->total_time_enabled, +		     &parent_counter->child_total_time_enabled); +	atomic64_add(child_counter->total_time_running, +		     &parent_counter->child_total_time_running); + +	/* +	 * Remove this counter from the parent's list +	 */ +	WARN_ON_ONCE(parent_counter->ctx->parent_ctx); +	mutex_lock(&parent_counter->child_mutex); +	list_del_init(&child_counter->child_list); +	mutex_unlock(&parent_counter->child_mutex); + +	/* +	 * Release the parent counter, if this was the last +	 * reference to it. +	 */ +	fput(parent_counter->filp); +} + +static void +__perf_counter_exit_task(struct perf_counter *child_counter, +			 struct perf_counter_context *child_ctx) +{ +	struct perf_counter *parent_counter; + +	update_counter_times(child_counter); +	perf_counter_remove_from_context(child_counter); + +	parent_counter = child_counter->parent; +	/* +	 * It can happen that parent exits first, and has counters +	 * that are still around due to the child reference. These +	 * counters need to be zapped - but otherwise linger. +	 */ +	if (parent_counter) { +		sync_child_counter(child_counter, parent_counter); +		free_counter(child_counter); +	} +} + +/* + * When a child task exits, feed back counter values to parent counters. + */ +void perf_counter_exit_task(struct task_struct *child) +{ +	struct perf_counter *child_counter, *tmp; +	struct perf_counter_context *child_ctx; +	unsigned long flags; + +	if (likely(!child->perf_counter_ctxp)) +		return; + +	local_irq_save(flags); +	/* +	 * We can't reschedule here because interrupts are disabled, +	 * and either child is current or it is a task that can't be +	 * scheduled, so we are now safe from rescheduling changing +	 * our context. +	 */ +	child_ctx = child->perf_counter_ctxp; +	__perf_counter_task_sched_out(child_ctx); + +	/* +	 * Take the context lock here so that if find_get_context is +	 * reading child->perf_counter_ctxp, we wait until it has +	 * incremented the context's refcount before we do put_ctx below. 
+	 */ +	spin_lock(&child_ctx->lock); +	child->perf_counter_ctxp = NULL; +	if (child_ctx->parent_ctx) { +		/* +		 * This context is a clone; unclone it so it can't get +		 * swapped to another process while we're removing all +		 * the counters from it. +		 */ +		put_ctx(child_ctx->parent_ctx); +		child_ctx->parent_ctx = NULL; +	} +	spin_unlock(&child_ctx->lock); +	local_irq_restore(flags); + +	/* +	 * We can recurse on the same lock type through: +	 * +	 *   __perf_counter_exit_task() +	 *     sync_child_counter() +	 *       fput(parent_counter->filp) +	 *         perf_release() +	 *           mutex_lock(&ctx->mutex) +	 * +	 * But since its the parent context it won't be the same instance. +	 */ +	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + +again: +	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, +				 list_entry) +		__perf_counter_exit_task(child_counter, child_ctx); + +	/* +	 * If the last counter was a group counter, it will have appended all +	 * its siblings to the list, but we obtained 'tmp' before that which +	 * will still point to the list head terminating the iteration. +	 */ +	if (!list_empty(&child_ctx->counter_list)) +		goto again; + +	mutex_unlock(&child_ctx->mutex); + +	put_ctx(child_ctx); +} + +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. + */ +void perf_counter_free_task(struct task_struct *task) +{ +	struct perf_counter_context *ctx = task->perf_counter_ctxp; +	struct perf_counter *counter, *tmp; + +	if (!ctx) +		return; + +	mutex_lock(&ctx->mutex); +again: +	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { +		struct perf_counter *parent = counter->parent; + +		if (WARN_ON_ONCE(!parent)) +			continue; + +		mutex_lock(&parent->child_mutex); +		list_del_init(&counter->child_list); +		mutex_unlock(&parent->child_mutex); + +		fput(parent->filp); + +		list_del_counter(counter, ctx); +		free_counter(counter); +	} + +	if (!list_empty(&ctx->counter_list)) +		goto again; + +	mutex_unlock(&ctx->mutex); + +	put_ctx(ctx); +} + +/* + * Initialize the perf_counter context in task_struct + */ +int perf_counter_init_task(struct task_struct *child) +{ +	struct perf_counter_context *child_ctx, *parent_ctx; +	struct perf_counter_context *cloned_ctx; +	struct perf_counter *counter; +	struct task_struct *parent = current; +	int inherited_all = 1; +	int ret = 0; + +	child->perf_counter_ctxp = NULL; + +	mutex_init(&child->perf_counter_mutex); +	INIT_LIST_HEAD(&child->perf_counter_list); + +	if (likely(!parent->perf_counter_ctxp)) +		return 0; + +	/* +	 * This is executed from the parent task context, so inherit +	 * counters that have been marked for cloning. +	 * First allocate and initialize a context for the child. +	 */ + +	child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); +	if (!child_ctx) +		return -ENOMEM; + +	__perf_counter_init_context(child_ctx, child); +	child->perf_counter_ctxp = child_ctx; +	get_task_struct(child); + +	/* +	 * If the parent's context is a clone, pin it so it won't get +	 * swapped under us. +	 */ +	parent_ctx = perf_pin_task_context(parent); + +	/* +	 * No need to check if parent_ctx != NULL here; since we saw +	 * it non-NULL earlier, the only reason for it to become NULL +	 * is if we exit, and since we're currently in the middle of +	 * a fork we can't be exiting at the same time. +	 */ + +	/* +	 * Lock the parent list. 
No need to lock the child - not PID +	 * hashed yet and not running, so nobody can access it. +	 */ +	mutex_lock(&parent_ctx->mutex); + +	/* +	 * We dont have to disable NMIs - we are only looking at +	 * the list, not manipulating it: +	 */ +	list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { +		if (counter != counter->group_leader) +			continue; + +		if (!counter->attr.inherit) { +			inherited_all = 0; +			continue; +		} + +		ret = inherit_group(counter, parent, parent_ctx, +					     child, child_ctx); +		if (ret) { +			inherited_all = 0; +			break; +		} +	} + +	if (inherited_all) { +		/* +		 * Mark the child context as a clone of the parent +		 * context, or of whatever the parent is a clone of. +		 * Note that if the parent is a clone, it could get +		 * uncloned at any point, but that doesn't matter +		 * because the list of counters and the generation +		 * count can't have changed since we took the mutex. +		 */ +		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); +		if (cloned_ctx) { +			child_ctx->parent_ctx = cloned_ctx; +			child_ctx->parent_gen = parent_ctx->parent_gen; +		} else { +			child_ctx->parent_ctx = parent_ctx; +			child_ctx->parent_gen = parent_ctx->generation; +		} +		get_ctx(child_ctx->parent_ctx); +	} + +	mutex_unlock(&parent_ctx->mutex); + +	perf_unpin_context(parent_ctx); + +	return ret; +} + +static void __cpuinit perf_counter_init_cpu(int cpu) +{ +	struct perf_cpu_context *cpuctx; + +	cpuctx = &per_cpu(perf_cpu_context, cpu); +	__perf_counter_init_context(&cpuctx->ctx, NULL); + +	spin_lock(&perf_resource_lock); +	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; +	spin_unlock(&perf_resource_lock); + +	hw_perf_counter_setup(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_counter_exit_cpu(void *info) +{ +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); +	struct perf_counter_context *ctx = &cpuctx->ctx; +	struct perf_counter *counter, *tmp; + +	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) +		__perf_counter_remove_from_context(counter); +} +static void perf_counter_exit_cpu(int cpu) +{ +	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); +	struct perf_counter_context *ctx = &cpuctx->ctx; + +	mutex_lock(&ctx->mutex); +	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); +	mutex_unlock(&ctx->mutex); +} +#else +static inline void perf_counter_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ +	unsigned int cpu = (long)hcpu; + +	switch (action) { + +	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN: +		perf_counter_init_cpu(cpu); +		break; + +	case CPU_DOWN_PREPARE: +	case CPU_DOWN_PREPARE_FROZEN: +		perf_counter_exit_cpu(cpu); +		break; + +	default: +		break; +	} + +	return NOTIFY_OK; +} + +/* + * This has to have a higher priority than migration_notifier in sched.c. 
+ */ +static struct notifier_block __cpuinitdata perf_cpu_nb = { +	.notifier_call		= perf_cpu_notify, +	.priority		= 20, +}; + +void __init perf_counter_init(void) +{ +	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, +			(void *)(long)smp_processor_id()); +	register_cpu_notifier(&perf_cpu_nb); +} + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ +	return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, +			const char *buf, +			size_t count) +{ +	struct perf_cpu_context *cpuctx; +	unsigned long val; +	int err, cpu, mpt; + +	err = strict_strtoul(buf, 10, &val); +	if (err) +		return err; +	if (val > perf_max_counters) +		return -EINVAL; + +	spin_lock(&perf_resource_lock); +	perf_reserved_percpu = val; +	for_each_online_cpu(cpu) { +		cpuctx = &per_cpu(perf_cpu_context, cpu); +		spin_lock_irq(&cpuctx->ctx.lock); +		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, +			  perf_max_counters - perf_reserved_percpu); +		cpuctx->max_pertask = mpt; +		spin_unlock_irq(&cpuctx->ctx.lock); +	} +	spin_unlock(&perf_resource_lock); + +	return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ +	return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ +	unsigned long val; +	int err; + +	err = strict_strtoul(buf, 10, &val); +	if (err) +		return err; +	if (val > 1) +		return -EINVAL; + +	spin_lock(&perf_resource_lock); +	perf_overcommit = val; +	spin_unlock(&perf_resource_lock); + +	return count; +} + +static SYSDEV_CLASS_ATTR( +				reserve_percpu, +				0644, +				perf_show_reserve_percpu, +				perf_set_reserve_percpu +			); + +static SYSDEV_CLASS_ATTR( +				overcommit, +				0644, +				perf_show_overcommit, +				perf_set_overcommit +			); + +static struct attribute *perfclass_attrs[] = { +	&attr_reserve_percpu.attr, +	&attr_overcommit.attr, +	NULL +}; + +static struct attribute_group perfclass_attr_group = { +	.attrs			= perfclass_attrs, +	.name			= "perf_counters", +}; + +static int __init perf_counter_sysfs_init(void) +{ +	return sysfs_create_group(&cpu_sysdev_class.kset.kobj, +				  &perfclass_attr_group); +} +device_initcall(perf_counter_sysfs_init); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 23bd4daeb96..72067cbdb37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -116,9 +116,13 @@ config SUSPEND_FREEZER  	  Turning OFF this setting is NOT recommended! If in doubt, say Y. +config HIBERNATION_NVS +	bool +  config HIBERNATION  	bool "Hibernation (aka 'suspend to disk')"  	depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE +	select HIBERNATION_NVS if HAS_IOMEM  	---help---  	  Enable the suspend to disk (STD) functionality, which is usually  	  called "hibernation" in user interfaces.  
STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 720ea4f781b..c3b81c30e5d 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -6,6 +6,9 @@ endif  obj-$(CONFIG_PM)		+= main.o  obj-$(CONFIG_PM_SLEEP)		+= console.o  obj-$(CONFIG_FREEZER)		+= process.o -obj-$(CONFIG_HIBERNATION)	+= swsusp.o disk.o snapshot.o swap.o user.o +obj-$(CONFIG_SUSPEND)		+= suspend.o +obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o +obj-$(CONFIG_HIBERNATION)	+= swsusp.o hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION_NVS)	+= hibernate_nvs.o  obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o diff --git a/kernel/power/disk.c b/kernel/power/hibernate.c index e71ca9cd81b..81d2e746489 100644 --- a/kernel/power/disk.c +++ b/kernel/power/hibernate.c @@ -1,12 +1,12 @@  /* - * kernel/power/disk.c - Suspend-to-disk support. + * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.   *   * Copyright (c) 2003 Patrick Mochel   * Copyright (c) 2003 Open Source Development Lab   * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.   *   * This file is released under the GPLv2. - *   */  #include <linux/suspend.h> @@ -215,19 +215,17 @@ static int create_image(int platform_mode)  	if (error)  		return error; -	device_pm_lock(); - -	/* At this point, device_suspend() has been called, but *not* -	 * device_power_down(). We *must* call device_power_down() now. +	/* At this point, dpm_suspend_start() has been called, but *not* +	 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.  	 * Otherwise, drivers for some devices (e.g. interrupt controllers)  	 * become desynchronized with the actual state of the hardware  	 * at resume time, and evil weirdness ensues.  	 */ -	error = device_power_down(PMSG_FREEZE); +	error = dpm_suspend_noirq(PMSG_FREEZE);  	if (error) {  		printk(KERN_ERR "PM: Some devices failed to power down, "  			"aborting hibernation\n"); -		goto Unlock; +		return error;  	}  	error = platform_pre_snapshot(platform_mode); @@ -241,9 +239,9 @@ static int create_image(int platform_mode)  	local_irq_disable(); -	sysdev_suspend(PMSG_FREEZE); +	error = sysdev_suspend(PMSG_FREEZE);  	if (error) { -		printk(KERN_ERR "PM: Some devices failed to power down, " +		printk(KERN_ERR "PM: Some system devices failed to power down, "  			"aborting hibernation\n");  		goto Enable_irqs;  	} @@ -264,7 +262,7 @@ static int create_image(int platform_mode)   Power_up:  	sysdev_resume(); -	/* NOTE:  device_power_up() is just a resume() for devices +	/* NOTE:  dpm_resume_noirq() is just a resume() for devices  	 * that suspended with irqs off ... no overall powerup.  	 */ @@ -277,12 +275,9 @@ static int create_image(int platform_mode)   Platform_finish:  	platform_finish(platform_mode); -	device_power_up(in_suspend ? +	dpm_resume_noirq(in_suspend ?  		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Unlock: -	device_pm_unlock(); -  	return error;  } @@ -309,7 +304,7 @@ int hibernation_snapshot(int platform_mode)  		goto Close;  	suspend_console(); -	error = device_suspend(PMSG_FREEZE); +	error = dpm_suspend_start(PMSG_FREEZE);  	if (error)  		goto Recover_platform; @@ -320,7 +315,7 @@ int hibernation_snapshot(int platform_mode)  	/* Control returns here after successful restore */   Resume_devices: -	device_resume(in_suspend ? +	dpm_resume_end(in_suspend ?  		(error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);  	resume_console();   Close: @@ -344,13 +339,11 @@ static int resume_target_kernel(bool platform_mode)  {  	int error; -	device_pm_lock(); - -	error = device_power_down(PMSG_QUIESCE); +	error = dpm_suspend_noirq(PMSG_QUIESCE);  	if (error) {  		printk(KERN_ERR "PM: Some devices failed to power down, "  			"aborting resume\n"); -		goto Unlock; +		return error;  	}  	error = platform_pre_restore(platform_mode); @@ -401,10 +394,7 @@ static int resume_target_kernel(bool platform_mode)   Cleanup:  	platform_restore_cleanup(platform_mode); -	device_power_up(PMSG_RECOVER); - - Unlock: -	device_pm_unlock(); +	dpm_resume_noirq(PMSG_RECOVER);  	return error;  } @@ -424,10 +414,10 @@ int hibernation_restore(int platform_mode)  	pm_prepare_console();  	suspend_console(); -	error = device_suspend(PMSG_QUIESCE); +	error = dpm_suspend_start(PMSG_QUIESCE);  	if (!error) {  		error = resume_target_kernel(platform_mode); -		device_resume(PMSG_RECOVER); +		dpm_resume_end(PMSG_RECOVER);  	}  	resume_console();  	pm_restore_console(); @@ -457,18 +447,16 @@ int hibernation_platform_enter(void)  	entering_platform_hibernation = true;  	suspend_console(); -	error = device_suspend(PMSG_HIBERNATE); +	error = dpm_suspend_start(PMSG_HIBERNATE);  	if (error) {  		if (hibernation_ops->recover)  			hibernation_ops->recover();  		goto Resume_devices;  	} -	device_pm_lock(); - -	error = device_power_down(PMSG_HIBERNATE); +	error = dpm_suspend_noirq(PMSG_HIBERNATE);  	if (error) -		goto Unlock; +		goto Resume_devices;  	error = hibernation_ops->prepare();  	if (error) @@ -491,14 +479,11 @@ int hibernation_platform_enter(void)   Platofrm_finish:  	hibernation_ops->finish(); -	device_power_up(PMSG_RESTORE); - - Unlock: -	device_pm_unlock(); +	dpm_suspend_noirq(PMSG_RESTORE);   Resume_devices:  	entering_platform_hibernation = false; -	device_resume(PMSG_RESTORE); +	dpm_resume_end(PMSG_RESTORE);  	resume_console();   Close: diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c new file mode 100644 index 00000000000..39ac698ef83 --- /dev/null +++ b/kernel/power/hibernate_nvs.c @@ -0,0 +1,135 @@ +/* + * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory + * + * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/suspend.h> + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * hibernation and to restore the contents of this memory during the subsequent + * resume.  The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { +	unsigned long phys_start; +	unsigned int size; +	void *kaddr; +	void *data; +	struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + *	hibernate_nvs_register - register platform NVS memory region to save + *	@start - physical address of the region + *	@size - size of the region + * + *	The NVS region need not be page-aligned (both ends) and we arrange + *	things so that the data from page-aligned addresses in this region will + *	be copied into separate RAM pages. 
+ */ +int hibernate_nvs_register(unsigned long start, unsigned long size) +{ +	struct nvs_page *entry, *next; + +	while (size > 0) { +		unsigned int nr_bytes; + +		entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); +		if (!entry) +			goto Error; + +		list_add_tail(&entry->node, &nvs_list); +		entry->phys_start = start; +		nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); +		entry->size = (size < nr_bytes) ? size : nr_bytes; + +		start += entry->size; +		size -= entry->size; +	} +	return 0; + + Error: +	list_for_each_entry_safe(entry, next, &nvs_list, node) { +		list_del(&entry->node); +		kfree(entry); +	} +	return -ENOMEM; +} + +/** + *	hibernate_nvs_free - free data pages allocated for saving NVS regions + */ +void hibernate_nvs_free(void) +{ +	struct nvs_page *entry; + +	list_for_each_entry(entry, &nvs_list, node) +		if (entry->data) { +			free_page((unsigned long)entry->data); +			entry->data = NULL; +			if (entry->kaddr) { +				iounmap(entry->kaddr); +				entry->kaddr = NULL; +			} +		} +} + +/** + *	hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int hibernate_nvs_alloc(void) +{ +	struct nvs_page *entry; + +	list_for_each_entry(entry, &nvs_list, node) { +		entry->data = (void *)__get_free_page(GFP_KERNEL); +		if (!entry->data) { +			hibernate_nvs_free(); +			return -ENOMEM; +		} +	} +	return 0; +} + +/** + *	hibernate_nvs_save - save NVS memory regions + */ +void hibernate_nvs_save(void) +{ +	struct nvs_page *entry; + +	printk(KERN_INFO "PM: Saving platform NVS memory\n"); + +	list_for_each_entry(entry, &nvs_list, node) +		if (entry->data) { +			entry->kaddr = ioremap(entry->phys_start, entry->size); +			memcpy(entry->data, entry->kaddr, entry->size); +		} +} + +/** + *	hibernate_nvs_restore - restore NVS memory regions + * + *	This function is going to be called with interrupts disabled, so it + *	cannot iounmap the virtual addresses used to access the NVS region. + */ +void hibernate_nvs_restore(void) +{ +	struct nvs_page *entry; + +	printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + +	list_for_each_entry(entry, &nvs_list, node) +		if (entry->data) +			memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/kernel/power/main.c b/kernel/power/main.c index f99ed6a75ea..f710e36930c 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -8,20 +8,9 @@   *   */ -#include <linux/module.h> -#include <linux/suspend.h>  #include <linux/kobject.h>  #include <linux/string.h> -#include <linux/delay.h> -#include <linux/errno.h> -#include <linux/kmod.h> -#include <linux/init.h> -#include <linux/console.h> -#include <linux/cpu.h>  #include <linux/resume-trace.h> -#include <linux/freezer.h> -#include <linux/vmstat.h> -#include <linux/syscalls.h>  #include "power.h" @@ -119,378 +108,6 @@ power_attr(pm_test);  #endif /* CONFIG_PM_SLEEP */ -#ifdef CONFIG_SUSPEND - -static int suspend_test(int level) -{ -#ifdef CONFIG_PM_DEBUG -	if (pm_test_level == level) { -		printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); -		mdelay(5000); -		return 1; -	} -#endif /* !CONFIG_PM_DEBUG */ -	return 0; -} - -#ifdef CONFIG_PM_TEST_SUSPEND - -/* - * We test the system suspend code by setting an RTC wakealarm a short - * time in the future, then suspending.  Suspending the devices won't - * normally take long ... some systems only need a few milliseconds. - * - * The time it takes is system-specific though, so when we test this - * during system bootup we allow a LOT of time. 
- */ -#define TEST_SUSPEND_SECONDS	5 - -static unsigned long suspend_test_start_time; - -static void suspend_test_start(void) -{ -	/* FIXME Use better timebase than "jiffies", ideally a clocksource. -	 * What we want is a hardware counter that will work correctly even -	 * during the irqs-are-off stages of the suspend/resume cycle... -	 */ -	suspend_test_start_time = jiffies; -} - -static void suspend_test_finish(const char *label) -{ -	long nj = jiffies - suspend_test_start_time; -	unsigned msec; - -	msec = jiffies_to_msecs(abs(nj)); -	pr_info("PM: %s took %d.%03d seconds\n", label, -			msec / 1000, msec % 1000); - -	/* Warning on suspend means the RTC alarm period needs to be -	 * larger -- the system was sooo slooowwww to suspend that the -	 * alarm (should have) fired before the system went to sleep! -	 * -	 * Warning on either suspend or resume also means the system -	 * has some performance issues.  The stack dump of a WARN_ON -	 * is more likely to get the right attention than a printk... -	 */ -	WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); -} - -#else - -static void suspend_test_start(void) -{ -} - -static void suspend_test_finish(const char *label) -{ -} - -#endif - -/* This is just an arbitrary number */ -#define FREE_PAGE_NUMBER (100) - -static struct platform_suspend_ops *suspend_ops; - -/** - *	suspend_set_ops - Set the global suspend method table. - *	@ops:	Pointer to ops structure. - */ - -void suspend_set_ops(struct platform_suspend_ops *ops) -{ -	mutex_lock(&pm_mutex); -	suspend_ops = ops; -	mutex_unlock(&pm_mutex); -} - -/** - * suspend_valid_only_mem - generic memory-only valid callback - * - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. - */ -int suspend_valid_only_mem(suspend_state_t state) -{ -	return state == PM_SUSPEND_MEM; -} - -/** - *	suspend_prepare - Do prep work before entering low-power state. - * - *	This is common code that is called for each state that we're entering. - *	Run suspend notifiers, allocate a console and stop all processes. - */ -static int suspend_prepare(void) -{ -	int error; -	unsigned int free_pages; - -	if (!suspend_ops || !suspend_ops->enter) -		return -EPERM; - -	pm_prepare_console(); - -	error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); -	if (error) -		goto Finish; - -	error = usermodehelper_disable(); -	if (error) -		goto Finish; - -	if (suspend_freeze_processes()) { -		error = -EAGAIN; -		goto Thaw; -	} - -	free_pages = global_page_state(NR_FREE_PAGES); -	if (free_pages < FREE_PAGE_NUMBER) { -		pr_debug("PM: free some memory\n"); -		shrink_all_memory(FREE_PAGE_NUMBER - free_pages); -		if (nr_free_pages() < FREE_PAGE_NUMBER) { -			error = -ENOMEM; -			printk(KERN_ERR "PM: No enough memory\n"); -		} -	} -	if (!error) -		return 0; - - Thaw: -	suspend_thaw_processes(); -	usermodehelper_enable(); - Finish: -	pm_notifier_call_chain(PM_POST_SUSPEND); -	pm_restore_console(); -	return error; -} - -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) -{ -	local_irq_disable(); -} - -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) -{ -	local_irq_enable(); -} - -/** - *	suspend_enter - enter the desired system sleep state. - *	@state:		state to enter - * - *	This function should be called after devices have been suspended. 
- */ -static int suspend_enter(suspend_state_t state) -{ -	int error; - -	device_pm_lock(); - -	if (suspend_ops->prepare) { -		error = suspend_ops->prepare(); -		if (error) -			goto Done; -	} - -	error = device_power_down(PMSG_SUSPEND); -	if (error) { -		printk(KERN_ERR "PM: Some devices failed to power down\n"); -		goto Platfrom_finish; -	} - -	if (suspend_ops->prepare_late) { -		error = suspend_ops->prepare_late(); -		if (error) -			goto Power_up_devices; -	} - -	if (suspend_test(TEST_PLATFORM)) -		goto Platform_wake; - -	error = disable_nonboot_cpus(); -	if (error || suspend_test(TEST_CPUS)) -		goto Enable_cpus; - -	arch_suspend_disable_irqs(); -	BUG_ON(!irqs_disabled()); - -	error = sysdev_suspend(PMSG_SUSPEND); -	if (!error) { -		if (!suspend_test(TEST_CORE)) -			error = suspend_ops->enter(state); -		sysdev_resume(); -	} - -	arch_suspend_enable_irqs(); -	BUG_ON(irqs_disabled()); - - Enable_cpus: -	enable_nonboot_cpus(); - - Platform_wake: -	if (suspend_ops->wake) -		suspend_ops->wake(); - - Power_up_devices: -	device_power_up(PMSG_RESUME); - - Platfrom_finish: -	if (suspend_ops->finish) -		suspend_ops->finish(); - - Done: -	device_pm_unlock(); - -	return error; -} - -/** - *	suspend_devices_and_enter - suspend devices and enter the desired system - *				    sleep state. - *	@state:		  state to enter - */ -int suspend_devices_and_enter(suspend_state_t state) -{ -	int error; - -	if (!suspend_ops) -		return -ENOSYS; - -	if (suspend_ops->begin) { -		error = suspend_ops->begin(state); -		if (error) -			goto Close; -	} -	suspend_console(); -	suspend_test_start(); -	error = device_suspend(PMSG_SUSPEND); -	if (error) { -		printk(KERN_ERR "PM: Some devices failed to suspend\n"); -		goto Recover_platform; -	} -	suspend_test_finish("suspend devices"); -	if (suspend_test(TEST_DEVICES)) -		goto Recover_platform; - -	suspend_enter(state); - - Resume_devices: -	suspend_test_start(); -	device_resume(PMSG_RESUME); -	suspend_test_finish("resume devices"); -	resume_console(); - Close: -	if (suspend_ops->end) -		suspend_ops->end(); -	return error; - - Recover_platform: -	if (suspend_ops->recover) -		suspend_ops->recover(); -	goto Resume_devices; -} - -/** - *	suspend_finish - Do final work before exiting suspend sequence. - * - *	Call platform code to clean up, restart processes, and free the  - *	console that we've allocated. This is not called for suspend-to-disk. - */ -static void suspend_finish(void) -{ -	suspend_thaw_processes(); -	usermodehelper_enable(); -	pm_notifier_call_chain(PM_POST_SUSPEND); -	pm_restore_console(); -} - - - - -static const char * const pm_states[PM_SUSPEND_MAX] = { -	[PM_SUSPEND_STANDBY]	= "standby", -	[PM_SUSPEND_MEM]	= "mem", -}; - -static inline int valid_state(suspend_state_t state) -{ -	/* All states need lowlevel support and need to be valid -	 * to the lowlevel implementation, no valid callback -	 * implies that none are valid. */ -	if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state)) -		return 0; -	return 1; -} - - -/** - *	enter_state - Do common work of entering low-power state. - *	@state:		pm_state structure for state we're entering. - * - *	Make sure we're the only ones trying to enter a sleep state. Fail - *	if someone has beat us to it, since we don't want anything weird to - *	happen when we wake up. - *	Then, do the setup for suspend, enter the state, and cleaup (after - *	we've woken up). 
- */ -static int enter_state(suspend_state_t state) -{ -	int error; - -	if (!valid_state(state)) -		return -ENODEV; - -	if (!mutex_trylock(&pm_mutex)) -		return -EBUSY; - -	printk(KERN_INFO "PM: Syncing filesystems ... "); -	sys_sync(); -	printk("done.\n"); - -	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); -	error = suspend_prepare(); -	if (error) -		goto Unlock; - -	if (suspend_test(TEST_FREEZER)) -		goto Finish; - -	pr_debug("PM: Entering %s sleep\n", pm_states[state]); -	error = suspend_devices_and_enter(state); - - Finish: -	pr_debug("PM: Finishing wakeup.\n"); -	suspend_finish(); - Unlock: -	mutex_unlock(&pm_mutex); -	return error; -} - - -/** - *	pm_suspend - Externally visible function for suspending system. - *	@state:		Enumerated value of state to enter. - * - *	Determine whether or not value is within range, get state  - *	structure, and enter (above). - */ - -int pm_suspend(suspend_state_t state) -{ -	if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) -		return enter_state(state); -	return -EINVAL; -} - -EXPORT_SYMBOL(pm_suspend); - -#endif /* CONFIG_SUSPEND */ -  struct kobject *power_kobj;  /** @@ -503,7 +120,6 @@ struct kobject *power_kobj;   *	store() accepts one of those strings, translates it into the    *	proper enumerated value, and initiates a suspend transition.   */ -  static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,  			  char *buf)  { @@ -601,7 +217,6 @@ static struct attribute_group attr_group = {  	.attrs = g,  }; -  static int __init pm_init(void)  {  	power_kobj = kobject_create_and_add("power", NULL); @@ -611,144 +226,3 @@ static int __init pm_init(void)  }  core_initcall(pm_init); - - -#ifdef CONFIG_PM_TEST_SUSPEND - -#include <linux/rtc.h> - -/* - * To test system suspend, we need a hands-off mechanism to resume the - * system.  RTCs wake alarms are a common self-contained mechanism. - */ - -static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) -{ -	static char err_readtime[] __initdata = -		KERN_ERR "PM: can't read %s time, err %d\n"; -	static char err_wakealarm [] __initdata = -		KERN_ERR "PM: can't set %s wakealarm, err %d\n"; -	static char err_suspend[] __initdata = -		KERN_ERR "PM: suspend test failed, error %d\n"; -	static char info_test[] __initdata = -		KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; - -	unsigned long		now; -	struct rtc_wkalrm	alm; -	int			status; - -	/* this may fail if the RTC hasn't been initialized */ -	status = rtc_read_time(rtc, &alm.time); -	if (status < 0) { -		printk(err_readtime, dev_name(&rtc->dev), status); -		return; -	} -	rtc_tm_to_time(&alm.time, &now); - -	memset(&alm, 0, sizeof alm); -	rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); -	alm.enabled = true; - -	status = rtc_set_alarm(rtc, &alm); -	if (status < 0) { -		printk(err_wakealarm, dev_name(&rtc->dev), status); -		return; -	} - -	if (state == PM_SUSPEND_MEM) { -		printk(info_test, pm_states[state]); -		status = pm_suspend(state); -		if (status == -ENODEV) -			state = PM_SUSPEND_STANDBY; -	} -	if (state == PM_SUSPEND_STANDBY) { -		printk(info_test, pm_states[state]); -		status = pm_suspend(state); -	} -	if (status < 0) -		printk(err_suspend, status); - -	/* Some platforms can't detect that the alarm triggered the -	 * wakeup, or (accordingly) disable it after it afterwards. -	 * It's supposed to give oneshot behavior; cope. 
-	 */ -	alm.enabled = false; -	rtc_set_alarm(rtc, &alm); -} - -static int __init has_wakealarm(struct device *dev, void *name_ptr) -{ -	struct rtc_device *candidate = to_rtc_device(dev); - -	if (!candidate->ops->set_alarm) -		return 0; -	if (!device_may_wakeup(candidate->dev.parent)) -		return 0; - -	*(const char **)name_ptr = dev_name(dev); -	return 1; -} - -/* - * Kernel options like "test_suspend=mem" force suspend/resume sanity tests - * at startup time.  They're normally disabled, for faster boot and because - * we can't know which states really work on this particular system. - */ -static suspend_state_t test_state __initdata = PM_SUSPEND_ON; - -static char warn_bad_state[] __initdata = -	KERN_WARNING "PM: can't test '%s' suspend state\n"; - -static int __init setup_test_suspend(char *value) -{ -	unsigned i; - -	/* "=mem" ==> "mem" */ -	value++; -	for (i = 0; i < PM_SUSPEND_MAX; i++) { -		if (!pm_states[i]) -			continue; -		if (strcmp(pm_states[i], value) != 0) -			continue; -		test_state = (__force suspend_state_t) i; -		return 0; -	} -	printk(warn_bad_state, value); -	return 0; -} -__setup("test_suspend", setup_test_suspend); - -static int __init test_suspend(void) -{ -	static char		warn_no_rtc[] __initdata = -		KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; - -	char			*pony = NULL; -	struct rtc_device	*rtc = NULL; - -	/* PM is initialized by now; is that state testable? */ -	if (test_state == PM_SUSPEND_ON) -		goto done; -	if (!valid_state(test_state)) { -		printk(warn_bad_state, pm_states[test_state]); -		goto done; -	} - -	/* RTCs have initialized by now too ... can we use one? */ -	class_find_device(rtc_class, NULL, &pony, has_wakealarm); -	if (pony) -		rtc = rtc_class_open(pony); -	if (!rtc) { -		printk(warn_no_rtc); -		goto done; -	} - -	/* go for it */ -	test_wakealarm(rtc, test_state); -	rtc_class_close(rtc); -done: -	return 0; -} -late_initcall(test_suspend); - -#endif /* CONFIG_PM_TEST_SUSPEND */ diff --git a/kernel/power/power.h b/kernel/power/power.h index 46b5ec7a3af..26d5a26f82e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -45,7 +45,7 @@ static inline char *check_image_kernel(struct swsusp_info *info)   */  #define SPARE_PAGES	((1024 * 1024) >> PAGE_SHIFT) -/* kernel/power/disk.c */ +/* kernel/power/hibernate.c */  extern int hibernation_snapshot(int platform_mode);  extern int hibernation_restore(int platform_mode);  extern int hibernation_platform_enter(void); @@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);  extern int create_basic_memory_bitmaps(void);  extern void free_basic_memory_bitmaps(void); -extern unsigned int count_data_pages(void); +extern int swsusp_shrink_memory(void);  /**   *	Auxiliary structure used for reading the snapshot image data and @@ -147,9 +147,8 @@ extern int swsusp_swap_in_use(void);   */  #define SF_PLATFORM_MODE	1 -/* kernel/power/disk.c */ +/* kernel/power/hibernate.c */  extern int swsusp_check(void); -extern int swsusp_shrink_memory(void);  extern void swsusp_free(void);  extern int swsusp_read(unsigned int *flags_p);  extern int swsusp_write(unsigned int flags); @@ -161,22 +160,36 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,  				unsigned int, char *);  #ifdef CONFIG_SUSPEND -/* kernel/power/main.c */ +/* kernel/power/suspend.c */ +extern const char *const pm_states[]; + +extern bool valid_state(suspend_state_t state);  extern int suspend_devices_and_enter(suspend_state_t state); +extern int enter_state(suspend_state_t state);  #else /* !CONFIG_SUSPEND */  
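/*
 * Illustrative aside, not from this patch: the extern declarations above and
 * the static inline stubs below follow the usual kernel pattern for optional
 * features -- when CONFIG_SUSPEND is off, callers still compile and simply
 * get -ENOSYS or a no-op, so call sites need no #ifdef of their own.  A
 * minimal sketch of the pattern, using hypothetical names:
 *
 *	#ifdef CONFIG_FOO
 *	extern int foo_enter(int state);	// real implementation lives in foo.c
 *	#else
 *	static inline int foo_enter(int state) { return -ENOSYS; }
 *	#endif
 *
 * The same idea is used a few lines further down for the
 * CONFIG_PM_TEST_SUSPEND helpers, whose stubs are empty functions.
 */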
static inline int suspend_devices_and_enter(suspend_state_t state)  {  	return -ENOSYS;  } +static inline int enter_state(suspend_state_t state) { return -ENOSYS; } +static inline bool valid_state(suspend_state_t state) { return false; }  #endif /* !CONFIG_SUSPEND */ +#ifdef CONFIG_PM_TEST_SUSPEND +/* kernel/power/suspend_test.c */ +extern void suspend_test_start(void); +extern void suspend_test_finish(const char *label); +#else /* !CONFIG_PM_TEST_SUSPEND */ +static inline void suspend_test_start(void) {} +static inline void suspend_test_finish(const char *label) {} +#endif /* !CONFIG_PM_TEST_SUSPEND */ +  #ifdef CONFIG_PM_SLEEP  /* kernel/power/main.c */  extern int pm_notifier_call_chain(unsigned long val);  #endif  #ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void);  int restore_highmem(void);  #else  static inline unsigned int count_highmem_pages(void) { return 0; } diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 97890831e1b..e8b33700627 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -34,7 +34,7 @@ static struct sysrq_key_op	sysrq_poweroff_op = {  	.handler        = handle_poweroff,  	.help_msg       = "powerOff",  	.action_msg     = "Power Off", - 	.enable_mask	= SYSRQ_ENABLE_BOOT, +	.enable_mask	= SYSRQ_ENABLE_BOOT,  };  static int pm_sysrq_init(void) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 33e2e4a819f..523a451b45d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -39,6 +39,14 @@ static int swsusp_page_is_free(struct page *);  static void swsusp_set_page_forbidden(struct page *);  static void swsusp_unset_page_forbidden(struct page *); +/* + * Preferred image size in bytes (tunable via /sys/power/image_size). + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N bytes, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned long image_size = 500 * 1024 * 1024; +  /* List of PBEs needed for restoring the pages that were allocated before   * the suspend and included in the suspend image, but have also been   * allocated by the "resume" kernel, so their contents cannot be written @@ -840,7 +848,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)   *	pages.   */ -unsigned int count_highmem_pages(void) +static unsigned int count_highmem_pages(void)  {  	struct zone *zone;  	unsigned int n = 0; @@ -902,7 +910,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)   *	pages.   */ -unsigned int count_data_pages(void) +static unsigned int count_data_pages(void)  {  	struct zone *zone;  	unsigned long pfn, max_zone_pfn; @@ -1058,6 +1066,74 @@ void swsusp_free(void)  	buffer = NULL;  } +/** + *	swsusp_shrink_memory -  Try to free as much memory as needed + * + *	... but do not OOM-kill anyone + * + *	Notice: all userland should be stopped before it is called, or + *	livelock is possible. + */ + +#define SHRINK_BITE	10000 +static inline unsigned long __shrink_memory(long tmp) +{ +	if (tmp > SHRINK_BITE) +		tmp = SHRINK_BITE; +	return shrink_all_memory(tmp); +} + +int swsusp_shrink_memory(void) +{ +	long tmp; +	struct zone *zone; +	unsigned long pages = 0; +	unsigned int i = 0; +	char *p = "-\\|/"; +	struct timeval start, stop; + +	printk(KERN_INFO "PM: Shrinking memory...  
"); +	do_gettimeofday(&start); +	do { +		long size, highmem_size; + +		highmem_size = count_highmem_pages(); +		size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; +		tmp = size; +		size += highmem_size; +		for_each_populated_zone(zone) { +			tmp += snapshot_additional_pages(zone); +			if (is_highmem(zone)) { +				highmem_size -= +					zone_page_state(zone, NR_FREE_PAGES); +			} else { +				tmp -= zone_page_state(zone, NR_FREE_PAGES); +				tmp += zone->lowmem_reserve[ZONE_NORMAL]; +			} +		} + +		if (highmem_size < 0) +			highmem_size = 0; + +		tmp += highmem_size; +		if (tmp > 0) { +			tmp = __shrink_memory(tmp); +			if (!tmp) +				return -ENOMEM; +			pages += tmp; +		} else if (size > image_size / PAGE_SIZE) { +			tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); +			pages += tmp; +		} +		printk("\b%c", p[i++%4]); +	} while (tmp > 0); +	do_gettimeofday(&stop); +	printk("\bdone (%lu pages freed)\n", pages); +	swsusp_show_speed(&start, &stop, pages, "Freed"); + +	return 0; +} +  #ifdef CONFIG_HIGHMEM  /**    *	count_pages_for_highmem - compute the number of non-highmem pages diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c new file mode 100644 index 00000000000..6f10dfc2d3e --- /dev/null +++ b/kernel/power/suspend.c @@ -0,0 +1,300 @@ +/* + * kernel/power/suspend.c - Suspend to RAM and standby functionality. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/console.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> + +#include "power.h" + +const char *const pm_states[PM_SUSPEND_MAX] = { +	[PM_SUSPEND_STANDBY]	= "standby", +	[PM_SUSPEND_MEM]	= "mem", +}; + +static struct platform_suspend_ops *suspend_ops; + +/** + *	suspend_set_ops - Set the global suspend method table. + *	@ops:	Pointer to ops structure. + */ +void suspend_set_ops(struct platform_suspend_ops *ops) +{ +	mutex_lock(&pm_mutex); +	suspend_ops = ops; +	mutex_unlock(&pm_mutex); +} + +bool valid_state(suspend_state_t state) +{ +	/* +	 * All states need lowlevel support and need to be valid to the lowlevel +	 * implementation, no valid callback implies that none are valid. +	 */ +	return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); +} + +/** + * suspend_valid_only_mem - generic memory-only valid callback + * + * Platform drivers that implement mem suspend only and only need + * to check for that in their .valid callback can use this instead + * of rolling their own .valid callback. + */ +int suspend_valid_only_mem(suspend_state_t state) +{ +	return state == PM_SUSPEND_MEM; +} + +static int suspend_test(int level) +{ +#ifdef CONFIG_PM_DEBUG +	if (pm_test_level == level) { +		printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); +		mdelay(5000); +		return 1; +	} +#endif /* !CONFIG_PM_DEBUG */ +	return 0; +} + +/** + *	suspend_prepare - Do prep work before entering low-power state. + * + *	This is common code that is called for each state that we're entering. + *	Run suspend notifiers, allocate a console and stop all processes. 
+ */ +static int suspend_prepare(void) +{ +	int error; + +	if (!suspend_ops || !suspend_ops->enter) +		return -EPERM; + +	pm_prepare_console(); + +	error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); +	if (error) +		goto Finish; + +	error = usermodehelper_disable(); +	if (error) +		goto Finish; + +	error = suspend_freeze_processes(); +	if (!error) +		return 0; + +	suspend_thaw_processes(); +	usermodehelper_enable(); + Finish: +	pm_notifier_call_chain(PM_POST_SUSPEND); +	pm_restore_console(); +	return error; +} + +/* default implementation */ +void __attribute__ ((weak)) arch_suspend_disable_irqs(void) +{ +	local_irq_disable(); +} + +/* default implementation */ +void __attribute__ ((weak)) arch_suspend_enable_irqs(void) +{ +	local_irq_enable(); +} + +/** + *	suspend_enter - enter the desired system sleep state. + *	@state:		state to enter + * + *	This function should be called after devices have been suspended. + */ +static int suspend_enter(suspend_state_t state) +{ +	int error; + +	if (suspend_ops->prepare) { +		error = suspend_ops->prepare(); +		if (error) +			return error; +	} + +	error = dpm_suspend_noirq(PMSG_SUSPEND); +	if (error) { +		printk(KERN_ERR "PM: Some devices failed to power down\n"); +		goto Platfrom_finish; +	} + +	if (suspend_ops->prepare_late) { +		error = suspend_ops->prepare_late(); +		if (error) +			goto Power_up_devices; +	} + +	if (suspend_test(TEST_PLATFORM)) +		goto Platform_wake; + +	error = disable_nonboot_cpus(); +	if (error || suspend_test(TEST_CPUS)) +		goto Enable_cpus; + +	arch_suspend_disable_irqs(); +	BUG_ON(!irqs_disabled()); + +	error = sysdev_suspend(PMSG_SUSPEND); +	if (!error) { +		if (!suspend_test(TEST_CORE)) +			error = suspend_ops->enter(state); +		sysdev_resume(); +	} + +	arch_suspend_enable_irqs(); +	BUG_ON(irqs_disabled()); + + Enable_cpus: +	enable_nonboot_cpus(); + + Platform_wake: +	if (suspend_ops->wake) +		suspend_ops->wake(); + + Power_up_devices: +	dpm_resume_noirq(PMSG_RESUME); + + Platfrom_finish: +	if (suspend_ops->finish) +		suspend_ops->finish(); + +	return error; +} + +/** + *	suspend_devices_and_enter - suspend devices and enter the desired system + *				    sleep state. + *	@state:		  state to enter + */ +int suspend_devices_and_enter(suspend_state_t state) +{ +	int error; + +	if (!suspend_ops) +		return -ENOSYS; + +	if (suspend_ops->begin) { +		error = suspend_ops->begin(state); +		if (error) +			goto Close; +	} +	suspend_console(); +	suspend_test_start(); +	error = dpm_suspend_start(PMSG_SUSPEND); +	if (error) { +		printk(KERN_ERR "PM: Some devices failed to suspend\n"); +		goto Recover_platform; +	} +	suspend_test_finish("suspend devices"); +	if (suspend_test(TEST_DEVICES)) +		goto Recover_platform; + +	suspend_enter(state); + + Resume_devices: +	suspend_test_start(); +	dpm_resume_end(PMSG_RESUME); +	suspend_test_finish("resume devices"); +	resume_console(); + Close: +	if (suspend_ops->end) +		suspend_ops->end(); +	return error; + + Recover_platform: +	if (suspend_ops->recover) +		suspend_ops->recover(); +	goto Resume_devices; +} + +/** + *	suspend_finish - Do final work before exiting suspend sequence. + * + *	Call platform code to clean up, restart processes, and free the + *	console that we've allocated. This is not called for suspend-to-disk. + */ +static void suspend_finish(void) +{ +	suspend_thaw_processes(); +	usermodehelper_enable(); +	pm_notifier_call_chain(PM_POST_SUSPEND); +	pm_restore_console(); +} + +/** + *	enter_state - Do common work of entering low-power state. 
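suspend_enter() and suspend_devices_and_enter() above both follow the usual kernel error-unwind idiom: each setup step that fails jumps to a label that undoes only the steps already completed, in reverse order, and the success path falls through the same teardown labels. A compact standalone illustration of that shape (the step_*/undo_* functions are invented for the sketch and merely stand in for the platform hooks, dpm_suspend_noirq()/dpm_resume_noirq() and friends):

#include <stdio.h>

static int  step_a(void) { puts("a: prepare");         return 0;  }
static void undo_a(void) { puts("a: finish");                     }
static int  step_b(void) { puts("b: suspend devices"); return -1; /* simulate failure */ }
static void undo_b(void) { puts("b: resume devices");             }
static int  step_c(void) { puts("c: enter state");     return 0;  }

static int enter(void)
{
	int error;

	error = step_a();
	if (error)
		goto Out;

	error = step_b();
	if (error)
		goto Undo_a;            /* only undo what already succeeded */

	error = step_c();

	/* fall through: unwind in reverse order even on success */
	undo_b();
 Undo_a:
	undo_a();
 Out:
	return error;
}

int main(void)
{
	printf("enter() returned %d\n", enter());
	return 0;
}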
+ *	@state:		pm_state structure for state we're entering. + * + *	Make sure we're the only ones trying to enter a sleep state. Fail + *	if someone has beat us to it, since we don't want anything weird to + *	happen when we wake up. + *	Then, do the setup for suspend, enter the state, and cleaup (after + *	we've woken up). + */ +int enter_state(suspend_state_t state) +{ +	int error; + +	if (!valid_state(state)) +		return -ENODEV; + +	if (!mutex_trylock(&pm_mutex)) +		return -EBUSY; + +	printk(KERN_INFO "PM: Syncing filesystems ... "); +	sys_sync(); +	printk("done.\n"); + +	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); +	error = suspend_prepare(); +	if (error) +		goto Unlock; + +	if (suspend_test(TEST_FREEZER)) +		goto Finish; + +	pr_debug("PM: Entering %s sleep\n", pm_states[state]); +	error = suspend_devices_and_enter(state); + + Finish: +	pr_debug("PM: Finishing wakeup.\n"); +	suspend_finish(); + Unlock: +	mutex_unlock(&pm_mutex); +	return error; +} + +/** + *	pm_suspend - Externally visible function for suspending system. + *	@state:		Enumerated value of state to enter. + * + *	Determine whether or not value is within range, get state + *	structure, and enter (above). + */ +int pm_suspend(suspend_state_t state) +{ +	if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) +		return enter_state(state); +	return -EINVAL; +} +EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c new file mode 100644 index 00000000000..17d8bb1acf9 --- /dev/null +++ b/kernel/power/suspend_test.c @@ -0,0 +1,187 @@ +/* + * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. + * + * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz> + * + * This file is released under the GPLv2. + */ + +#include <linux/init.h> +#include <linux/rtc.h> + +#include "power.h" + +/* + * We test the system suspend code by setting an RTC wakealarm a short + * time in the future, then suspending.  Suspending the devices won't + * normally take long ... some systems only need a few milliseconds. + * + * The time it takes is system-specific though, so when we test this + * during system bootup we allow a LOT of time. + */ +#define TEST_SUSPEND_SECONDS	5 + +static unsigned long suspend_test_start_time; + +void suspend_test_start(void) +{ +	/* FIXME Use better timebase than "jiffies", ideally a clocksource. +	 * What we want is a hardware counter that will work correctly even +	 * during the irqs-are-off stages of the suspend/resume cycle... +	 */ +	suspend_test_start_time = jiffies; +} + +void suspend_test_finish(const char *label) +{ +	long nj = jiffies - suspend_test_start_time; +	unsigned msec; + +	msec = jiffies_to_msecs(abs(nj)); +	pr_info("PM: %s took %d.%03d seconds\n", label, +			msec / 1000, msec % 1000); + +	/* Warning on suspend means the RTC alarm period needs to be +	 * larger -- the system was sooo slooowwww to suspend that the +	 * alarm (should have) fired before the system went to sleep! +	 * +	 * Warning on either suspend or resume also means the system +	 * has some performance issues.  The stack dump of a WARN_ON +	 * is more likely to get the right attention than a printk... +	 */ +	WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); +} + +/* + * To test system suspend, we need a hands-off mechanism to resume the + * system.  RTCs wake alarms are a common self-contained mechanism. 
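suspend_test_start()/suspend_test_finish() above simply timestamp a phase with jiffies and WARN() when it overruns TEST_SUSPEND_SECONDS. A user-space analogue of the same measurement, using CLOCK_MONOTONIC in place of jiffies (the 5-second budget is the value from the hunk; the usleep() is only there to have something to time):

#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define TEST_SUSPEND_SECONDS 5            /* budget from the hunk above */

static struct timespec test_start;

static void test_phase_start(void)
{
	clock_gettime(CLOCK_MONOTONIC, &test_start);
}

static void test_phase_finish(const char *label)
{
	struct timespec now;
	long msec;

	clock_gettime(CLOCK_MONOTONIC, &now);
	msec = (now.tv_sec - test_start.tv_sec) * 1000
	     + (now.tv_nsec - test_start.tv_nsec) / 1000000;

	printf("PM: %s took %ld.%03ld seconds\n", label, msec / 1000, msec % 1000);
	if (msec > TEST_SUSPEND_SECONDS * 1000)
		fprintf(stderr, "warning: %s overran the %d s budget\n",
			label, TEST_SUSPEND_SECONDS);
}

int main(void)
{
	test_phase_start();
	usleep(250 * 1000);               /* stand-in for "suspend devices" */
	test_phase_finish("suspend devices");
	return 0;
}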
+ */ + +static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) +{ +	static char err_readtime[] __initdata = +		KERN_ERR "PM: can't read %s time, err %d\n"; +	static char err_wakealarm [] __initdata = +		KERN_ERR "PM: can't set %s wakealarm, err %d\n"; +	static char err_suspend[] __initdata = +		KERN_ERR "PM: suspend test failed, error %d\n"; +	static char info_test[] __initdata = +		KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; + +	unsigned long		now; +	struct rtc_wkalrm	alm; +	int			status; + +	/* this may fail if the RTC hasn't been initialized */ +	status = rtc_read_time(rtc, &alm.time); +	if (status < 0) { +		printk(err_readtime, dev_name(&rtc->dev), status); +		return; +	} +	rtc_tm_to_time(&alm.time, &now); + +	memset(&alm, 0, sizeof alm); +	rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); +	alm.enabled = true; + +	status = rtc_set_alarm(rtc, &alm); +	if (status < 0) { +		printk(err_wakealarm, dev_name(&rtc->dev), status); +		return; +	} + +	if (state == PM_SUSPEND_MEM) { +		printk(info_test, pm_states[state]); +		status = pm_suspend(state); +		if (status == -ENODEV) +			state = PM_SUSPEND_STANDBY; +	} +	if (state == PM_SUSPEND_STANDBY) { +		printk(info_test, pm_states[state]); +		status = pm_suspend(state); +	} +	if (status < 0) +		printk(err_suspend, status); + +	/* Some platforms can't detect that the alarm triggered the +	 * wakeup, or (accordingly) disable it after it afterwards. +	 * It's supposed to give oneshot behavior; cope. +	 */ +	alm.enabled = false; +	rtc_set_alarm(rtc, &alm); +} + +static int __init has_wakealarm(struct device *dev, void *name_ptr) +{ +	struct rtc_device *candidate = to_rtc_device(dev); + +	if (!candidate->ops->set_alarm) +		return 0; +	if (!device_may_wakeup(candidate->dev.parent)) +		return 0; + +	*(const char **)name_ptr = dev_name(dev); +	return 1; +} + +/* + * Kernel options like "test_suspend=mem" force suspend/resume sanity tests + * at startup time.  They're normally disabled, for faster boot and because + * we can't know which states really work on this particular system. + */ +static suspend_state_t test_state __initdata = PM_SUSPEND_ON; + +static char warn_bad_state[] __initdata = +	KERN_WARNING "PM: can't test '%s' suspend state\n"; + +static int __init setup_test_suspend(char *value) +{ +	unsigned i; + +	/* "=mem" ==> "mem" */ +	value++; +	for (i = 0; i < PM_SUSPEND_MAX; i++) { +		if (!pm_states[i]) +			continue; +		if (strcmp(pm_states[i], value) != 0) +			continue; +		test_state = (__force suspend_state_t) i; +		return 0; +	} +	printk(warn_bad_state, value); +	return 0; +} +__setup("test_suspend", setup_test_suspend); + +static int __init test_suspend(void) +{ +	static char		warn_no_rtc[] __initdata = +		KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; + +	char			*pony = NULL; +	struct rtc_device	*rtc = NULL; + +	/* PM is initialized by now; is that state testable? */ +	if (test_state == PM_SUSPEND_ON) +		goto done; +	if (!valid_state(test_state)) { +		printk(warn_bad_state, pm_states[test_state]); +		goto done; +	} + +	/* RTCs have initialized by now too ... can we use one? 
*/ +	class_find_device(rtc_class, NULL, &pony, has_wakealarm); +	if (pony) +		rtc = rtc_class_open(pony); +	if (!rtc) { +		printk(warn_no_rtc); +		goto done; +	} + +	/* go for it */ +	test_wakealarm(rtc, test_state); +	rtc_class_close(rtc); +done: +	return 0; +} +late_initcall(test_suspend); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 78c35047586..6a07f4dbf2f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -55,14 +55,6 @@  #include "power.h" -/* - * Preferred image size in bytes (tunable via /sys/power/image_size). - * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N bytes, but if that is impossible, it will - * try to create the smallest image possible. - */ -unsigned long image_size = 500 * 1024 * 1024; -  int in_suspend __nosavedata = 0;  /** @@ -194,193 +186,3 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,  			centisecs / 100, centisecs % 100,  			kps / 1000, (kps % 1000) / 10);  } - -/** - *	swsusp_shrink_memory -  Try to free as much memory as needed - * - *	... but do not OOM-kill anyone - * - *	Notice: all userland should be stopped before it is called, or - *	livelock is possible. - */ - -#define SHRINK_BITE	10000 -static inline unsigned long __shrink_memory(long tmp) -{ -	if (tmp > SHRINK_BITE) -		tmp = SHRINK_BITE; -	return shrink_all_memory(tmp); -} - -int swsusp_shrink_memory(void) -{ -	long tmp; -	struct zone *zone; -	unsigned long pages = 0; -	unsigned int i = 0; -	char *p = "-\\|/"; -	struct timeval start, stop; - -	printk(KERN_INFO "PM: Shrinking memory...  "); -	do_gettimeofday(&start); -	do { -		long size, highmem_size; - -		highmem_size = count_highmem_pages(); -		size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; -		tmp = size; -		size += highmem_size; -		for_each_populated_zone(zone) { -			tmp += snapshot_additional_pages(zone); -			if (is_highmem(zone)) { -				highmem_size -= -					zone_page_state(zone, NR_FREE_PAGES); -			} else { -				tmp -= zone_page_state(zone, NR_FREE_PAGES); -				tmp += zone->lowmem_reserve[ZONE_NORMAL]; -			} -		} - -		if (highmem_size < 0) -			highmem_size = 0; - -		tmp += highmem_size; -		if (tmp > 0) { -			tmp = __shrink_memory(tmp); -			if (!tmp) -				return -ENOMEM; -			pages += tmp; -		} else if (size > image_size / PAGE_SIZE) { -			tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); -			pages += tmp; -		} -		printk("\b%c", p[i++%4]); -	} while (tmp > 0); -	do_gettimeofday(&stop); -	printk("\bdone (%lu pages freed)\n", pages); -	swsusp_show_speed(&start, &stop, pages, "Freed"); - -	return 0; -} - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * hibernation and to restore the contents of this memory during the subsequent - * resume.  The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { -	unsigned long phys_start; -	unsigned int size; -	void *kaddr; -	void *data; -	struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - *	hibernate_nvs_register - register platform NVS memory region to save - *	@start - physical address of the region - *	@size - size of the region - * - *	The NVS region need not be page-aligned (both ends) and we arrange - *	things so that the data from page-aligned addresses in this region will - *	be copied into separate RAM pages. 
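The hibernate_nvs_register() body that follows splits an arbitrary (start, size) region into chunks that never cross a page boundary: the first chunk is PAGE_SIZE - (start & ~PAGE_MASK) bytes, after which the remaining chunks are whole pages. A standalone sketch of just that arithmetic, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE 4096UL                 /* assumption: 4 KiB pages */
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long start = 0x1f80;    /* deliberately not page aligned */
	unsigned long size  = 3 * PAGE_SIZE + 100;

	while (size > 0) {
		/* bytes left in the page that 'start' points into */
		unsigned long nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
		unsigned long chunk = (size < nr_bytes) ? size : nr_bytes;

		printf("chunk: phys_start=0x%lx size=%lu\n", start, chunk);
		start += chunk;
		size  -= chunk;
	}
	return 0;
}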
- */ -int hibernate_nvs_register(unsigned long start, unsigned long size) -{ -	struct nvs_page *entry, *next; - -	while (size > 0) { -		unsigned int nr_bytes; - -		entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); -		if (!entry) -			goto Error; - -		list_add_tail(&entry->node, &nvs_list); -		entry->phys_start = start; -		nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); -		entry->size = (size < nr_bytes) ? size : nr_bytes; - -		start += entry->size; -		size -= entry->size; -	} -	return 0; - - Error: -	list_for_each_entry_safe(entry, next, &nvs_list, node) { -		list_del(&entry->node); -		kfree(entry); -	} -	return -ENOMEM; -} - -/** - *	hibernate_nvs_free - free data pages allocated for saving NVS regions - */ -void hibernate_nvs_free(void) -{ -	struct nvs_page *entry; - -	list_for_each_entry(entry, &nvs_list, node) -		if (entry->data) { -			free_page((unsigned long)entry->data); -			entry->data = NULL; -			if (entry->kaddr) { -				iounmap(entry->kaddr); -				entry->kaddr = NULL; -			} -		} -} - -/** - *	hibernate_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int hibernate_nvs_alloc(void) -{ -	struct nvs_page *entry; - -	list_for_each_entry(entry, &nvs_list, node) { -		entry->data = (void *)__get_free_page(GFP_KERNEL); -		if (!entry->data) { -			hibernate_nvs_free(); -			return -ENOMEM; -		} -	} -	return 0; -} - -/** - *	hibernate_nvs_save - save NVS memory regions - */ -void hibernate_nvs_save(void) -{ -	struct nvs_page *entry; - -	printk(KERN_INFO "PM: Saving platform NVS memory\n"); - -	list_for_each_entry(entry, &nvs_list, node) -		if (entry->data) { -			entry->kaddr = ioremap(entry->phys_start, entry->size); -			memcpy(entry->data, entry->kaddr, entry->size); -		} -} - -/** - *	hibernate_nvs_restore - restore NVS memory regions - * - *	This function is going to be called with interrupts disabled, so it - *	cannot iounmap the virtual addresses used to access the NVS region. - */ -void hibernate_nvs_restore(void) -{ -	struct nvs_page *entry; - -	printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - -	list_for_each_entry(entry, &nvs_list, node) -		if (entry->data) -			memcpy(entry->kaddr, entry->data, entry->size); -} diff --git a/kernel/profile.c b/kernel/profile.c index 7724e0409ba..28cf26ad2d2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -111,12 +111,6 @@ int __ref profile_init(void)  	/* only text is profiled */  	prof_len = (_etext - _stext) >> prof_shift;  	buffer_bytes = prof_len*sizeof(atomic_t); -	if (!slab_is_available()) { -		prof_buffer = alloc_bootmem(buffer_bytes); -		alloc_bootmem_cpumask_var(&prof_cpu_mask); -		cpumask_copy(prof_cpu_mask, cpu_possible_mask); -		return 0; -	}  	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))  		return -ENOMEM; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0692ab5a0d6..f6d8b8cb5e3 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -25,16 +25,6 @@  /* - * Initialize a new task whose father had been ptraced. - * - * Called from copy_process(). - */ -void ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ -	arch_ptrace_fork(child, clone_flags); -} - -/*   * ptrace a task: make the debugger its new parent and   * move it to the ptrace list.   * @@ -185,10 +175,11 @@ int ptrace_attach(struct task_struct *task)  	if (same_thread_group(task, current))  		goto out; -	/* Protect exec's credential calculations against our interference; -	 * SUID, SGID and LSM creds get determined differently under ptrace. 
+	/* Protect the target's credential calculations against our +	 * interference; SUID, SGID and LSM creds get determined differently +	 * under ptrace.  	 */ -	retval = mutex_lock_interruptible(&task->cred_exec_mutex); +	retval = mutex_lock_interruptible(&task->cred_guard_mutex);  	if (retval  < 0)  		goto out; @@ -232,7 +223,7 @@ repeat:  bad:  	write_unlock_irqrestore(&tasklist_lock, flags);  	task_unlock(task); -	mutex_unlock(&task->cred_exec_mutex); +	mutex_unlock(&task->cred_guard_mutex);  out:  	return retval;  } @@ -304,6 +295,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)  	if (child->ptrace) {  		child->exit_code = data;  		dead = __ptrace_detach(current, child); +		if (!child->exit_state) +			wake_up_process(child);  	}  	write_unlock_irq(&tasklist_lock); diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index ce97a4df64d..beb0e659adc 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg)  		rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;  		spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); -		ret = 0; +		ret = 0; /* unused */  		__wait_event_interruptible(rcu_ctrlblk.sched_wq,  			rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,  			ret); -		/* -		 * Signals would prevent us from sleeping, and we cannot -		 * do much with them in any case.  So flush them. -		 */ -		if (ret) -			flush_signals(current);  		couldsleepnext = 0;  	} while (!kthread_should_stop()); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d2a372fb0b9..0dccfbba6d2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)  	check_cpu_stall(rsp, rdp);  	/* Is the RCU core waiting for a quiescent state from this CPU? */ -	if (rdp->qs_pending) +	if (rdp->qs_pending) { +		rdp->n_rp_qs_pending++;  		return 1; +	}  	/* Does this CPU have callbacks ready to invoke? */ -	if (cpu_has_callbacks_ready_to_invoke(rdp)) +	if (cpu_has_callbacks_ready_to_invoke(rdp)) { +		rdp->n_rp_cb_ready++;  		return 1; +	}  	/* Has RCU gone idle with this CPU needing another grace period? */ -	if (cpu_needs_another_gp(rsp, rdp)) +	if (cpu_needs_another_gp(rsp, rdp)) { +		rdp->n_rp_cpu_needs_gp++;  		return 1; +	}  	/* Has another RCU grace period completed?  */ -	if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ +	if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ +		rdp->n_rp_gp_completed++;  		return 1; +	}  	/* Has a new RCU grace period started? */ -	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ +	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ +		rdp->n_rp_gp_started++;  		return 1; +	}  	/* Has an RCU GP gone long enough to send resched IPIs &c? 
*/  	if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && -	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) +	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { +		rdp->n_rp_need_fqs++;  		return 1; +	}  	/* nothing to do */ +	rdp->n_rp_need_nothing++;  	return 0;  } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4b1875ba940..fe1dcdbf1ca 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = {  	.release = single_release,  }; -static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; +static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) +{ +	seq_printf(m, "%3d%cnp=%ld " +		   "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", +		   rdp->cpu, +		   cpu_is_offline(rdp->cpu) ? '!' : ' ', +		   rdp->n_rcu_pending, +		   rdp->n_rp_qs_pending, +		   rdp->n_rp_cb_ready, +		   rdp->n_rp_cpu_needs_gp, +		   rdp->n_rp_gp_completed, +		   rdp->n_rp_gp_started, +		   rdp->n_rp_need_fqs, +		   rdp->n_rp_need_nothing); +} + +static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +{ +	int cpu; +	struct rcu_data *rdp; + +	for_each_possible_cpu(cpu) { +		rdp = rsp->rda[cpu]; +		if (rdp->beenonline) +			print_one_rcu_pending(m, rdp); +	} +} + +static int show_rcu_pending(struct seq_file *m, void *unused) +{ +	seq_puts(m, "rcu:\n"); +	print_rcu_pendings(m, &rcu_state); +	seq_puts(m, "rcu_bh:\n"); +	print_rcu_pendings(m, &rcu_bh_state); +	return 0; +} + +static int rcu_pending_open(struct inode *inode, struct file *file) +{ +	return single_open(file, show_rcu_pending, NULL); +} + +static struct file_operations rcu_pending_fops = { +	.owner = THIS_MODULE, +	.open = rcu_pending_open, +	.read = seq_read, +	.llseek = seq_lseek, +	.release = single_release, +}; + +static struct dentry *rcudir; +static struct dentry *datadir; +static struct dentry *datadir_csv; +static struct dentry *gpdir; +static struct dentry *hierdir; +static struct dentry *rcu_pendingdir; +  static int __init rcuclassic_trace_init(void)  {  	rcudir = debugfs_create_dir("rcu", NULL); @@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void)  						NULL, &rcuhier_fops);  	if (!hierdir)  		goto free_out; + +	rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, +						NULL, &rcu_pending_fops); +	if (!rcu_pendingdir) +		goto free_out;  	return 0;  free_out:  	if (datadir) @@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void)  	debugfs_remove(datadir_csv);  	debugfs_remove(gpdir);  	debugfs_remove(hierdir); +	debugfs_remove(rcu_pendingdir);  	debugfs_remove(rcudir);  } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb921ff..fcd107a78c5 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,   * assigned pending owner [which might not have taken the   * lock yet]:   */ -static inline int try_to_steal_lock(struct rt_mutex *lock) +static inline int try_to_steal_lock(struct rt_mutex *lock, +				    struct task_struct *task)  {  	struct task_struct *pendowner = rt_mutex_owner(lock);  	struct rt_mutex_waiter *next; @@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)  	if (!rt_mutex_owner_pending(lock))  		return 0; -	if (pendowner == current) +	if (pendowner == task)  		return 1;  	spin_lock_irqsave(&pendowner->pi_lock, flags); -	if (current->prio >= pendowner->prio) { +	if (task->prio >= 
pendowner->prio) {  		spin_unlock_irqrestore(&pendowner->pi_lock, flags);  		return 0;  	} @@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)  	 * We are going to steal the lock and a waiter was  	 * enqueued on the pending owners pi_waiters queue. So  	 * we have to enqueue this waiter into -	 * current->pi_waiters list. This covers the case, -	 * where current is boosted because it holds another +	 * task->pi_waiters list. This covers the case, +	 * where task is boosted because it holds another  	 * lock and gets unboosted because the booster is  	 * interrupted, so we would delay a waiter with higher -	 * priority as current->normal_prio. +	 * priority as task->normal_prio.  	 *  	 * Note: in the rare case of a SCHED_OTHER task changing  	 * its priority and thus stealing the lock, next->task -	 * might be current: +	 * might be task:  	 */ -	if (likely(next->task != current)) { -		spin_lock_irqsave(¤t->pi_lock, flags); -		plist_add(&next->pi_list_entry, ¤t->pi_waiters); -		__rt_mutex_adjust_prio(current); -		spin_unlock_irqrestore(¤t->pi_lock, flags); +	if (likely(next->task != task)) { +		spin_lock_irqsave(&task->pi_lock, flags); +		plist_add(&next->pi_list_entry, &task->pi_waiters); +		__rt_mutex_adjust_prio(task); +		spin_unlock_irqrestore(&task->pi_lock, flags);  	}  	return 1;  } @@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)  	 */  	mark_rt_mutex_waiters(lock); -	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) +	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))  		return 0;  	/* We got the lock. */ @@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)   */  static int task_blocks_on_rt_mutex(struct rt_mutex *lock,  				   struct rt_mutex_waiter *waiter, +				   struct task_struct *task,  				   int detect_deadlock)  {  	struct task_struct *owner = rt_mutex_owner(lock); @@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,  	unsigned long flags;  	int chain_walk = 0, res; -	spin_lock_irqsave(¤t->pi_lock, flags); -	__rt_mutex_adjust_prio(current); -	waiter->task = current; +	spin_lock_irqsave(&task->pi_lock, flags); +	__rt_mutex_adjust_prio(task); +	waiter->task = task;  	waiter->lock = lock; -	plist_node_init(&waiter->list_entry, current->prio); -	plist_node_init(&waiter->pi_list_entry, current->prio); +	plist_node_init(&waiter->list_entry, task->prio); +	plist_node_init(&waiter->pi_list_entry, task->prio);  	/* Get the top priority waiter on the lock */  	if (rt_mutex_has_waiters(lock))  		top_waiter = rt_mutex_top_waiter(lock);  	plist_add(&waiter->list_entry, &lock->wait_list); -	current->pi_blocked_on = waiter; +	task->pi_blocked_on = waiter; -	spin_unlock_irqrestore(¤t->pi_lock, flags); +	spin_unlock_irqrestore(&task->pi_lock, flags);  	if (waiter == rt_mutex_top_waiter(lock)) {  		spin_lock_irqsave(&owner->pi_lock, flags); @@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,  	spin_unlock(&lock->wait_lock);  	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, -					 current); +					 task);  	spin_lock(&lock->wait_lock); @@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)  	rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);  } -/* - * Slow path lock function: +/** + * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop + * @lock:		 the rt_mutex to take + * @state:		 the state the task should block in (TASK_INTERRUPTIBLE + * 			 or TASK_UNINTERRUPTIBLE) + * @timeout:		 the 
pre-initialized and started timer, or NULL for none + * @waiter:		 the pre-initialized rt_mutex_waiter + * @detect_deadlock:	 passed to task_blocks_on_rt_mutex + * + * lock->wait_lock must be held by the caller.   */  static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, -		  struct hrtimer_sleeper *timeout, -		  int detect_deadlock) +__rt_mutex_slowlock(struct rt_mutex *lock, int state, +		    struct hrtimer_sleeper *timeout, +		    struct rt_mutex_waiter *waiter, +		    int detect_deadlock)  { -	struct rt_mutex_waiter waiter;  	int ret = 0; -	debug_rt_mutex_init_waiter(&waiter); -	waiter.task = NULL; - -	spin_lock(&lock->wait_lock); - -	/* Try to acquire the lock again: */ -	if (try_to_take_rt_mutex(lock)) { -		spin_unlock(&lock->wait_lock); -		return 0; -	} - -	set_current_state(state); - -	/* Setup the timer, when timeout != NULL */ -	if (unlikely(timeout)) { -		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); -		if (!hrtimer_active(&timeout->timer)) -			timeout->task = NULL; -	} -  	for (;;) {  		/* Try to acquire the lock: */  		if (try_to_take_rt_mutex(lock)) @@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,  		}  		/* -		 * waiter.task is NULL the first time we come here and +		 * waiter->task is NULL the first time we come here and  		 * when we have been woken up by the previous owner  		 * but the lock got stolen by a higher prio task.  		 */ -		if (!waiter.task) { -			ret = task_blocks_on_rt_mutex(lock, &waiter, +		if (!waiter->task) { +			ret = task_blocks_on_rt_mutex(lock, waiter, current,  						      detect_deadlock);  			/*  			 * If we got woken up by the owner then start loop  			 * all over without going into schedule to try  			 * to get the lock now:  			 */ -			if (unlikely(!waiter.task)) { +			if (unlikely(!waiter->task)) {  				/*  				 * Reset the return value. 
We might  				 * have returned with -EDEADLK and the @@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,  		spin_unlock(&lock->wait_lock); -		debug_rt_mutex_print_deadlock(&waiter); +		debug_rt_mutex_print_deadlock(waiter); -		if (waiter.task) +		if (waiter->task)  			schedule_rt_mutex(lock);  		spin_lock(&lock->wait_lock);  		set_current_state(state);  	} +	return ret; +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, +		  struct hrtimer_sleeper *timeout, +		  int detect_deadlock) +{ +	struct rt_mutex_waiter waiter; +	int ret = 0; + +	debug_rt_mutex_init_waiter(&waiter); +	waiter.task = NULL; + +	spin_lock(&lock->wait_lock); + +	/* Try to acquire the lock again: */ +	if (try_to_take_rt_mutex(lock)) { +		spin_unlock(&lock->wait_lock); +		return 0; +	} + +	set_current_state(state); + +	/* Setup the timer, when timeout != NULL */ +	if (unlikely(timeout)) { +		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); +		if (!hrtimer_active(&timeout->timer)) +			timeout->task = NULL; +	} + +	ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, +				  detect_deadlock); +  	set_current_state(TASK_RUNNING);  	if (unlikely(waiter.task)) @@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);  /** - * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible - *				       the timeout structure is provided - *				       by the caller + * rt_mutex_timed_lock - lock a rt_mutex interruptible + *			the timeout structure is provided + *			by the caller   *   * @lock: 		the rt_mutex to be locked   * @timeout:		timeout structure or NULL (no timeout) @@ -875,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);   * Returns:   *  0 		on success   * -EINTR 	when interrupted by a signal - * -ETIMEOUT	when the timeout expired + * -ETIMEDOUT	when the timeout expired   * -EDEADLK	when the lock would deadlock (when deadlock detection is on)   */  int @@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)  }  EXPORT_SYMBOL_GPL(rt_mutex_unlock); -/*** +/**   * rt_mutex_destroy - mark a mutex unusable   * @lock: the mutex to be destroyed   * @@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,  }  /** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock:		the rt_mutex to take + * @waiter:		the pre-initialized rt_mutex_waiter + * @task:		the task to prepare + * @detect_deadlock:	perform deadlock detection (1) or not (0) + * + * Returns: + *  0 - task blocked on lock + *  1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, +			      struct rt_mutex_waiter *waiter, +			      struct task_struct *task, int detect_deadlock) +{ +	int ret; + +	spin_lock(&lock->wait_lock); + +	mark_rt_mutex_waiters(lock); + +	if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { +		/* We got the lock for task. */ +		debug_rt_mutex_lock(lock); + +		rt_mutex_set_owner(lock, task, 0); + +		rt_mutex_deadlock_account_lock(lock, task); +		return 1; +	} + +	ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + + +	if (ret && !waiter->task) { +		/* +		 * Reset the return value. We might have +		 * returned with -EDEADLK and the owner +		 * released the lock while we were walking the +		 * pi chain.  Let the waiter sort it out. 
+		 */ +		ret = 0; +	} +	spin_unlock(&lock->wait_lock); + +	debug_rt_mutex_print_deadlock(waiter); + +	return ret; +} + +/**   * rt_mutex_next_owner - return the next owner of the lock   *   * @lock: the rt lock query @@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)  	return rt_mutex_top_waiter(lock)->task;  } + +/** + * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * @lock:		the rt_mutex we were woken on + * @to:			the timeout, null if none. hrtimer should already have + * 			been started. + * @waiter:		the pre-initialized rt_mutex_waiter + * @detect_deadlock:	perform deadlock detection (1) or not (0) + * + * Complete the lock acquisition started our behalf by another thread. + * + * Returns: + *  0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * + * Special API call for PI-futex requeue support + */ +int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, +			       struct hrtimer_sleeper *to, +			       struct rt_mutex_waiter *waiter, +			       int detect_deadlock) +{ +	int ret; + +	spin_lock(&lock->wait_lock); + +	set_current_state(TASK_INTERRUPTIBLE); + +	ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, +				  detect_deadlock); + +	set_current_state(TASK_RUNNING); + +	if (unlikely(waiter->task)) +		remove_waiter(lock, waiter); + +	/* +	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might +	 * have to fix that up. +	 */ +	fixup_rt_mutex_waiters(lock); + +	spin_unlock(&lock->wait_lock); + +	/* +	 * Readjust priority, when we did not get the lock. We might have been +	 * the pending owner and boosted. Since we did not take the lock, the +	 * PI boost has to go. +	 */ +	if (unlikely(ret)) +		rt_mutex_adjust_prio(current); + +	return ret; +} diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e124bf5800e..97a2f81866a 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,  				       struct task_struct *proxy_owner);  extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,  				  struct task_struct *proxy_owner); +extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, +				     struct rt_mutex_waiter *waiter, +				     struct task_struct *task, +				     int detect_deadlock); +extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, +				      struct hrtimer_sleeper *to, +				      struct rt_mutex_waiter *waiter, +				      int detect_deadlock);  #ifdef CONFIG_DEBUG_RT_MUTEXES  # include "rtmutex-debug.h" diff --git a/kernel/sched.c b/kernel/sched.c index 26efa475bdc..8ec9d13140b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,6 +39,7 @@  #include <linux/completion.h>  #include <linux/kernel_stat.h>  #include <linux/debug_locks.h> +#include <linux/perf_counter.h>  #include <linux/security.h>  #include <linux/notifier.h>  #include <linux/profile.h> @@ -68,17 +69,18 @@  #include <linux/pagemap.h>  #include <linux/hrtimer.h>  #include <linux/tick.h> -#include <linux/bootmem.h>  #include <linux/debugfs.h>  #include <linux/ctype.h>  #include <linux/ftrace.h> -#include <trace/sched.h>  #include <asm/tlb.h>  #include <asm/irq_regs.h>  #include "sched_cpupri.h" +#define CREATE_TRACE_POINTS +#include <trace/events/sched.h> +  /*   * Convert user-nice values [ -20 ... 0 ... 
19 ]   * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -118,12 +120,6 @@   */  #define RUNTIME_INF	((u64)~0ULL) -DEFINE_TRACE(sched_wait_task); -DEFINE_TRACE(sched_wakeup); -DEFINE_TRACE(sched_wakeup_new); -DEFINE_TRACE(sched_switch); -DEFINE_TRACE(sched_migrate_task); -  #ifdef CONFIG_SMP  static void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -584,6 +580,7 @@ struct rq {  	struct load_weight load;  	unsigned long nr_load_updates;  	u64 nr_switches; +	u64 nr_migrations_in;  	struct cfs_rq cfs;  	struct rt_rq rt; @@ -630,6 +627,10 @@ struct rq {  	struct list_head migration_queue;  #endif +	/* calc_load related fields */ +	unsigned long calc_load_update; +	long calc_load_active; +  #ifdef CONFIG_SCHED_HRTICK  #ifdef CONFIG_SMP  	int hrtick_csd_pending; @@ -692,7 +693,7 @@ static inline int cpu_of(struct rq *rq)  #define task_rq(p)		cpu_rq(task_cpu(p))  #define cpu_curr(cpu)		(cpu_rq(cpu)->curr) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq)  {  	rq->clock = sched_clock_cpu(cpu_of(rq));  } @@ -1728,6 +1729,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)  }  #endif +static void calc_load_account_active(struct rq *this_rq); +  #include "sched_stats.h"  #include "sched_idletask.c"  #include "sched_fair.c" @@ -1958,7 +1961,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  	clock_offset = old_rq->clock - new_rq->clock; -	trace_sched_migrate_task(p, task_cpu(p), new_cpu); +	trace_sched_migrate_task(p, new_cpu);  #ifdef CONFIG_SCHEDSTATS  	if (p->se.wait_start) @@ -1967,12 +1970,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  		p->se.sleep_start -= clock_offset;  	if (p->se.block_start)  		p->se.block_start -= clock_offset; +#endif  	if (old_cpu != new_cpu) { -		schedstat_inc(p, se.nr_migrations); +		p->se.nr_migrations++; +		new_rq->nr_migrations_in++; +#ifdef CONFIG_SCHEDSTATS  		if (task_hot(p, old_rq->clock, NULL))  			schedstat_inc(p, se.nr_forced2_migrations); -	}  #endif +		perf_counter_task_migration(p, new_cpu); +	}  	p->se.vruntime -= old_cfsrq->min_vruntime -  					 new_cfsrq->min_vruntime; @@ -2015,6 +2022,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)  }  /* + * wait_task_context_switch -	wait for a thread to complete at least one + *				context switch. + * + * @p must not be current. + */ +void wait_task_context_switch(struct task_struct *p) +{ +	unsigned long nvcsw, nivcsw, flags; +	int running; +	struct rq *rq; + +	nvcsw	= p->nvcsw; +	nivcsw	= p->nivcsw; +	for (;;) { +		/* +		 * The runqueue is assigned before the actual context +		 * switch. We need to take the runqueue lock. +		 * +		 * We could check initially without the lock but it is +		 * very likely that we need to take the lock in every +		 * iteration. +		 */ +		rq = task_rq_lock(p, &flags); +		running = task_running(rq, p); +		task_rq_unlock(rq, &flags); + +		if (likely(!running)) +			break; +		/* +		 * The switch count is incremented before the actual +		 * context switch. We thus wait for two switches to be +		 * sure at least one completed. +		 */ +		if ((p->nvcsw - nvcsw) > 1) +			break; +		if ((p->nivcsw - nivcsw) > 1) +			break; + +		cpu_relax(); +	} +} + +/*   * wait_task_inactive - wait for a thread to unschedule.   
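wait_task_context_switch() above watches p->nvcsw/p->nivcsw and only returns once the counters show that at least one full context switch has completed. The same counters are exported to user space through /proc/<pid>/status; a rough analogue that samples them for the current process (the procfs field names are the ones exposed there, not the in-kernel names):

#include <stdio.h>
#include <sched.h>

/* Sum voluntary_ctxt_switches + nonvoluntary_ctxt_switches for ourselves. */
static long ctxt_switches(void)
{
	FILE *f = fopen("/proc/self/status", "r");
	char line[256];
	long total = 0;

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		long v;
		if (sscanf(line, "voluntary_ctxt_switches: %ld", &v) == 1 ||
		    sscanf(line, "nonvoluntary_ctxt_switches: %ld", &v) == 1)
			total += v;
	}
	fclose(f);
	return total;
}

int main(void)
{
	long before = ctxt_switches();

	sched_yield();                  /* give the scheduler a reason to switch */
	sched_yield();

	printf("context switches: %ld -> %ld\n", before, ctxt_switches());
	return 0;
}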
*   * If @match_state is nonzero, it's the @p->state value just checked and @@ -2142,6 +2192,7 @@ void kick_process(struct task_struct *p)  		smp_send_reschedule(cpu);  	preempt_enable();  } +EXPORT_SYMBOL_GPL(kick_process);  /*   * Return a low guess at the load of a migration-source cpu weighted @@ -2324,6 +2375,27 @@ static int sched_balance_self(int cpu, int flag)  #endif /* CONFIG_SMP */ +/** + * task_oncpu_function_call - call a function on the cpu on which a task runs + * @p:		the task to evaluate + * @func:	the function to be called + * @info:	the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + */ +void task_oncpu_function_call(struct task_struct *p, +			      void (*func) (void *info), void *info) +{ +	int cpu; + +	preempt_disable(); +	cpu = task_cpu(p); +	if (task_curr(p)) +		smp_call_function_single(cpu, func, info, 1); +	preempt_enable(); +} +  /***   * try_to_wake_up - wake up a thread   * @p: the to-be-woken-up thread @@ -2458,6 +2530,17 @@ out:  	return success;  } +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes.  Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */  int wake_up_process(struct task_struct *p)  {  	return try_to_wake_up(p, TASK_ALL, 0); @@ -2480,6 +2563,7 @@ static void __sched_fork(struct task_struct *p)  	p->se.exec_start		= 0;  	p->se.sum_exec_runtime		= 0;  	p->se.prev_sum_exec_runtime	= 0; +	p->se.nr_migrations		= 0;  	p->se.last_wakeup		= 0;  	p->se.avg_overlap		= 0;  	p->se.start_runtime		= 0; @@ -2710,6 +2794,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	 */  	prev_state = prev->state;  	finish_arch_switch(prev); +	perf_counter_task_sched_in(current, cpu_of(rq));  	finish_lock_switch(rq, prev);  #ifdef CONFIG_SMP  	if (post_schedule) @@ -2766,7 +2851,7 @@ context_switch(struct rq *rq, struct task_struct *prev,  	 * combine the page table reload and the switch backend into  	 * one hypercall.  	 */ -	arch_enter_lazy_cpu_mode(); +	arch_start_context_switch(prev);  	if (unlikely(!mm)) {  		next->active_mm = oldmm; @@ -2856,19 +2941,81 @@ unsigned long nr_iowait(void)  	return sum;  } -unsigned long nr_active(void) +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +/** + * get_avenrun - get the load average array + * @loads:	pointer to dest load array + * @offset:	offset to add + * @shift:	shift count to shift the result left + * + * These values are estimates at best, so no need for locking. 
+ */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift)  { -	unsigned long i, running = 0, uninterruptible = 0; +	loads[0] = (avenrun[0] + offset) << shift; +	loads[1] = (avenrun[1] + offset) << shift; +	loads[2] = (avenrun[2] + offset) << shift; +} -	for_each_online_cpu(i) { -		running += cpu_rq(i)->nr_running; -		uninterruptible += cpu_rq(i)->nr_uninterruptible; -	} +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ +	load *= exp; +	load += active * (FIXED_1 - exp); +	return load >> FSHIFT; +} -	if (unlikely((long)uninterruptible < 0)) -		uninterruptible = 0; +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ +	unsigned long upd = calc_load_update + 10; +	long active; -	return running + uninterruptible; +	if (time_before(jiffies, upd)) +		return; + +	active = atomic_long_read(&calc_load_tasks); +	active = active > 0 ? active * FIXED_1 : 0; + +	avenrun[0] = calc_load(avenrun[0], EXP_1, active); +	avenrun[1] = calc_load(avenrun[1], EXP_5, active); +	avenrun[2] = calc_load(avenrun[2], EXP_15, active); + +	calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ +	long nr_active, delta; + +	nr_active = this_rq->nr_running; +	nr_active += (long) this_rq->nr_uninterruptible; + +	if (nr_active != this_rq->calc_load_active) { +		delta = nr_active - this_rq->calc_load_active; +		this_rq->calc_load_active = nr_active; +		atomic_long_add(delta, &calc_load_tasks); +	} +} + +/* + * Externally visible per-cpu scheduler statistics: + * cpu_nr_migrations(cpu) - number of migrations into that cpu + */ +u64 cpu_nr_migrations(int cpu) +{ +	return cpu_rq(cpu)->nr_migrations_in;  }  /* @@ -2899,6 +3046,11 @@ static void update_cpu_load(struct rq *this_rq)  			new_load += scale-1;  		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;  	} + +	if (time_after_eq(jiffies, this_rq->calc_load_update)) { +		this_rq->calc_load_update += LOAD_FREQ; +		calc_load_account_active(this_rq); +	}  }  #ifdef CONFIG_SMP @@ -4240,10 +4392,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)  static struct {  	atomic_t load_balancer;  	cpumask_var_t cpu_mask; +	cpumask_var_t ilb_grp_nohz_mask;  } nohz ____cacheline_aligned = {  	.load_balancer = ATOMIC_INIT(-1),  }; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu:	The cpu whose lowest level of sched domain is to + *		be returned. + * @flag:	The flag to check for the lowest sched_domain + *		for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ +	struct sched_domain *sd; + +	for_each_domain(cpu, sd) +		if (sd && (sd->flags & flag)) +			break; + +	return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu:	The cpu whose domains we're iterating over. + * @sd:		variable holding the value of the power_savings_sd + *		for cpu. + * @flag:	The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. 
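calc_load()/calc_global_load() above maintain the 1/5/15-minute load averages as fixed-point exponential moving averages of calc_load_tasks, recomputed once per LOAD_FREQ interval, and get_avenrun() hands the raw fixed-point values out with an offset and shift. A standalone simulation of that arithmetic, using the conventional FSHIFT/EXP_* constants from include/linux/sched.h (those values are assumed here, they are not shown in this hunk):

#include <stdio.h>

/* Fixed-point load-average constants as conventionally defined in
 * include/linux/sched.h (assumed, not part of the hunk above). */
#define FSHIFT   11
#define FIXED_1  (1 << FSHIFT)   /* 1.0 in fixed point */
#define EXP_1    1884            /* 1/exp(5s/1min)  */
#define EXP_5    2014            /* 1/exp(5s/5min)  */
#define EXP_15   2037            /* 1/exp(5s/15min) */

static unsigned long avenrun[3];

/* Same formula as calc_load() in the hunk above. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	/* Pretend 3 tasks stay runnable; one sample every LOAD_FREQ (5 s). */
	unsigned long active = 3 * FIXED_1;
	int i;

	for (i = 1; i <= 180; i++) {    /* 180 samples = 15 minutes */
		avenrun[0] = calc_load(avenrun[0], EXP_1,  active);
		avenrun[1] = calc_load(avenrun[1], EXP_5,  active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
		if (i % 12 == 0)        /* print once per simulated minute */
			printf("t=%3d min  load: %lu.%02lu %lu.%02lu %lu.%02lu\n",
			       i / 12,
			       avenrun[0] >> FSHIFT, ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
			       avenrun[1] >> FSHIFT, ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
			       avenrun[2] >> FSHIFT, ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;
}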
+ */ +#define for_each_flag_domain(cpu, sd, flag) \ +	for (sd = lowest_flag_domain(cpu, flag); \ +		(sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group:	group to be checked for semi-idleness + * + * Returns:	1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ +	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, +					sched_group_cpus(ilb_group)); + +	/* +	 * A sched_group is semi-idle when it has atleast one busy cpu +	 * and atleast one idle cpu. +	 */ +	if (cpumask_empty(nohz.ilb_grp_nohz_mask)) +		return 0; + +	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) +		return 0; + +	return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu:	The cpu which is nominating a new idle_load_balancer. + * + * Returns:	Returns the id of the idle load balancer if it exists, + *		Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ +	struct sched_domain *sd; +	struct sched_group *ilb_group; + +	/* +	 * Have idle load balancer selection from semi-idle packages only +	 * when power-aware load balancing is enabled +	 */ +	if (!(sched_smt_power_savings || sched_mc_power_savings)) +		goto out_done; + +	/* +	 * Optimize for the case when we have no idle CPUs or only one +	 * idle CPU. Don't walk the sched_domain hierarchy in such cases +	 */ +	if (cpumask_weight(nohz.cpu_mask) < 2) +		goto out_done; + +	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { +		ilb_group = sd->groups; + +		do { +			if (is_semi_idle_group(ilb_group)) +				return cpumask_first(nohz.ilb_grp_nohz_mask); + +			ilb_group = ilb_group->next; + +		} while (ilb_group != sd->groups); +	} + +out_done: +	return cpumask_first(nohz.cpu_mask); +} +#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ +	return cpumask_first(nohz.cpu_mask); +} +#endif +  /*   * This routine will try to nominate the ilb (idle load balancing)   * owner among the cpus whose ticks are stopped. ilb owner will do the idle @@ -4298,8 +4566,24 @@ int select_nohz_load_balancer(int stop_tick)  			/* make me the ilb owner */  			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)  				return 1; -		} else if (atomic_read(&nohz.load_balancer) == cpu) +		} else if (atomic_read(&nohz.load_balancer) == cpu) { +			int new_ilb; + +			if (!(sched_smt_power_savings || +						sched_mc_power_savings)) +				return 1; +			/* +			 * Check to see if there is a more power-efficient +			 * ilb. 
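is_semi_idle_group()/find_new_ilb() above pick the idle load balancer from a sched_group that contains both idle (tick-stopped) and busy CPUs, by intersecting the group's CPU mask with nohz.cpu_mask and rejecting groups where the intersection is empty or covers the whole group. The same test on plain 64-bit masks, with the sched_group/cpumask machinery replaced by uint64_t for the sketch:

#include <stdio.h>
#include <stdint.h>

/* A group is "semi-idle" if it has at least one idle (nohz) CPU and at
 * least one busy CPU -- the same test as is_semi_idle_group() above. */
static int is_semi_idle_group(uint64_t group_cpus, uint64_t nohz_cpus)
{
	uint64_t idle_in_group = group_cpus & nohz_cpus;

	if (idle_in_group == 0)            /* nobody idle */
		return 0;
	if (idle_in_group == group_cpus)   /* everybody idle */
		return 0;
	return 1;
}

int main(void)
{
	uint64_t nohz_cpus = 0x0c;         /* CPUs 2,3 have stopped their tick */
	uint64_t groups[]  = { 0x03, 0x0c, 0x06 };  /* {0,1}, {2,3}, {1,2} */
	int i;

	for (i = 0; i < 3; i++)
		printf("group 0x%02llx: %s\n",
		       (unsigned long long)groups[i],
		       is_semi_idle_group(groups[i], nohz_cpus) ?
				"semi-idle (ilb candidate)" : "skip");
	return 0;
}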
+			 */ +			new_ilb = find_new_ilb(cpu); +			if (new_ilb < nr_cpu_ids && new_ilb != cpu) { +				atomic_set(&nohz.load_balancer, -1); +				resched_cpu(new_ilb); +				return 0; +			}  			return 1; +		}  	} else {  		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))  			return 0; @@ -4468,15 +4752,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)  		}  		if (atomic_read(&nohz.load_balancer) == -1) { -			/* -			 * simple selection for now: Nominate the -			 * first cpu in the nohz list to be the next -			 * ilb owner. -			 * -			 * TBD: Traverse the sched domains and nominate -			 * the nearest cpu in the nohz.cpu_mask. -			 */ -			int ilb = cpumask_first(nohz.cpu_mask); +			int ilb = find_new_ilb(cpu);  			if (ilb < nr_cpu_ids)  				resched_cpu(ilb); @@ -4840,6 +5116,8 @@ void scheduler_tick(void)  	curr->sched_class->task_tick(rq, curr, 0);  	spin_unlock(&rq->lock); +	perf_counter_task_tick(curr, cpu); +  #ifdef CONFIG_SMP  	rq->idle_at_tick = idle_cpu(cpu);  	trigger_load_balance(rq, cpu); @@ -5007,13 +5285,15 @@ pick_next_task(struct rq *rq)  /*   * schedule() is the main scheduler function.   */ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void)  {  	struct task_struct *prev, *next;  	unsigned long *switch_count;  	struct rq *rq;  	int cpu; +need_resched: +	preempt_disable();  	cpu = smp_processor_id();  	rq = cpu_rq(cpu);  	rcu_qsctr_inc(cpu); @@ -5053,6 +5333,7 @@ need_resched_nonpreemptible:  	if (likely(prev != next)) {  		sched_info_switch(prev, next); +		perf_counter_task_sched_out(prev, next, cpu);  		rq->nr_switches++;  		rq->curr = next; @@ -5070,15 +5351,9 @@ need_resched_nonpreemptible:  	if (unlikely(reacquire_kernel_lock(current) < 0))  		goto need_resched_nonpreemptible; -} -asmlinkage void __sched schedule(void) -{ -need_resched: -	preempt_disable(); -	__schedule();  	preempt_enable_no_resched(); -	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) +	if (need_resched())  		goto need_resched;  }  EXPORT_SYMBOL(schedule); @@ -5221,7 +5496,7 @@ EXPORT_SYMBOL(default_wake_function);   * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns   * zero in this (rare) case, and we handle it by continuing to scan the queue.   */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,  			int nr_exclusive, int sync, void *key)  {  	wait_queue_t *curr, *next; @@ -5241,6 +5516,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,   * @mode: which threads   * @nr_exclusive: how many wake-one or wake-many threads to wake up   * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up.   */  void __wake_up(wait_queue_head_t *q, unsigned int mode,  			int nr_exclusive, void *key) @@ -5279,6 +5557,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)   * with each other. This can prevent needless bouncing between CPUs.   *   * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up.   */  void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,  			int nr_exclusive, void *key) @@ -5315,6 +5596,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */   * awakened in the same order in which they were queued.   
*   * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up.   */  void complete(struct completion *x)  { @@ -5332,6 +5616,9 @@ EXPORT_SYMBOL(complete);   * @x:  holds the state of this particular completion   *   * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up.   */  void complete_all(struct completion *x)  { @@ -6490,8 +6777,9 @@ void sched_show_task(struct task_struct *p)  #ifdef CONFIG_DEBUG_STACK_USAGE  	free = stack_not_used(p);  #endif -	printk(KERN_CONT "%5lu %5d %6d\n", free, -		task_pid_nr(p), task_pid_nr(p->real_parent)); +	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, +		task_pid_nr(p), task_pid_nr(p->real_parent), +		(unsigned long)task_thread_info(p)->flags);  	show_stack(p, NULL);  } @@ -6970,6 +7258,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)  	}  } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ +	atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +}  #endif /* CONFIG_HOTPLUG_CPU */  #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7204,6 +7500,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  		/* Update our root-domain */  		rq = cpu_rq(cpu);  		spin_lock_irqsave(&rq->lock, flags); +		rq->calc_load_update = calc_load_update; +		rq->calc_load_active = 0;  		if (rq->rd) {  			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7243,7 +7541,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  		cpuset_unlock();  		migrate_nr_uninterruptible(rq);  		BUG_ON(rq->nr_running != 0); - +		calc_global_load_remove(rq);  		/*  		 * No need to migrate the tasks: it was best-effort if  		 * they didn't take sched_hotcpu_mutex. Just wake up @@ -7279,8 +7577,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  	return NOTIFY_OK;  } -/* Register at highest priority so that task migration (migrate_all_tasks) - * happens before everything else. +/* + * Register at high priority so that task migration (migrate_all_tasks) + * happens before everything else.  This has to be lower priority than + * the notifier in the perf_counter subsystem, though.   
*/  static struct notifier_block __cpuinitdata migration_notifier = {  	.notifier_call = migration_call, @@ -7525,24 +7825,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)  static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)  { +	gfp_t gfp = GFP_KERNEL; +  	memset(rd, 0, sizeof(*rd)); -	if (bootmem) { -		alloc_bootmem_cpumask_var(&def_root_domain.span); -		alloc_bootmem_cpumask_var(&def_root_domain.online); -		alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); -		cpupri_init(&rd->cpupri, true); -		return 0; -	} +	if (bootmem) +		gfp = GFP_NOWAIT; -	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) +	if (!alloc_cpumask_var(&rd->span, gfp))  		goto out; -	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) +	if (!alloc_cpumask_var(&rd->online, gfp))  		goto free_span; -	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) +	if (!alloc_cpumask_var(&rd->rto_mask, gfp))  		goto free_online; -	if (cpupri_init(&rd->cpupri, false) != 0) +	if (cpupri_init(&rd->cpupri, bootmem) != 0)  		goto free_rto_mask;  	return 0; @@ -7753,8 +8050,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;  /*   * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. + * + * ( See the the comments in include/linux/sched.h:struct sched_group + *   and struct sched_domain. )   */  struct static_sched_group {  	struct sched_group sg; @@ -7875,7 +8173,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)  			struct sched_domain *sd;  			sd = &per_cpu(phys_domains, j).sd; -			if (j != cpumask_first(sched_group_cpus(sd->groups))) { +			if (j != group_first_cpu(sd->groups)) {  				/*  				 * Only add "power" once for each  				 * physical package. @@ -7953,7 +8251,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)  	WARN_ON(!sd || !sd->groups); -	if (cpu != cpumask_first(sched_group_cpus(sd->groups))) +	if (cpu != group_first_cpu(sd->groups))  		return;  	child = sd->child; @@ -8865,7 +9163,7 @@ void __init sched_init(void)  	 * we use alloc_bootmem().  	 */  	if (alloc_size) { -		ptr = (unsigned long)alloc_bootmem(alloc_size); +		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);  #ifdef CONFIG_FAIR_GROUP_SCHED  		init_task_group.se = (struct sched_entity **)ptr; @@ -8938,6 +9236,8 @@ void __init sched_init(void)  		rq = cpu_rq(i);  		spin_lock_init(&rq->lock);  		rq->nr_running = 0; +		rq->calc_load_active = 0; +		rq->calc_load_update = jiffies + LOAD_FREQ;  		init_cfs_rq(&rq->cfs, rq);  		init_rt_rq(&rq->rt, rq);  #ifdef CONFIG_FAIR_GROUP_SCHED @@ -8958,7 +9258,7 @@ void __init sched_init(void)  		 * 1024) and two child groups A0 and A1 (of weight 1024 each),  		 * then A0's share of the cpu resource is:  		 * -		 * 	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% +		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%  		 *  		 * We achieve this by letting init_task_group's tasks sit  		 * directly in rq->cfs (i.e init_task_group->se[] = NULL). @@ -9045,20 +9345,26 @@ void __init sched_init(void)  	 * when this runqueue becomes "idle".  	 
*/  	init_idle(current, smp_processor_id()); + +	calc_load_update = jiffies + LOAD_FREQ; +  	/*  	 * During early bootup we pretend to be a normal task:  	 */  	current->sched_class = &fair_sched_class;  	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ -	alloc_bootmem_cpumask_var(&nohz_cpu_mask); +	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);  #ifdef CONFIG_SMP  #ifdef CONFIG_NO_HZ -	alloc_bootmem_cpumask_var(&nohz.cpu_mask); +	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); +	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);  #endif -	alloc_bootmem_cpumask_var(&cpu_isolated_map); +	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);  #endif /* SMP */ +	perf_counter_init(); +  	scheduler_running = 1;  } @@ -9800,6 +10106,13 @@ static int sched_rt_global_constraints(void)  	if (sysctl_sched_rt_period <= 0)  		return -EINVAL; +	/* +	 * There's always some RT tasks in the root group +	 * -- migration, kstopmachine etc.. +	 */ +	if (sysctl_sched_rt_runtime == 0) +		return -EBUSY; +  	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);  	for_each_possible_cpu(i) {  		struct rt_rq *rt_rq = &cpu_rq(i)->rt; diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 819f17ac796..e1d16c9a768 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -38,7 +38,8 @@   */  unsigned long long __attribute__((weak)) sched_clock(void)  { -	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); +	return (unsigned long long)(jiffies - INITIAL_JIFFIES) +					* (NSEC_PER_SEC / HZ);  }  static __read_mostly int sched_clock_running; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index cdd3c89574c..7deffc9f0e5 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -154,8 +154,12 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)   */  int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)  { +	gfp_t gfp = GFP_KERNEL;  	int i; +	if (bootmem) +		gfp = GFP_NOWAIT; +  	memset(cp, 0, sizeof(*cp));  	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { @@ -163,9 +167,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)  		spin_lock_init(&vec->lock);  		vec->count = 0; -		if (bootmem) -			alloc_bootmem_cpumask_var(&vec->mask); -		else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) +		if (!zalloc_cpumask_var(&vec->mask, gfp))  			goto cleanup;  	} diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3816f217f11..5f9650e8fe7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)  	find_matching_se(&se, &pse); -	while (se) { -		BUG_ON(!pse); +	BUG_ON(!pse); -		if (wakeup_preempt_entity(se, pse) == 1) { -			resched_task(curr); -			break; -		} - -		se = parent_entity(se); -		pse = parent_entity(pse); -	} +	if (wakeup_preempt_entity(se, pse) == 1) +		resched_task(curr);  }  static struct task_struct *pick_next_task_fair(struct rq *rq) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 8a21a2e28c1..499672c10cb 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy  static struct task_struct *pick_next_task_idle(struct rq *rq)  {  	schedstat_inc(rq, sched_goidle); - +	/* adjust the active tasks as we might go into a long sleep */ +	calc_load_account_active(rq);  	return rq->idle;  } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f2c66f8f971..9bf0d2a7304 100644 --- a/kernel/sched_rt.c +++ 
b/kernel/sched_rt.c @@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void)  	unsigned int i;  	for_each_possible_cpu(i) -		alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), +		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),  					GFP_KERNEL, cpu_to_node(i));  }  #endif /* CONFIG_SMP */ diff --git a/kernel/signal.c b/kernel/signal.c index d8034737db4..809a228019a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,7 +27,7 @@  #include <linux/freezer.h>  #include <linux/pid_namespace.h>  #include <linux/nsproxy.h> -#include <trace/sched.h> +#include <trace/events/sched.h>  #include <asm/param.h>  #include <asm/uaccess.h> @@ -41,8 +41,6 @@  static struct kmem_cache *sigqueue_cachep; -DEFINE_TRACE(sched_signal_send); -  static void __user *sig_handler(struct task_struct *t, int sig)  {  	return t->sighand->action[sig - 1].sa.sa_handler; @@ -249,14 +247,19 @@ void flush_sigqueue(struct sigpending *queue)  /*   * Flush all pending signals for a task.   */ +void __flush_signals(struct task_struct *t) +{ +	clear_tsk_thread_flag(t, TIF_SIGPENDING); +	flush_sigqueue(&t->pending); +	flush_sigqueue(&t->signal->shared_pending); +} +  void flush_signals(struct task_struct *t)  {  	unsigned long flags;  	spin_lock_irqsave(&t->sighand->siglock, flags); -	clear_tsk_thread_flag(t, TIF_SIGPENDING); -	flush_sigqueue(&t->pending); -	flush_sigqueue(&t->signal->shared_pending); +	__flush_signals(t);  	spin_unlock_irqrestore(&t->sighand->siglock, flags);  } @@ -2278,24 +2281,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)  	return kill_something_info(sig, &info, pid);  } -static int do_tkill(pid_t tgid, pid_t pid, int sig) +static int +do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)  { -	int error; -	struct siginfo info;  	struct task_struct *p;  	unsigned long flags; - -	error = -ESRCH; -	info.si_signo = sig; -	info.si_errno = 0; -	info.si_code = SI_TKILL; -	info.si_pid = task_tgid_vnr(current); -	info.si_uid = current_uid(); +	int error = -ESRCH;  	rcu_read_lock();  	p = find_task_by_vpid(pid);  	if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { -		error = check_kill_permission(sig, &info, p); +		error = check_kill_permission(sig, info, p);  		/*  		 * The null signal is a permissions and process existence  		 * probe.  No signal is actually delivered. @@ -2305,7 +2301,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)  		 * signal is private anyway.  		 */  		if (!error && sig && lock_task_sighand(p, &flags)) { -			error = specific_send_sig_info(sig, &info, p); +			error = specific_send_sig_info(sig, info, p);  			unlock_task_sighand(p, &flags);  		}  	} @@ -2314,6 +2310,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)  	return error;  } +static int do_tkill(pid_t tgid, pid_t pid, int sig) +{ +	struct siginfo info; + +	info.si_signo = sig; +	info.si_errno = 0; +	info.si_code = SI_TKILL; +	info.si_pid = task_tgid_vnr(current); +	info.si_uid = current_uid(); + +	return do_send_specific(tgid, pid, sig, &info); +} +  /**   *  sys_tgkill - send signal to one specific thread   *  @tgid: the thread group ID of the thread @@ -2363,6 +2372,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,  	return kill_proc_info(sig, &info, pid);  } +long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +{ +	/* This is only valid for single tasks */ +	if (pid <= 0 || tgid <= 0) +		return -EINVAL; + +	/* Not even root can pretend to send signals from the kernel. +	   Nor can they impersonate a kill(), which adds source info.  
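do_rt_tgsigqueueinfo() above implements the new rt_tgsigqueueinfo system call (its SYSCALL_DEFINE4 wrapper follows below): it queues a full siginfo payload to one specific thread of a thread group, and rejects any si_code >= 0 so userspace cannot impersonate kernel-generated signals. A minimal userspace sketch, not part of this patch; it assumes __NR_rt_tgsigqueueinfo is wired up for the target architecture and that tgid/tid name an existing thread:

#include <signal.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* queue SIGUSR1 with an integer payload to thread 'tid' of process 'tgid' */
static int queue_to_thread(pid_t tgid, pid_t tid, int val)
{
	siginfo_t si;

	memset(&si, 0, sizeof(si));
	si.si_signo = SIGUSR1;
	si.si_code  = SI_QUEUE;		/* must be negative, see the check below */
	si.si_pid   = getpid();
	si.si_uid   = getuid();
	si.si_value.sival_int = val;

	return syscall(__NR_rt_tgsigqueueinfo, tgid, tid, SIGUSR1, &si);
}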
*/ +	if (info->si_code >= 0) +		return -EPERM; +	info->si_signo = sig; + +	return do_send_specific(tgid, pid, sig, info); +} + +SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, +		siginfo_t __user *, uinfo) +{ +	siginfo_t info; + +	if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) +		return -EFAULT; + +	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} +  int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)  {  	struct task_struct *t = current; diff --git a/kernel/slow-work.c b/kernel/slow-work.c index b28d19135f4..521ed2004d6 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -372,8 +372,8 @@ static int slow_work_thread(void *_data)  		vsmax *= atomic_read(&slow_work_thread_count);  		vsmax /= 100; -		prepare_to_wait(&slow_work_thread_wq, &wait, -				TASK_INTERRUPTIBLE); +		prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, +					  TASK_INTERRUPTIBLE);  		if (!freezing(current) &&  		    !slow_work_threads_should_exit &&  		    !slow_work_available(vsmax) && diff --git a/kernel/smp.c b/kernel/smp.c index 858baac568e..ad63d850120 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_UP_PREPARE_FROZEN: -		if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, +		if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,  				cpu_to_node(cpu)))  			return NOTIFY_BAD;  		break; diff --git a/kernel/softirq.c b/kernel/softirq.c index b525dd34851..258885a543d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,7 +24,9 @@  #include <linux/ftrace.h>  #include <linux/smp.h>  #include <linux/tick.h> -#include <trace/irq.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/irq.h>  #include <asm/irq.h>  /* @@ -186,9 +188,6 @@ EXPORT_SYMBOL(local_bh_enable_ip);   */  #define MAX_SOFTIRQ_RESTART 10 -DEFINE_TRACE(softirq_entry); -DEFINE_TRACE(softirq_exit); -  asmlinkage void __do_softirq(void)  {  	struct softirq_action *h; @@ -828,7 +827,7 @@ int __init __weak arch_early_irq_init(void)  	return 0;  } -int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) +int __weak arch_init_chip_data(struct irq_desc *desc, int node)  {  	return 0;  } diff --git a/kernel/sys.c b/kernel/sys.c index e7998cf3149..438d99a38c8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,6 +14,7 @@  #include <linux/prctl.h>  #include <linux/highuid.h>  #include <linux/fs.h> +#include <linux/perf_counter.h>  #include <linux/resource.h>  #include <linux/kernel.h>  #include <linux/kexec.h> @@ -1793,6 +1794,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  		case PR_SET_TSC:  			error = SET_TSC_CTL(arg2);  			break; +		case PR_TASK_PERF_COUNTERS_DISABLE: +			error = perf_counter_task_disable(); +			break; +		case PR_TASK_PERF_COUNTERS_ENABLE: +			error = perf_counter_task_enable(); +			break;  		case PR_GET_TIMERSLACK:  			error = current->timer_slack_ns;  			break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 27dad296738..68320f6b07b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);  cond_syscall(compat_sys_timerfd_gettime);  cond_syscall(sys_eventfd);  cond_syscall(sys_eventfd2); + +/* performance counters: */ +cond_syscall(sys_perf_counter_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2970d56fb7..ce664f98e3f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@  #include 
<linux/reboot.h>  #include <linux/ftrace.h>  #include <linux/slow-work.h> +#include <linux/perf_counter.h>  #include <asm/uaccess.h>  #include <asm/processor.h> @@ -114,6 +115,7 @@ static int ngroups_max = NGROUPS_MAX;  #ifdef CONFIG_MODULES  extern char modprobe_path[]; +extern int modules_disabled;  #endif  #ifdef CONFIG_CHR_DEV_SG  extern int sg_big_buff; @@ -534,6 +536,17 @@ static struct ctl_table kern_table[] = {  		.proc_handler	= &proc_dostring,  		.strategy	= &sysctl_string,  	}, +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "modules_disabled", +		.data		= &modules_disabled, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		/* only handle a transition from default "0" to "1" */ +		.proc_handler	= &proc_dointvec_minmax, +		.extra1		= &one, +		.extra2		= &one, +	},  #endif  #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)  	{ @@ -731,6 +744,14 @@ static struct ctl_table kern_table[] = {  	},  	{  		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "bootloader_version", +		.data		= &bootloader_version, +		.maxlen		= sizeof (int), +		.mode		= 0444, +		.proc_handler	= &proc_dointvec, +	}, +	{ +		.ctl_name	= CTL_UNNUMBERED,  		.procname	= "kstack_depth_to_print",  		.data		= &kstack_depth_to_print,  		.maxlen		= sizeof(int), @@ -912,6 +933,32 @@ static struct ctl_table kern_table[] = {  		.child		= slow_work_sysctls,  	},  #endif +#ifdef CONFIG_PERF_COUNTERS +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "perf_counter_paranoid", +		.data		= &sysctl_perf_counter_paranoid, +		.maxlen		= sizeof(sysctl_perf_counter_paranoid), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec, +	}, +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "perf_counter_mlock_kb", +		.data		= &sysctl_perf_counter_mlock, +		.maxlen		= sizeof(sysctl_perf_counter_mlock), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec, +	}, +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "perf_counter_max_sample_rate", +		.data		= &sysctl_perf_counter_sample_rate, +		.maxlen		= sizeof(sysctl_perf_counter_sample_rate), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec, +	}, +#endif  /*   * NOTE: do not add new entries to this table unless you have read   * Documentation/sysctl/ctl_unnumbered.txt @@ -1225,7 +1272,6 @@ static struct ctl_table vm_table[] = {  		.strategy	= &sysctl_jiffies,  	},  #endif -#ifdef CONFIG_SECURITY  	{  		.ctl_name	= CTL_UNNUMBERED,  		.procname	= "mmap_min_addr", @@ -1234,7 +1280,6 @@ static struct ctl_table vm_table[] = {  		.mode		= 0644,  		.proc_handler	= &proc_doulongvec_minmax,  	}, -#endif  #ifdef CONFIG_NUMA  	{  		.ctl_name	= CTL_UNNUMBERED, diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ecfd7b5187e..80189f6f1c5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c)  	unsigned long flags;  	int ret; -	/* save mult_orig on registration */ -	c->mult_orig = c->mult; -  	spin_lock_irqsave(&clocksource_lock, flags);  	ret = clocksource_enqueue(c);  	if (!ret) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e..e8c77d9c633 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,7 +22,7 @@  /*   * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. + * playing with xtime.   
*/  __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); @@ -77,6 +77,10 @@ static void clocksource_forward_now(void)  	clock->cycle_last = cycle_now;  	nsec = cyc2ns(clock, cycle_delta); + +	/* If arch requires, add in gettimeoffset() */ +	nsec += arch_gettimeoffset(); +  	timespec_add_ns(&xtime, nsec);  	nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; @@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts)  		/* convert to nanoseconds: */  		nsecs = cyc2ns(clock, cycle_delta); +		/* If arch requires, add in gettimeoffset() */ +		nsecs += arch_gettimeoffset(); +  	} while (read_seqretry(&xtime_lock, seq));  	timespec_add_ns(ts, nsecs); diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c3..faf2db897de 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@  #include <linux/delay.h>  #include <linux/tick.h>  #include <linux/kallsyms.h> +#include <linux/perf_counter.h>  #include <asm/uaccess.h>  #include <asm/unistd.h> @@ -756,6 +757,7 @@ void add_timer_on(struct timer_list *timer, int cpu)  	wake_up_idle_cpu(cpu);  	spin_unlock_irqrestore(&base->lock, flags);  } +EXPORT_SYMBOL_GPL(add_timer_on);  /**   * del_timer - deactive a timer. @@ -1123,53 +1125,14 @@ void update_process_times(int user_tick)  }  /* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ -	return nr_active() * FIXED_1; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - * - * Requires xtime_lock to access. - */ -unsigned long avenrun[3]; - -EXPORT_SYMBOL(avenrun); - -/* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. - */ -static inline void calc_load(unsigned long ticks) -{ -	unsigned long active_tasks; /* fixed-point */ -	static int count = LOAD_FREQ; - -	count -= ticks; -	if (unlikely(count < 0)) { -		active_tasks = count_active_tasks(); -		do { -			CALC_LOAD(avenrun[0], EXP_1, active_tasks); -			CALC_LOAD(avenrun[1], EXP_5, active_tasks); -			CALC_LOAD(avenrun[2], EXP_15, active_tasks); -			count += LOAD_FREQ; -		} while (count < 0); -	} -} - -/*   * This function runs timers and the timer-tq in bottom half context.   */  static void run_timer_softirq(struct softirq_action *h)  {  	struct tvec_base *base = __get_cpu_var(tvec_bases); +	perf_counter_do_pending(); +  	hrtimer_run_pending();  	if (time_after_eq(jiffies, base->timer_jiffies)) @@ -1187,16 +1150,6 @@ void run_local_timers(void)  }  /* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! - */ -static inline void update_times(unsigned long ticks) -{ -	update_wall_time(); -	calc_load(ticks); -} - -/*   * The 64-bit jiffies value is not atomic - you MUST NOT read it   * without sampling the sequence number in xtime_lock.   * jiffies is defined in the linker script... 
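The avenrun[]/calc_load() code removed from kernel/timer.c below computed the 1/5/15-minute load averages on every tick while holding xtime_lock; that accounting now lives in the scheduler (calc_global_load() and the per-rq calc_load_active counting seen earlier in sched.c). For reference, a standalone sketch of the fixed-point decay step the CALC_LOAD() macro performs; FSHIFT, FIXED_1 and EXP_1 carry the values used in include/linux/sched.h, and 'active' is the task count already scaled by FIXED_1, as count_active_tasks() returned it:

#define FSHIFT		11			/* bits of fixed-point precision */
#define FIXED_1		(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1		1884			/* FIXED_1 * exp(-5sec/1min) */

/* one CALC_LOAD() step: load = load*exp + active*(1 - exp), in fixed point */
static unsigned long calc_load_step(unsigned long load, unsigned long exp,
				    unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}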
@@ -1205,7 +1158,8 @@ static inline void update_times(unsigned long ticks)  void do_timer(unsigned long ticks)  {  	jiffies_64 += ticks; -	update_times(ticks); +	update_wall_time(); +	calc_global_load();  }  #ifdef __ARCH_WANT_SYS_ALARM @@ -1406,37 +1360,17 @@ int do_sysinfo(struct sysinfo *info)  {  	unsigned long mem_total, sav_total;  	unsigned int mem_unit, bitcount; -	unsigned long seq; +	struct timespec tp;  	memset(info, 0, sizeof(struct sysinfo)); -	do { -		struct timespec tp; -		seq = read_seqbegin(&xtime_lock); - -		/* -		 * This is annoying.  The below is the same thing -		 * posix_get_clock_monotonic() does, but it wants to -		 * take the lock which we want to cover the loads stuff -		 * too. -		 */ - -		getnstimeofday(&tp); -		tp.tv_sec += wall_to_monotonic.tv_sec; -		tp.tv_nsec += wall_to_monotonic.tv_nsec; -		monotonic_to_bootbased(&tp); -		if (tp.tv_nsec - NSEC_PER_SEC >= 0) { -			tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; -			tp.tv_sec++; -		} -		info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); +	ktime_get_ts(&tp); +	monotonic_to_bootbased(&tp); +	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); -		info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); -		info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); -		info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); +	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); -		info->procs = nr_threads; -	} while (read_seqretry(&xtime_lock, seq)); +	info->procs = nr_threads;  	si_meminfo(info);  	si_swapinfo(info); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 417d1985e29..4a13e5a01ce 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -48,6 +48,21 @@ config FTRACE_NMI_ENTER         depends on HAVE_FTRACE_NMI_ENTER         default y +config EVENT_TRACING +	select CONTEXT_SWITCH_TRACER +	bool + +config CONTEXT_SWITCH_TRACER +	select MARKERS +	bool + +# All tracer options should select GENERIC_TRACER. For those options that are +# enabled by all tracers (context switch and event tracer) they select TRACING. +# This allows those options to appear when no other tracer is selected. But the +# options do not appear when something else selects it. We need the two options +# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the +# hidding of the automatic options options. +  config TRACING  	bool  	select DEBUG_FS @@ -56,6 +71,11 @@ config TRACING  	select TRACEPOINTS  	select NOP_TRACER  	select BINARY_PRINTF +	select EVENT_TRACING + +config GENERIC_TRACER +	bool +	select TRACING  #  # Minimum requirements an architecture has to meet for us to @@ -73,14 +93,20 @@ config TRACING_SUPPORT  if TRACING_SUPPORT -menu "Tracers" +menuconfig FTRACE +	bool "Tracers" +	default y if DEBUG_KERNEL +	help +	 Enable the kernel tracing infrastructure. + +if FTRACE  config FUNCTION_TRACER  	bool "Kernel Function Tracer"  	depends on HAVE_FUNCTION_TRACER  	select FRAME_POINTER  	select KALLSYMS -	select TRACING +	select GENERIC_TRACER  	select CONTEXT_SWITCH_TRACER  	help  	  Enable the kernel to trace every kernel function. This is done @@ -104,13 +130,14 @@ config FUNCTION_GRAPH_TRACER  	  the return value. This is done by setting the current return   	  address on the current task structure into a stack of calls. 
+  config IRQSOFF_TRACER  	bool "Interrupts-off Latency Tracer"  	default n  	depends on TRACE_IRQFLAGS_SUPPORT  	depends on GENERIC_TIME  	select TRACE_IRQFLAGS -	select TRACING +	select GENERIC_TRACER  	select TRACER_MAX_TRACE  	help  	  This option measures the time spent in irqs-off critical @@ -131,7 +158,7 @@ config PREEMPT_TRACER  	default n  	depends on GENERIC_TIME  	depends on PREEMPT -	select TRACING +	select GENERIC_TRACER  	select TRACER_MAX_TRACE  	help  	  This option measures the time spent in preemption off critical @@ -150,7 +177,7 @@ config PREEMPT_TRACER  config SYSPROF_TRACER  	bool "Sysprof Tracer"  	depends on X86 -	select TRACING +	select GENERIC_TRACER  	select CONTEXT_SWITCH_TRACER  	help  	  This tracer provides the trace needed by the 'Sysprof' userspace @@ -158,40 +185,33 @@ config SYSPROF_TRACER  config SCHED_TRACER  	bool "Scheduling Latency Tracer" -	select TRACING +	select GENERIC_TRACER  	select CONTEXT_SWITCH_TRACER  	select TRACER_MAX_TRACE  	help  	  This tracer tracks the latency of the highest priority task  	  to be scheduled in, starting from the point it has woken up. -config CONTEXT_SWITCH_TRACER -	bool "Trace process context switches" -	select TRACING -	select MARKERS -	help -	  This tracer gets called from the context switch and records -	  all switching of tasks. - -config EVENT_TRACER -	bool "Trace various events in the kernel" +config ENABLE_DEFAULT_TRACERS +	bool "Trace process context switches and events" +	depends on !GENERIC_TRACER  	select TRACING  	help  	  This tracer hooks to various trace points in the kernel  	  allowing the user to pick and choose which trace point they -	  want to trace. +	  want to trace. It also includes the sched_switch tracer plugin.  config FTRACE_SYSCALLS  	bool "Trace syscalls"  	depends on HAVE_FTRACE_SYSCALLS -	select TRACING +	select GENERIC_TRACER  	select KALLSYMS  	help  	  Basic tracer to catch the syscall entry and exit events.  config BOOT_TRACER  	bool "Trace boot initcalls" -	select TRACING +	select GENERIC_TRACER  	select CONTEXT_SWITCH_TRACER  	help  	  This tracer helps developers to optimize boot times: it records @@ -207,8 +227,36 @@ config BOOT_TRACER  	  to enable this on bootup.  config TRACE_BRANCH_PROFILING +	bool +	select GENERIC_TRACER + +choice +	prompt "Branch Profiling" +	default BRANCH_PROFILE_NONE +	help +	 The branch profiling is a software profiler. It will add hooks +	 into the C conditionals to test which path a branch takes. + +	 The likely/unlikely profiler only looks at the conditions that +	 are annotated with a likely or unlikely macro. + +	 The "all branch" profiler will profile every if statement in the +	 kernel. This profiler will also enable the likely/unlikely +	 profiler as well. + +	 Either of the above profilers add a bit of overhead to the system. +	 If unsure choose "No branch profiling". + +config BRANCH_PROFILE_NONE +	bool "No branch profiling" +	help +	 No branch profiling. Branch profiling adds a bit of overhead. +	 Only enable it if you want to analyse the branching behavior. +	 Otherwise keep it disabled. + +config PROFILE_ANNOTATED_BRANCHES  	bool "Trace likely/unlikely profiler" -	select TRACING +	select TRACE_BRANCH_PROFILING  	help  	  This tracer profiles all the the likely and unlikely macros  	  in the kernel. It will display the results in: @@ -218,11 +266,9 @@ config TRACE_BRANCH_PROFILING  	  Note: this will add a significant overhead, only turn this  	  on if you need to profile the system's use of these macros. -	  Say N if unsure. 
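The "Branch Profiling" choice introduced above folds the old standalone options into one selector: the annotated profiler instruments only conditionals explicitly wrapped in likely()/unlikely(), while PROFILE_ALL_BRANCHES (next hunk) instruments every if (). An illustrative annotated branch of the kind it counts hits and misses for; example code, not taken from this patch:

#include <linux/compiler.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

static int fetch_value(unsigned long __user *ubuf, unsigned long *value)
{
	/* the annotated profiler records, per call site, how often this
	 * unlikely() prediction turns out right and how often it is wrong */
	if (unlikely(copy_from_user(value, ubuf, sizeof(*value))))
		return -EFAULT;
	return 0;
}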
-  config PROFILE_ALL_BRANCHES  	bool "Profile all if conditionals" -	depends on TRACE_BRANCH_PROFILING +	select TRACE_BRANCH_PROFILING  	help  	  This tracer profiles all branch conditions. Every if ()  	  taken in the kernel is recorded whether it hit or miss. @@ -230,11 +276,12 @@ config PROFILE_ALL_BRANCHES  	  /debugfs/tracing/profile_branch +	  This option also enables the likely/unlikely profiler. +  	  This configuration, when enabled, will impose a great overhead  	  on the system. This should only be enabled when the system  	  is to be analyzed - -	  Say N if unsure. +endchoice  config TRACING_BRANCHES  	bool @@ -261,7 +308,7 @@ config BRANCH_TRACER  config POWER_TRACER  	bool "Trace power consumption behavior"  	depends on X86 -	select TRACING +	select GENERIC_TRACER  	help  	  This tracer helps developers to analyze and optimize the kernels  	  power management decisions, specifically the C-state and P-state @@ -295,14 +342,14 @@ config STACK_TRACER  config HW_BRANCH_TRACER  	depends on HAVE_HW_BRANCH_TRACER  	bool "Trace hw branches" -	select TRACING +	select GENERIC_TRACER  	help  	  This tracer records all branches on the system in a circular  	  buffer giving access to the last N branches for each cpu.  config KMEMTRACE  	bool "Trace SLAB allocations" -	select TRACING +	select GENERIC_TRACER  	help  	  kmemtrace provides tracing for slab allocator functions, such as  	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected @@ -322,7 +369,7 @@ config KMEMTRACE  config WORKQUEUE_TRACER  	bool "Trace workqueues" -	select TRACING +	select GENERIC_TRACER  	help  	  The workqueue tracer provides some statistical informations            about each cpu workqueue thread such as the number of the @@ -338,7 +385,7 @@ config BLK_DEV_IO_TRACE  	select RELAY  	select DEBUG_FS  	select TRACEPOINTS -	select TRACING +	select GENERIC_TRACER  	select STACKTRACE  	help  	  Say Y here if you want to be able to trace the block layer actions @@ -375,6 +422,20 @@ config DYNAMIC_FTRACE  	 were made. If so, it runs stop_machine (stops all CPUS)  	 and modifies the code to jump over the call to ftrace. +config FUNCTION_PROFILER +	bool "Kernel function profiler" +	depends on FUNCTION_TRACER +	default n +	help +	 This option enables the kernel function profiler. A file is created +	 in debugfs called function_profile_enabled which defaults to zero. +	 When a 1 is echoed into this file profiling begins, and when a +	 zero is entered, profiling stops. A file in the trace_stats +	 directory called functions, that show the list of functions that +	 have been hit and their counters. + +	 If in doubt, say N +  config FTRACE_MCOUNT_RECORD  	def_bool y  	depends on DYNAMIC_FTRACE @@ -385,7 +446,7 @@ config FTRACE_SELFTEST  config FTRACE_STARTUP_TEST  	bool "Perform a startup test on ftrace" -	depends on TRACING +	depends on GENERIC_TRACER  	select FTRACE_SELFTEST  	help  	  This option performs a series of startup tests on ftrace. On bootup @@ -396,7 +457,7 @@ config FTRACE_STARTUP_TEST  config MMIOTRACE  	bool "Memory mapped IO tracing"  	depends on HAVE_MMIOTRACE_SUPPORT && PCI -	select TRACING +	select GENERIC_TRACER  	help  	  Mmiotrace traces Memory Mapped I/O access and is meant for  	  debugging and reverse engineering. It is called from the ioremap @@ -416,7 +477,23 @@ config MMIOTRACE_TEST  	  Say N, unless you absolutely know what you are doing. 
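The FUNCTION_PROFILER help text above describes a debugfs switch; a minimal userspace sketch for flipping it, assuming debugfs is mounted at /sys/kernel/debug and that the file is created in the main tracing directory, as the ftrace code later in this diff does (the per-cpu counters are then registered as stat files under trace_stat/):

#include <fcntl.h>
#include <unistd.h>

/* write "1" or "0" to the profiler switch; the path assumes a standard
 * debugfs mount point */
static int set_function_profiling(int on)
{
	int fd = open("/sys/kernel/debug/tracing/function_profile_enabled",
		      O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, on ? "1" : "0", 1);
	close(fd);
	return n == 1 ? 0 : -1;
}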
-endmenu +config RING_BUFFER_BENCHMARK +	tristate "Ring buffer benchmark stress tester" +	depends on RING_BUFFER +	help +	  This option creates a test to stress the ring buffer and bench mark it. +	  It creates its own ring buffer such that it will not interfer with +	  any other users of the ring buffer (such as ftrace). It then creates +	  a producer and consumer that will run for 10 seconds and sleep for +	  10 seconds. Each interval it will print out the number of events +	  it recorded and give a rough estimate of how long each iteration took. + +	  It does not disable interrupts or raise its priority, so it may be +	  affected by processes that are running. + +	  If unsure, say N + +endif # FTRACE  endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2630f5121ec..844164dca90 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -15,11 +15,17 @@ ifdef CONFIG_TRACING_BRANCHES  KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING  endif +# +# Make the trace clocks available generally: it's infrastructure +# relied on by ptrace for example: +# +obj-y += trace_clock.o +  obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o  obj-$(CONFIG_RING_BUFFER) += ring_buffer.o +obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o  obj-$(CONFIG_TRACING) += trace.o -obj-$(CONFIG_TRACING) += trace_clock.o  obj-$(CONFIG_TRACING) += trace_output.o  obj-$(CONFIG_TRACING) += trace_stat.o  obj-$(CONFIG_TRACING) += trace_printk.o @@ -39,12 +45,14 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o  obj-$(CONFIG_POWER_TRACER) += trace_power.o  obj-$(CONFIG_KMEMTRACE) += kmemtrace.o  obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o -obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o -obj-$(CONFIG_EVENT_TRACER) += trace_events.o -obj-$(CONFIG_EVENT_TRACER) += events.o -obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +ifeq ($(CONFIG_BLOCK),y) +obj-$(CONFIG_EVENT_TRACING) += blktrace.o +endif +obj-$(CONFIG_EVENT_TRACING) += trace_events.o +obj-$(CONFIG_EVENT_TRACING) += trace_export.o  obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o  obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o -obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o  libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 921ef5d1f0b..39af8af6fc3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,10 +23,14 @@  #include <linux/mutex.h>  #include <linux/debugfs.h>  #include <linux/time.h> -#include <trace/block.h>  #include <linux/uaccess.h> + +#include <trace/events/block.h> +  #include "trace_output.h" +#ifdef CONFIG_BLK_DEV_IO_TRACE +  static unsigned int blktrace_seq __read_mostly = 1;  static struct trace_array *blk_tr; @@ -147,7 +151,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,  {  	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)  		return 1; -	if (sector < bt->start_lba || sector > bt->end_lba) +	if (sector && (sector < bt->start_lba || sector > bt->end_lba))  		return 1;  	if (bt->pid && pid != bt->pid)  		return 1; @@ -192,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,  	what |= MASK_TC_BIT(rw, DISCARD);  	pid = tsk->pid; -	if (unlikely(act_log_check(bt, what, sector, pid))) +	if (act_log_check(bt, what, sector, pid))  		return;  	cpu = raw_smp_processor_id(); @@ -262,6 +266,7 @@ static void blk_trace_free(struct blk_trace *bt)  {  	
debugfs_remove(bt->msg_file);  	debugfs_remove(bt->dropped_file); +	debugfs_remove(bt->dir);  	relay_close(bt->rchan);  	free_percpu(bt->sequence);  	free_percpu(bt->msg_data); @@ -403,11 +408,29 @@ static struct rchan_callbacks blk_relay_callbacks = {  	.remove_buf_file	= blk_remove_buf_file_callback,  }; +static void blk_trace_setup_lba(struct blk_trace *bt, +				struct block_device *bdev) +{ +	struct hd_struct *part = NULL; + +	if (bdev) +		part = bdev->bd_part; + +	if (part) { +		bt->start_lba = part->start_sect; +		bt->end_lba = part->start_sect + part->nr_sects; +	} else { +		bt->start_lba = 0; +		bt->end_lba = -1ULL; +	} +} +  /*   * Setup everything required to start tracing   */  int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, -			struct blk_user_trace_setup *buts) +		       struct block_device *bdev, +		       struct blk_user_trace_setup *buts)  {  	struct blk_trace *old_bt, *bt = NULL;  	struct dentry *dir = NULL; @@ -480,10 +503,13 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,  	if (!bt->act_mask)  		bt->act_mask = (u16) -1; -	bt->start_lba = buts->start_lba; -	bt->end_lba = buts->end_lba; -	if (!bt->end_lba) -		bt->end_lba = -1ULL; +	blk_trace_setup_lba(bt, bdev); + +	/* overwrite with user settings */ +	if (buts->start_lba) +		bt->start_lba = buts->start_lba; +	if (buts->end_lba) +		bt->end_lba = buts->end_lba;  	bt->pid = buts->pid;  	bt->trace_state = Blktrace_setup; @@ -505,6 +531,7 @@ err:  }  int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, +		    struct block_device *bdev,  		    char __user *arg)  {  	struct blk_user_trace_setup buts; @@ -514,7 +541,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,  	if (ret)  		return -EFAULT; -	ret = do_blk_trace_setup(q, name, dev, &buts); +	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);  	if (ret)  		return ret; @@ -582,7 +609,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)  	switch (cmd) {  	case BLKTRACESETUP:  		bdevname(bdev, b); -		ret = blk_trace_setup(q, b, bdev->bd_dev, arg); +		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);  		break;  	case BLKTRACESTART:  		start = 1; @@ -642,12 +669,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,  	if (blk_pc_request(rq)) {  		what |= BLK_TC_ACT(BLK_TC_PC); -		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, -				rq->cmd_len, rq->cmd); +		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, +				what, rq->errors, rq->cmd_len, rq->cmd);  	} else  {  		what |= BLK_TC_ACT(BLK_TC_FS); -		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, -				rw, what, rq->errors, 0, NULL); +		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, +				what, rq->errors, 0, NULL);  	}  } @@ -809,7 +836,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,   * @bio:	the source bio   * @dev:	target device   * @from:	source sector - * @to:		target sector   *   * Description:   *     Device mapper or raid target sometimes need to split a bio because @@ -817,7 +843,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,   *   **/  static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, -				       dev_t dev, sector_t from, sector_t to) +				       dev_t dev, sector_t from)  {  	struct blk_trace *bt = q->blk_trace;  	struct blk_io_trace_remap r; @@ -825,12 +851,13 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,  	if (likely(!bt))  		
return; -	r.device = cpu_to_be32(dev); -	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); -	r.sector = cpu_to_be64(to); +	r.device_from = cpu_to_be32(dev); +	r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev); +	r.sector_from = cpu_to_be64(from); -	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, -			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); +	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, +			BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), +			sizeof(r), &r);  }  /** @@ -854,11 +881,11 @@ void blk_add_driver_data(struct request_queue *q,  		return;  	if (blk_pc_request(rq)) -		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, -				rq->errors, len, data); +		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, +				BLK_TA_DRV_DATA, rq->errors, len, data);  	else -		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, -				0, BLK_TA_DRV_DATA, rq->errors, len, data); +		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, +				BLK_TA_DRV_DATA, rq->errors, len, data);  }  EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -971,6 +998,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)  	return te_blk_io_trace(ent) + 1;  } +static inline u32 t_action(const struct trace_entry *ent) +{ +	return te_blk_io_trace(ent)->action; +} + +static inline u32 t_bytes(const struct trace_entry *ent) +{ +	return te_blk_io_trace(ent)->bytes; +} +  static inline u32 t_sec(const struct trace_entry *ent)  {  	return te_blk_io_trace(ent)->bytes >> 9; @@ -996,11 +1033,11 @@ static void get_pdu_remap(const struct trace_entry *ent,  			  struct blk_io_trace_remap *r)  {  	const struct blk_io_trace_remap *__r = pdu_start(ent); -	__u64 sector = __r->sector; +	__u64 sector_from = __r->sector_from; -	r->device = be32_to_cpu(__r->device);  	r->device_from = be32_to_cpu(__r->device_from); -	r->sector = be64_to_cpu(sector); +	r->device_to   = be32_to_cpu(__r->device_to); +	r->sector_from = be64_to_cpu(sector_from);  }  typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); @@ -1031,36 +1068,98 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)  				MAJOR(t->device), MINOR(t->device), act, rwbs);  } +static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +{ +	const unsigned char *pdu_buf; +	int pdu_len; +	int i, end, ret; + +	pdu_buf = pdu_start(ent); +	pdu_len = te_blk_io_trace(ent)->pdu_len; + +	if (!pdu_len) +		return 1; + +	/* find the last zero that needs to be printed */ +	for (end = pdu_len - 1; end >= 0; end--) +		if (pdu_buf[end]) +			break; +	end++; + +	if (!trace_seq_putc(s, '(')) +		return 0; + +	for (i = 0; i < pdu_len; i++) { + +		ret = trace_seq_printf(s, "%s%02x", +				       i == 0 ? "" : " ", pdu_buf[i]); +		if (!ret) +			return ret; + +		/* +		 * stop when the rest is just zeroes and indicate so +		 * with a ".." appended +		 */ +		if (i == end && end != pdu_len - 1) +			return trace_seq_puts(s, " ..) 
"); +	} + +	return trace_seq_puts(s, ") "); +} +  static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)  {  	char cmd[TASK_COMM_LEN];  	trace_find_cmdline(ent->pid, cmd); -	if (t_sec(ent)) -		return trace_seq_printf(s, "%llu + %u [%s]\n", -					t_sector(ent), t_sec(ent), cmd); -	return trace_seq_printf(s, "[%s]\n", cmd); +	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { +		int ret; + +		ret = trace_seq_printf(s, "%u ", t_bytes(ent)); +		if (!ret) +			return 0; +		ret = blk_log_dump_pdu(s, ent); +		if (!ret) +			return 0; +		return trace_seq_printf(s, "[%s]\n", cmd); +	} else { +		if (t_sec(ent)) +			return trace_seq_printf(s, "%llu + %u [%s]\n", +						t_sector(ent), t_sec(ent), cmd); +		return trace_seq_printf(s, "[%s]\n", cmd); +	}  }  static int blk_log_with_error(struct trace_seq *s,  			      const struct trace_entry *ent)  { -	if (t_sec(ent)) -		return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), -					t_sec(ent), t_error(ent)); -	return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); +	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { +		int ret; + +		ret = blk_log_dump_pdu(s, ent); +		if (ret) +			return trace_seq_printf(s, "[%d]\n", t_error(ent)); +		return 0; +	} else { +		if (t_sec(ent)) +			return trace_seq_printf(s, "%llu + %u [%d]\n", +						t_sector(ent), +						t_sec(ent), t_error(ent)); +		return trace_seq_printf(s, "%llu [%d]\n", +					t_sector(ent), t_error(ent)); +	}  }  static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)  { -	struct blk_io_trace_remap r = { .device = 0, }; +	struct blk_io_trace_remap r = { .device_from = 0, };  	get_pdu_remap(ent, &r);  	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", -			       t_sector(ent), -			       t_sec(ent), MAJOR(r.device), MINOR(r.device), -			       (unsigned long long)r.sector); +				t_sector(ent), t_sec(ent), +				MAJOR(r.device_from), MINOR(r.device_from), +				(unsigned long long)r.sector_from);  }  static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) @@ -1117,7 +1216,6 @@ static void blk_tracer_print_header(struct seq_file *m)  static void blk_tracer_start(struct trace_array *tr)  {  	blk_tracer_enabled = true; -	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;  }  static int blk_tracer_init(struct trace_array *tr) @@ -1130,7 +1228,6 @@ static int blk_tracer_init(struct trace_array *tr)  static void blk_tracer_stop(struct trace_array *tr)  {  	blk_tracer_enabled = false; -	trace_flags |= TRACE_ITER_CONTEXT_INFO;  }  static void blk_tracer_reset(struct trace_array *tr) @@ -1182,7 +1279,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,  	}  	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) -		ret = trace_seq_printf(s, "Bad pc action %x\n", what); +		ret = trace_seq_printf(s, "Unknown action %x\n", what);  	else {  		ret = log_action(iter, what2act[what].act[long_act]);  		if (ret) @@ -1195,9 +1292,6 @@ out:  static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,  					       int flags)  { -	if (!trace_print_context(iter)) -		return TRACE_TYPE_PARTIAL_LINE; -  	return print_one_line(iter, false);  } @@ -1232,6 +1326,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)  	return print_one_line(iter, true);  } +static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +{ +	/* don't output context-info for blk_classic output */ +	if (bit == TRACE_BLK_OPT_CLASSIC) { +		if (set) +			trace_flags &= ~TRACE_ITER_CONTEXT_INFO; +		else +			trace_flags |= 
TRACE_ITER_CONTEXT_INFO; +	} +	return 0; +} +  static struct tracer blk_tracer __read_mostly = {  	.name		= "blk",  	.init		= blk_tracer_init, @@ -1241,6 +1347,7 @@ static struct tracer blk_tracer __read_mostly = {  	.print_header	= blk_tracer_print_header,  	.print_line	= blk_tracer_print_line,  	.flags		= &blk_tracer_flags, +	.set_flag	= blk_tracer_set_flag,  };  static struct trace_event trace_blk_event = { @@ -1285,7 +1392,8 @@ static int blk_trace_remove_queue(struct request_queue *q)  /*   * Setup everything required to start tracing   */ -static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +static int blk_trace_setup_queue(struct request_queue *q, +				 struct block_device *bdev)  {  	struct blk_trace *old_bt, *bt = NULL;  	int ret = -ENOMEM; @@ -1298,9 +1406,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)  	if (!bt->msg_data)  		goto free_bt; -	bt->dev = dev; +	bt->dev = bdev->bd_dev;  	bt->act_mask = (u16)-1; -	bt->end_lba = -1ULL; + +	blk_trace_setup_lba(bt, bdev);  	old_bt = xchg(&q->blk_trace, bt);  	if (old_bt != NULL) { @@ -1517,7 +1626,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,  	if (attr == &dev_attr_enable) {  		if (value) -			ret = blk_trace_setup_queue(q, bdev->bd_dev); +			ret = blk_trace_setup_queue(q, bdev);  		else  			ret = blk_trace_remove_queue(q);  		goto out_unlock_bdev; @@ -1525,7 +1634,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,  	ret = 0;  	if (q->blk_trace == NULL) -		ret = blk_trace_setup_queue(q, bdev->bd_dev); +		ret = blk_trace_setup_queue(q, bdev);  	if (ret == 0) {  		if (attr == &dev_attr_act_mask) @@ -1548,3 +1657,77 @@ out:  	return ret ? ret : count;  } +int blk_trace_init_sysfs(struct device *dev) +{ +	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); +} + +#endif /* CONFIG_BLK_DEV_IO_TRACE */ + +#ifdef CONFIG_EVENT_TRACING + +void blk_dump_cmd(char *buf, struct request *rq) +{ +	int i, end; +	int len = rq->cmd_len; +	unsigned char *cmd = rq->cmd; + +	if (!blk_pc_request(rq)) { +		buf[0] = '\0'; +		return; +	} + +	for (end = len - 1; end >= 0; end--) +		if (cmd[end]) +			break; +	end++; + +	for (i = 0; i < len; i++) { +		buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]); +		if (i == end && end != len - 1) { +			sprintf(buf, " .."); +			break; +		} +	} +} + +void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +{ +	int i = 0; + +	if (rw & WRITE) +		rwbs[i++] = 'W'; +	else if (rw & 1 << BIO_RW_DISCARD) +		rwbs[i++] = 'D'; +	else if (bytes) +		rwbs[i++] = 'R'; +	else +		rwbs[i++] = 'N'; + +	if (rw & 1 << BIO_RW_AHEAD) +		rwbs[i++] = 'A'; +	if (rw & 1 << BIO_RW_BARRIER) +		rwbs[i++] = 'B'; +	if (rw & 1 << BIO_RW_SYNCIO) +		rwbs[i++] = 'S'; +	if (rw & 1 << BIO_RW_META) +		rwbs[i++] = 'M'; + +	rwbs[i] = '\0'; +} + +void blk_fill_rwbs_rq(char *rwbs, struct request *rq) +{ +	int rw = rq->cmd_flags & 0x03; +	int bytes; + +	if (blk_discard_rq(rq)) +		rw |= (1 << BIO_RW_DISCARD); + +	bytes = blk_rq_bytes(rq); + +	blk_fill_rwbs(rwbs, rw, bytes); +} + +#endif /* CONFIG_EVENT_TRACING */ + diff --git a/kernel/trace/events.c b/kernel/trace/events.c deleted file mode 100644 index 246f2aa6dc4..00000000000 --- a/kernel/trace/events.c +++ /dev/null @@ -1,14 +0,0 @@ -/* - * This is the place to register all trace points as events. 
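blk_fill_rwbs(), added to blktrace.c just above, encodes a request's direction and flags into the short "rwbs" string printed by the trace output. A few illustrative calls and the strings they produce, derived from that code (READ, WRITE and the BIO_RW_* flags are the kernel's own definitions):

	char rwbs[8];

	blk_fill_rwbs(rwbs, WRITE | (1 << BIO_RW_BARRIER), 4096);	/* "WB" */
	blk_fill_rwbs(rwbs, READ, 4096);				/* "R"  */
	blk_fill_rwbs(rwbs, READ, 0);					/* "N"  */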
- */ - -#include <linux/stringify.h> - -#include <trace/trace_events.h> - -#include "trace_output.h" - -#include "trace_events_stage_1.h" -#include "trace_events_stage_2.h" -#include "trace_events_stage_3.h" - diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1ed080406c..bb60732ade0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,11 +29,13 @@  #include <linux/list.h>  #include <linux/hash.h> -#include <trace/sched.h> +#include <trace/events/sched.h>  #include <asm/ftrace.h> +#include <asm/setup.h> -#include "trace.h" +#include "trace_output.h" +#include "trace_stat.h"  #define FTRACE_WARN_ON(cond)			\  	do {					\ @@ -68,7 +70,7 @@ static DEFINE_MUTEX(ftrace_lock);  static struct ftrace_ops ftrace_list_end __read_mostly =  { -	.func = ftrace_stub, +	.func		= ftrace_stub,  };  static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; @@ -240,6 +242,580 @@ static void ftrace_update_pid_func(void)  #endif  } +#ifdef CONFIG_FUNCTION_PROFILER +struct ftrace_profile { +	struct hlist_node		node; +	unsigned long			ip; +	unsigned long			counter; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	unsigned long long		time; +#endif +}; + +struct ftrace_profile_page { +	struct ftrace_profile_page	*next; +	unsigned long			index; +	struct ftrace_profile		records[]; +}; + +struct ftrace_profile_stat { +	atomic_t			disabled; +	struct hlist_head		*hash; +	struct ftrace_profile_page	*pages; +	struct ftrace_profile_page	*start; +	struct tracer_stat		stat; +}; + +#define PROFILE_RECORDS_SIZE						\ +	(PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) + +#define PROFILES_PER_PAGE					\ +	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) + +static int ftrace_profile_bits __read_mostly; +static int ftrace_profile_enabled __read_mostly; + +/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ +static DEFINE_MUTEX(ftrace_profile_lock); + +static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); + +#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ + +static void * +function_stat_next(void *v, int idx) +{ +	struct ftrace_profile *rec = v; +	struct ftrace_profile_page *pg; + +	pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); + + again: +	rec++; +	if ((void *)rec >= (void *)&pg->records[pg->index]) { +		pg = pg->next; +		if (!pg) +			return NULL; +		rec = &pg->records[0]; +		if (!rec->counter) +			goto again; +	} + +	return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ +	struct ftrace_profile_stat *stat = +		container_of(trace, struct ftrace_profile_stat, stat); + +	if (!stat || !stat->start) +		return NULL; + +	return function_stat_next(&stat->start->records[0], 0); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* function graph compares on total time */ +static int function_stat_cmp(void *p1, void *p2) +{ +	struct ftrace_profile *a = p1; +	struct ftrace_profile *b = p2; + +	if (a->time < b->time) +		return -1; +	if (a->time > b->time) +		return 1; +	else +		return 0; +} +#else +/* not function graph compares against hits */ +static int function_stat_cmp(void *p1, void *p2) +{ +	struct ftrace_profile *a = p1; +	struct ftrace_profile *b = p2; + +	if (a->counter < b->counter) +		return -1; +	if (a->counter > b->counter) +		return 1; +	else +		return 0; +} +#endif + +static int function_stat_headers(struct seq_file *m) +{ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	seq_printf(m, "  Function                               " +		   "Hit    Time            Avg\n" +		      "  
--------                               " +		   "---    ----            ---\n"); +#else +	seq_printf(m, "  Function                               Hit\n" +		      "  --------                               ---\n"); +#endif +	return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ +	struct ftrace_profile *rec = v; +	char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	static DEFINE_MUTEX(mutex); +	static struct trace_seq s; +	unsigned long long avg; +#endif + +	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); +	seq_printf(m, "  %-30.30s  %10lu", str, rec->counter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	seq_printf(m, "    "); +	avg = rec->time; +	do_div(avg, rec->counter); + +	mutex_lock(&mutex); +	trace_seq_init(&s); +	trace_print_graph_duration(rec->time, &s); +	trace_seq_puts(&s, "    "); +	trace_print_graph_duration(avg, &s); +	trace_print_seq(m, &s); +	mutex_unlock(&mutex); +#endif +	seq_putc(m, '\n'); + +	return 0; +} + +static void ftrace_profile_reset(struct ftrace_profile_stat *stat) +{ +	struct ftrace_profile_page *pg; + +	pg = stat->pages = stat->start; + +	while (pg) { +		memset(pg->records, 0, PROFILE_RECORDS_SIZE); +		pg->index = 0; +		pg = pg->next; +	} + +	memset(stat->hash, 0, +	       FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); +} + +int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) +{ +	struct ftrace_profile_page *pg; +	int functions; +	int pages; +	int i; + +	/* If we already allocated, do nothing */ +	if (stat->pages) +		return 0; + +	stat->pages = (void *)get_zeroed_page(GFP_KERNEL); +	if (!stat->pages) +		return -ENOMEM; + +#ifdef CONFIG_DYNAMIC_FTRACE +	functions = ftrace_update_tot_cnt; +#else +	/* +	 * We do not know the number of functions that exist because +	 * dynamic tracing is what counts them. With past experience +	 * we have around 20K functions. That should be more than enough. +	 * It is highly unlikely we will execute every function in +	 * the kernel. +	 */ +	functions = 20000; +#endif + +	pg = stat->start = stat->pages; + +	pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); + +	for (i = 0; i < pages; i++) { +		pg->next = (void *)get_zeroed_page(GFP_KERNEL); +		if (!pg->next) +			goto out_free; +		pg = pg->next; +	} + +	return 0; + + out_free: +	pg = stat->start; +	while (pg) { +		unsigned long tmp = (unsigned long)pg; + +		pg = pg->next; +		free_page(tmp); +	} + +	free_page((unsigned long)stat->pages); +	stat->pages = NULL; +	stat->start = NULL; + +	return -ENOMEM; +} + +static int ftrace_profile_init_cpu(int cpu) +{ +	struct ftrace_profile_stat *stat; +	int size; + +	stat = &per_cpu(ftrace_profile_stats, cpu); + +	if (stat->hash) { +		/* If the profile is already created, simply reset it */ +		ftrace_profile_reset(stat); +		return 0; +	} + +	/* +	 * We are profiling all functions, but usually only a few thousand +	 * functions are hit. We'll make a hash of 1024 items. 
+	 */ +	size = FTRACE_PROFILE_HASH_SIZE; + +	stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + +	if (!stat->hash) +		return -ENOMEM; + +	if (!ftrace_profile_bits) { +		size--; + +		for (; size; size >>= 1) +			ftrace_profile_bits++; +	} + +	/* Preallocate the function profiling pages */ +	if (ftrace_profile_pages_init(stat) < 0) { +		kfree(stat->hash); +		stat->hash = NULL; +		return -ENOMEM; +	} + +	return 0; +} + +static int ftrace_profile_init(void) +{ +	int cpu; +	int ret = 0; + +	for_each_online_cpu(cpu) { +		ret = ftrace_profile_init_cpu(cpu); +		if (ret) +			break; +	} + +	return ret; +} + +/* interrupts must be disabled */ +static struct ftrace_profile * +ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) +{ +	struct ftrace_profile *rec; +	struct hlist_head *hhd; +	struct hlist_node *n; +	unsigned long key; + +	key = hash_long(ip, ftrace_profile_bits); +	hhd = &stat->hash[key]; + +	if (hlist_empty(hhd)) +		return NULL; + +	hlist_for_each_entry_rcu(rec, n, hhd, node) { +		if (rec->ip == ip) +			return rec; +	} + +	return NULL; +} + +static void ftrace_add_profile(struct ftrace_profile_stat *stat, +			       struct ftrace_profile *rec) +{ +	unsigned long key; + +	key = hash_long(rec->ip, ftrace_profile_bits); +	hlist_add_head_rcu(&rec->node, &stat->hash[key]); +} + +/* + * The memory is already allocated, this simply finds a new record to use. + */ +static struct ftrace_profile * +ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) +{ +	struct ftrace_profile *rec = NULL; + +	/* prevent recursion (from NMIs) */ +	if (atomic_inc_return(&stat->disabled) != 1) +		goto out; + +	/* +	 * Try to find the function again since an NMI +	 * could have added it +	 */ +	rec = ftrace_find_profiled_func(stat, ip); +	if (rec) +		goto out; + +	if (stat->pages->index == PROFILES_PER_PAGE) { +		if (!stat->pages->next) +			goto out; +		stat->pages = stat->pages->next; +	} + +	rec = &stat->pages->records[stat->pages->index++]; +	rec->ip = ip; +	ftrace_add_profile(stat, rec); + + out: +	atomic_dec(&stat->disabled); + +	return rec; +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip) +{ +	struct ftrace_profile_stat *stat; +	struct ftrace_profile *rec; +	unsigned long flags; + +	if (!ftrace_profile_enabled) +		return; + +	local_irq_save(flags); + +	stat = &__get_cpu_var(ftrace_profile_stats); +	if (!stat->hash || !ftrace_profile_enabled) +		goto out; + +	rec = ftrace_find_profiled_func(stat, ip); +	if (!rec) { +		rec = ftrace_profile_alloc(stat, ip); +		if (!rec) +			goto out; +	} + +	rec->counter++; + out: +	local_irq_restore(flags); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int profile_graph_entry(struct ftrace_graph_ent *trace) +{ +	function_profile_call(trace->func, 0); +	return 1; +} + +static void profile_graph_return(struct ftrace_graph_ret *trace) +{ +	struct ftrace_profile_stat *stat; +	unsigned long long calltime; +	struct ftrace_profile *rec; +	unsigned long flags; + +	local_irq_save(flags); +	stat = &__get_cpu_var(ftrace_profile_stats); +	if (!stat->hash || !ftrace_profile_enabled) +		goto out; + +	calltime = trace->rettime - trace->calltime; + +	if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { +		int index; + +		index = trace->depth; + +		/* Append this call time to the parent time to subtract */ +		if (index) +			current->ret_stack[index - 1].subtime += calltime; + +		if (current->ret_stack[index].subtime < calltime) +			calltime -= current->ret_stack[index].subtime; +		else +			calltime = 0; 
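	/*
	 * Worked example of the subtime handling above (illustrative note,
	 * not part of the patch): with graph-time disabled, a function whose
	 * total duration is 10us and whose children accumulated 7us in
	 * ret_stack[depth].subtime gets only 3us added to its profile record
	 * below, while its full 10us has already been added to the parent's
	 * subtime so the parent can subtract it in turn.
	 */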
+	} + +	rec = ftrace_find_profiled_func(stat, trace->func); +	if (rec) +		rec->time += calltime; + + out: +	local_irq_restore(flags); +} + +static int register_ftrace_profiler(void) +{ +	return register_ftrace_graph(&profile_graph_return, +				     &profile_graph_entry); +} + +static void unregister_ftrace_profiler(void) +{ +	unregister_ftrace_graph(); +} +#else +static struct ftrace_ops ftrace_profile_ops __read_mostly = +{ +	.func		= function_profile_call, +}; + +static int register_ftrace_profiler(void) +{ +	return register_ftrace_function(&ftrace_profile_ops); +} + +static void unregister_ftrace_profiler(void) +{ +	unregister_ftrace_function(&ftrace_profile_ops); +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, +		     size_t cnt, loff_t *ppos) +{ +	unsigned long val; +	char buf[64];		/* big enough to hold a number */ +	int ret; + +	if (cnt >= sizeof(buf)) +		return -EINVAL; + +	if (copy_from_user(&buf, ubuf, cnt)) +		return -EFAULT; + +	buf[cnt] = 0; + +	ret = strict_strtoul(buf, 10, &val); +	if (ret < 0) +		return ret; + +	val = !!val; + +	mutex_lock(&ftrace_profile_lock); +	if (ftrace_profile_enabled ^ val) { +		if (val) { +			ret = ftrace_profile_init(); +			if (ret < 0) { +				cnt = ret; +				goto out; +			} + +			ret = register_ftrace_profiler(); +			if (ret < 0) { +				cnt = ret; +				goto out; +			} +			ftrace_profile_enabled = 1; +		} else { +			ftrace_profile_enabled = 0; +			/* +			 * unregister_ftrace_profiler calls stop_machine +			 * so this acts like an synchronize_sched. +			 */ +			unregister_ftrace_profiler(); +		} +	} + out: +	mutex_unlock(&ftrace_profile_lock); + +	filp->f_pos += cnt; + +	return cnt; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, +		     size_t cnt, loff_t *ppos) +{ +	char buf[64];		/* big enough to hold a number */ +	int r; + +	r = sprintf(buf, "%u\n", ftrace_profile_enabled); +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static const struct file_operations ftrace_profile_fops = { +	.open		= tracing_open_generic, +	.read		= ftrace_profile_read, +	.write		= ftrace_profile_write, +}; + +/* used to initialize the real stat files */ +static struct tracer_stat function_stats __initdata = { +	.name		= "functions", +	.stat_start	= function_stat_start, +	.stat_next	= function_stat_next, +	.stat_cmp	= function_stat_cmp, +	.stat_headers	= function_stat_headers, +	.stat_show	= function_stat_show +}; + +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +	struct ftrace_profile_stat *stat; +	struct dentry *entry; +	char *name; +	int ret; +	int cpu; + +	for_each_possible_cpu(cpu) { +		stat = &per_cpu(ftrace_profile_stats, cpu); + +		/* allocate enough for function name + cpu number */ +		name = kmalloc(32, GFP_KERNEL); +		if (!name) { +			/* +			 * The files created are permanent, if something happens +			 * we still do not free memory. 
+			 */ +			kfree(stat); +			WARN(1, +			     "Could not allocate stat file for cpu %d\n", +			     cpu); +			return; +		} +		stat->stat = function_stats; +		snprintf(name, 32, "function%d", cpu); +		stat->stat.name = name; +		ret = register_stat_tracer(&stat->stat); +		if (ret) { +			WARN(1, +			     "Could not register function stat for cpu %d\n", +			     cpu); +			kfree(name); +			return; +		} +	} + +	entry = debugfs_create_file("function_profile_enabled", 0644, +				    d_tracer, NULL, &ftrace_profile_fops); +	if (!entry) +		pr_warning("Could not create debugfs " +			   "'function_profile_enabled' entry\n"); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ +  /* set when tracing only a pid */  struct pid *ftrace_pid_trace;  static struct pid * const ftrace_swapper_pid = &init_struct_pid; @@ -261,7 +837,6 @@ struct ftrace_func_probe {  	struct rcu_head		rcu;  }; -  enum {  	FTRACE_ENABLE_CALLS		= (1 << 0),  	FTRACE_DISABLE_CALLS		= (1 << 1), @@ -346,30 +921,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)  	rec->flags |= FTRACE_FL_FREE;  } -void ftrace_release(void *start, unsigned long size) -{ -	struct dyn_ftrace *rec; -	struct ftrace_page *pg; -	unsigned long s = (unsigned long)start; -	unsigned long e = s + size; - -	if (ftrace_disabled || !start) -		return; - -	mutex_lock(&ftrace_lock); -	do_for_each_ftrace_rec(pg, rec) { -		if ((rec->ip >= s) && (rec->ip < e)) { -			/* -			 * rec->ip is changed in ftrace_free_rec() -			 * It should not between s and e if record was freed. -			 */ -			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); -			ftrace_free_rec(rec); -		} -	} while_for_each_ftrace_rec(); -	mutex_unlock(&ftrace_lock); -} -  static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)  {  	struct dyn_ftrace *rec; @@ -1408,7 +1959,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)  static struct ftrace_ops trace_probe_ops __read_mostly =  { -	.func = function_trace_probe_call, +	.func		= function_trace_probe_call,  };  static int ftrace_probe_registered; @@ -1823,6 +2374,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)  	ftrace_set_regex(buf, len, reset, 0);  } +/* + * command line interface to allow users to set filters on boot up. 
+ */ +#define FTRACE_FILTER_SIZE		COMMAND_LINE_SIZE +static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; + +static int __init set_ftrace_notrace(char *str) +{ +	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); +	return 1; +} +__setup("ftrace_notrace=", set_ftrace_notrace); + +static int __init set_ftrace_filter(char *str) +{ +	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); +	return 1; +} +__setup("ftrace_filter=", set_ftrace_filter); + +static void __init set_ftrace_early_filter(char *buf, int enable) +{ +	char *func; + +	while (buf) { +		func = strsep(&buf, ","); +		ftrace_set_regex(func, strlen(func), 0, enable); +	} +} + +static void __init set_ftrace_early_filters(void) +{ +	if (ftrace_filter_buf[0]) +		set_ftrace_early_filter(ftrace_filter_buf, 1); +	if (ftrace_notrace_buf[0]) +		set_ftrace_early_filter(ftrace_notrace_buf, 0); +} +  static int  ftrace_regex_release(struct inode *inode, struct file *file, int enable)  { @@ -2128,38 +2718,23 @@ static const struct file_operations ftrace_graph_fops = {  static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)  { -	struct dentry *entry; -	entry = debugfs_create_file("available_filter_functions", 0444, -				    d_tracer, NULL, &ftrace_avail_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'available_filter_functions' entry\n"); +	trace_create_file("available_filter_functions", 0444, +			d_tracer, NULL, &ftrace_avail_fops); -	entry = debugfs_create_file("failures", 0444, -				    d_tracer, NULL, &ftrace_failures_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'failures' entry\n"); +	trace_create_file("failures", 0444, +			d_tracer, NULL, &ftrace_failures_fops); -	entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, -				    NULL, &ftrace_filter_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'set_ftrace_filter' entry\n"); +	trace_create_file("set_ftrace_filter", 0644, d_tracer, +			NULL, &ftrace_filter_fops); -	entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, +	trace_create_file("set_ftrace_notrace", 0644, d_tracer,  				    NULL, &ftrace_notrace_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'set_ftrace_notrace' entry\n");  #ifdef CONFIG_FUNCTION_GRAPH_TRACER -	entry = debugfs_create_file("set_graph_function", 0444, d_tracer, +	trace_create_file("set_graph_function", 0444, d_tracer,  				    NULL,  				    &ftrace_graph_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'set_graph_function' entry\n");  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */  	return 0; @@ -2197,14 +2772,72 @@ static int ftrace_convert_nops(struct module *mod,  	return 0;  } -void ftrace_init_module(struct module *mod, -			unsigned long *start, unsigned long *end) +#ifdef CONFIG_MODULES +void ftrace_release(void *start, void *end) +{ +	struct dyn_ftrace *rec; +	struct ftrace_page *pg; +	unsigned long s = (unsigned long)start; +	unsigned long e = (unsigned long)end; + +	if (ftrace_disabled || !start || start == end) +		return; + +	mutex_lock(&ftrace_lock); +	do_for_each_ftrace_rec(pg, rec) { +		if ((rec->ip >= s) && (rec->ip < e)) { +			/* +			 * rec->ip is changed in ftrace_free_rec() +			 * It should not between s and e if record was freed. 
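set_ftrace_early_filter() above walks the ftrace_filter=/ftrace_notrace= boot strings with strsep(), handing each comma-separated name to ftrace_set_regex(). Here is a userspace sketch of that tokenizing only; the function list is invented and the printf stands in for the ftrace_set_regex() call.

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
        char cmdline[] = "sys_open,sys_read,schedule";  /* pretend ftrace_filter= value */
        char *buf = cmdline;
        char *func;

        while (buf) {
                func = strsep(&buf, ",");
                printf("would pass \"%s\" to ftrace_set_regex()\n", func);
        }
        return 0;
}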
+			 */ +			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); +			ftrace_free_rec(rec); +		} +	} while_for_each_ftrace_rec(); +	mutex_unlock(&ftrace_lock); +} + +static void ftrace_init_module(struct module *mod, +			       unsigned long *start, unsigned long *end)  {  	if (ftrace_disabled || start == end)  		return;  	ftrace_convert_nops(mod, start, end);  } +static int ftrace_module_notify(struct notifier_block *self, +				unsigned long val, void *data) +{ +	struct module *mod = data; + +	switch (val) { +	case MODULE_STATE_COMING: +		ftrace_init_module(mod, mod->ftrace_callsites, +				   mod->ftrace_callsites + +				   mod->num_ftrace_callsites); +		break; +	case MODULE_STATE_GOING: +		ftrace_release(mod->ftrace_callsites, +			       mod->ftrace_callsites + +			       mod->num_ftrace_callsites); +		break; +	} + +	return 0; +} +#else +static int ftrace_module_notify(struct notifier_block *self, +				unsigned long val, void *data) +{ +	return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block ftrace_module_nb = { +	.notifier_call = ftrace_module_notify, +	.priority = 0, +}; +  extern unsigned long __start_mcount_loc[];  extern unsigned long __stop_mcount_loc[]; @@ -2236,6 +2869,12 @@ void __init ftrace_init(void)  				  __start_mcount_loc,  				  __stop_mcount_loc); +	ret = register_module_notifier(&ftrace_module_nb); +	if (ret) +		pr_warning("Failed to register trace ftrace module notifier\n"); + +	set_ftrace_early_filters(); +  	return;   failed:  	ftrace_disabled = 1; @@ -2417,7 +3056,6 @@ static const struct file_operations ftrace_pid_fops = {  static __init int ftrace_init_debugfs(void)  {  	struct dentry *d_tracer; -	struct dentry *entry;  	d_tracer = tracing_init_dentry();  	if (!d_tracer) @@ -2425,11 +3063,11 @@ static __init int ftrace_init_debugfs(void)  	ftrace_init_dyn_debugfs(d_tracer); -	entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, -				    NULL, &ftrace_pid_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'set_ftrace_pid' entry\n"); +	trace_create_file("set_ftrace_pid", 0644, d_tracer, +			    NULL, &ftrace_pid_fops); + +	ftrace_profile_debugfs(d_tracer); +  	return 0;  }  fs_initcall(ftrace_init_debugfs); @@ -2538,7 +3176,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,  #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static atomic_t ftrace_graph_active; +static int ftrace_graph_active;  static struct notifier_block ftrace_suspend_notifier;  int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) @@ -2580,12 +3218,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)  		}  		if (t->ret_stack == NULL) { -			t->curr_ret_stack = -1; -			/* Make sure IRQs see the -1 first: */ -			barrier(); -			t->ret_stack = ret_stack_list[start++];  			atomic_set(&t->tracing_graph_pause, 0);  			atomic_set(&t->trace_overrun, 0); +			t->curr_ret_stack = -1; +			/* Make sure the tasks see the -1 first: */ +			smp_wmb(); +			t->ret_stack = ret_stack_list[start++];  		}  	} while_each_thread(g, t); @@ -2643,8 +3281,10 @@ static int start_graph_tracing(void)  		return -ENOMEM;  	/* The cpu_boot init_task->ret_stack will never be freed */ -	for_each_online_cpu(cpu) -		ftrace_graph_init_task(idle_task(cpu)); +	for_each_online_cpu(cpu) { +		if (!idle_task(cpu)->ret_stack) +			ftrace_graph_init_task(idle_task(cpu)); +	}  	do {  		ret = alloc_retstack_tasklist(ret_stack_list); @@ -2690,7 +3330,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,  	mutex_lock(&ftrace_lock);  	/* we currently allow only one tracer 
registered at a time */ -	if (atomic_read(&ftrace_graph_active)) { +	if (ftrace_graph_active) {  		ret = -EBUSY;  		goto out;  	} @@ -2698,10 +3338,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,  	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;  	register_pm_notifier(&ftrace_suspend_notifier); -	atomic_inc(&ftrace_graph_active); +	ftrace_graph_active++;  	ret = start_graph_tracing();  	if (ret) { -		atomic_dec(&ftrace_graph_active); +		ftrace_graph_active--;  		goto out;  	} @@ -2719,10 +3359,10 @@ void unregister_ftrace_graph(void)  {  	mutex_lock(&ftrace_lock); -	if (!unlikely(atomic_read(&ftrace_graph_active))) +	if (unlikely(!ftrace_graph_active))  		goto out; -	atomic_dec(&ftrace_graph_active); +	ftrace_graph_active--;  	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);  	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;  	ftrace_graph_entry = ftrace_graph_entry_stub; @@ -2736,18 +3376,25 @@ void unregister_ftrace_graph(void)  /* Allocate a return stack for newly created task */  void ftrace_graph_init_task(struct task_struct *t)  { -	if (atomic_read(&ftrace_graph_active)) { -		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH +	/* Make sure we do not use the parent ret_stack */ +	t->ret_stack = NULL; + +	if (ftrace_graph_active) { +		struct ftrace_ret_stack *ret_stack; + +		ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH  				* sizeof(struct ftrace_ret_stack),  				GFP_KERNEL); -		if (!t->ret_stack) +		if (!ret_stack)  			return;  		t->curr_ret_stack = -1;  		atomic_set(&t->tracing_graph_pause, 0);  		atomic_set(&t->trace_overrun, 0);  		t->ftrace_timestamp = 0; -	} else -		t->ret_stack = NULL; +		/* make curr_ret_stack visable before we add the ret_stack */ +		smp_wmb(); +		t->ret_stack = ret_stack; +	}  }  void ftrace_graph_exit_task(struct task_struct *t) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 5011f4d91e3..86cdf671d7e 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -12,7 +12,7 @@  #include <linux/dcache.h>  #include <linux/fs.h> -#include <trace/kmemtrace.h> +#include <linux/kmemtrace.h>  #include "trace_output.h"  #include "trace.h" @@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,  				   gfp_t gfp_flags,  				   int node)  { +	struct ftrace_event_call *call = &event_kmem_alloc;  	struct trace_array *tr = kmemtrace_array;  	struct kmemtrace_alloc_entry *entry;  	struct ring_buffer_event *event; @@ -62,7 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,  	entry->gfp_flags	= gfp_flags;  	entry->node		= node; -	ring_buffer_unlock_commit(tr->buffer, event); +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		ring_buffer_unlock_commit(tr->buffer, event);  	trace_wake_up();  } @@ -71,6 +73,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,  				  unsigned long call_site,  				  const void *ptr)  { +	struct ftrace_event_call *call = &event_kmem_free;  	struct trace_array *tr = kmemtrace_array;  	struct kmemtrace_free_entry *entry;  	struct ring_buffer_event *event; @@ -86,7 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,  	entry->call_site	= call_site;  	entry->ptr		= ptr; -	ring_buffer_unlock_commit(tr->buffer, event); +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		ring_buffer_unlock_commit(tr->buffer, event);  	trace_wake_up();  } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 960cbf44c84..2e642b2b725 100644 --- 
a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -22,6 +22,28 @@  #include "trace.h"  /* + * The ring buffer header is special. We must manually up keep it. + */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ +	int ret; + +	ret = trace_seq_printf(s, "# compressed entry header\n"); +	ret = trace_seq_printf(s, "\ttype_len    :    5 bits\n"); +	ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n"); +	ret = trace_seq_printf(s, "\tarray       :   32 bits\n"); +	ret = trace_seq_printf(s, "\n"); +	ret = trace_seq_printf(s, "\tpadding     : type == %d\n", +			       RINGBUF_TYPE_PADDING); +	ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", +			       RINGBUF_TYPE_TIME_EXTEND); +	ret = trace_seq_printf(s, "\tdata max type_len  == %d\n", +			       RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + +	return ret; +} + +/*   * The ring buffer is made up of a list of pages. A separate list of pages is   * allocated for each CPU. A writer may only write to a buffer that is   * associated with the CPU it is currently executing on.  A reader may read @@ -182,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on);  #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))  #define RB_ALIGNMENT		4U -#define RB_MAX_SMALL_DATA	28 +#define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX  enum {  	RB_LEN_TIME_EXTEND = 8, @@ -191,48 +216,28 @@ enum {  static inline int rb_null_event(struct ring_buffer_event *event)  { -	return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; +	return event->type_len == RINGBUF_TYPE_PADDING +			&& event->time_delta == 0;  }  static inline int rb_discarded_event(struct ring_buffer_event *event)  { -	return event->type == RINGBUF_TYPE_PADDING && event->time_delta; +	return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;  }  static void rb_event_set_padding(struct ring_buffer_event *event)  { -	event->type = RINGBUF_TYPE_PADDING; +	event->type_len = RINGBUF_TYPE_PADDING;  	event->time_delta = 0;  } -/** - * ring_buffer_event_discard - discard an event in the ring buffer - * @buffer: the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. 
- */ -void ring_buffer_event_discard(struct ring_buffer_event *event) -{ -	event->type = RINGBUF_TYPE_PADDING; -	/* time delta must be non zero */ -	if (!event->time_delta) -		event->time_delta = 1; -} -  static unsigned  rb_event_data_length(struct ring_buffer_event *event)  {  	unsigned length; -	if (event->len) -		length = event->len * RB_ALIGNMENT; +	if (event->type_len) +		length = event->type_len * RB_ALIGNMENT;  	else  		length = event->array[0];  	return length + RB_EVNT_HDR_SIZE; @@ -242,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event)  static unsigned  rb_event_length(struct ring_buffer_event *event)  { -	switch (event->type) { +	switch (event->type_len) {  	case RINGBUF_TYPE_PADDING:  		if (rb_null_event(event))  			/* undefined */  			return -1; -		return rb_event_data_length(event); +		return  event->array[0] + RB_EVNT_HDR_SIZE;  	case RINGBUF_TYPE_TIME_EXTEND:  		return RB_LEN_TIME_EXTEND; @@ -271,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event)  unsigned ring_buffer_event_length(struct ring_buffer_event *event)  {  	unsigned length = rb_event_length(event); -	if (event->type != RINGBUF_TYPE_DATA) +	if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)  		return length;  	length -= RB_EVNT_HDR_SIZE;  	if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) @@ -284,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);  static void *  rb_event_data(struct ring_buffer_event *event)  { -	BUG_ON(event->type != RINGBUF_TYPE_DATA); +	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);  	/* If length is in len field, then array[0] has the data */ -	if (event->len) +	if (event->type_len)  		return (void *)&event->array[0];  	/* Otherwise length is in array[0] and array[1] has the data */  	return (void *)&event->array[1]; @@ -316,9 +321,10 @@ struct buffer_data_page {  };  struct buffer_page { +	struct list_head list;		/* list of buffer pages */  	local_t		 write;		/* index for next write */  	unsigned	 read;		/* index for next read */ -	struct list_head list;		/* list of free pages */ +	local_t		 entries;	/* entries on this page */  	struct buffer_data_page *page;	/* Actual data page */  }; @@ -361,6 +367,34 @@ static inline int test_time_stamp(u64 delta)  #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ +#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) + +/* Max number of timestamps that can fit on a page */ +#define RB_TIMESTAMPS_PER_PAGE	(BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) + +int ring_buffer_print_page_header(struct trace_seq *s) +{ +	struct buffer_data_page field; +	int ret; + +	ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" +			       "offset:0;\tsize:%u;\n", +			       (unsigned int)sizeof(field.time_stamp)); + +	ret = trace_seq_printf(s, "\tfield: local_t commit;\t" +			       "offset:%u;\tsize:%u;\n", +			       (unsigned int)offsetof(typeof(field), commit), +			       (unsigned int)sizeof(field.commit)); + +	ret = trace_seq_printf(s, "\tfield: char data;\t" +			       "offset:%u;\tsize:%u;\n", +			       (unsigned int)offsetof(typeof(field), data), +			       (unsigned int)BUF_PAGE_SIZE); + +	return ret; +} +  /*   * head_page == tail_page && head == tail then buffer is empty.   
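The header rework in these hunks replaces the old type/len pair with a single 5-bit type_len: values above RINGBUF_TYPE_DATA_TYPE_LEN_MAX name the special event types, small data events store their payload length in 4-byte units, and anything larger keeps type_len at zero with the byte length in array[0]. The following standalone sketch mirrors that encoding; the struct is a simplification rather than the real bit layout, and the event header size is ignored.

#include <stdio.h>

#define RB_ALIGNMENT      4U
#define TYPE_LEN_MAX      28                        /* RINGBUF_TYPE_DATA_TYPE_LEN_MAX */
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * TYPE_LEN_MAX)

struct sketch_event {
        unsigned type_len;      /* 5 bits in the real header */
        unsigned array0;        /* first 32-bit word after the header */
};

static void encode(struct sketch_event *e, unsigned length)
{
        if (length > RB_MAX_SMALL_DATA) {
                e->type_len = 0;                    /* length lives in array[0] */
                e->array0 = length;
        } else {
                e->type_len = (length + RB_ALIGNMENT - 1) / RB_ALIGNMENT;
        }
}

static unsigned data_length(const struct sketch_event *e)
{
        return e->type_len ? e->type_len * RB_ALIGNMENT : e->array0;
}

int main(void)
{
        struct sketch_event small = { 0, 0 }, big = { 0, 0 };

        encode(&small, 24);     /* fits: type_len = 6 (6 * 4 = 24 bytes) */
        encode(&big, 200);      /* too big: type_len = 0, array[0] = 200 */

        printf("small: type_len=%u payload=%u\n", small.type_len, data_length(&small));
        printf("big:   type_len=%u payload=%u\n", big.type_len, data_length(&big));
        return 0;
}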
*/ @@ -375,8 +409,11 @@ struct ring_buffer_per_cpu {  	struct buffer_page		*tail_page;	/* write to tail */  	struct buffer_page		*commit_page;	/* committed pages */  	struct buffer_page		*reader_page; +	unsigned long			nmi_dropped; +	unsigned long			commit_overrun;  	unsigned long			overrun; -	unsigned long			entries; +	unsigned long			read; +	local_t				entries;  	u64				write_stamp;  	u64				read_stamp;  	atomic_t			record_disabled; @@ -389,6 +426,8 @@ struct ring_buffer {  	atomic_t			record_disabled;  	cpumask_var_t			cpumask; +	struct lock_class_key		*reader_lock_key; +  	struct mutex			mutex;  	struct ring_buffer_per_cpu	**buffers; @@ -420,13 +459,18 @@ struct ring_buffer_iter {  /* Up this if you want to test the TIME_EXTENTS and normalization */  #define DEBUG_SHIFT 0 +static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +{ +	/* shift to debug/test normalization and TIME_EXTENTS */ +	return buffer->clock() << DEBUG_SHIFT; +} +  u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)  {  	u64 time;  	preempt_disable_notrace(); -	/* shift to debug/test normalization and TIME_EXTENTS */ -	time = buffer->clock() << DEBUG_SHIFT; +	time = rb_time_stamp(buffer, cpu);  	preempt_enable_no_resched_notrace();  	return time; @@ -523,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)  	cpu_buffer->cpu = cpu;  	cpu_buffer->buffer = buffer;  	spin_lock_init(&cpu_buffer->reader_lock); +	lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);  	cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;  	INIT_LIST_HEAD(&cpu_buffer->pages); @@ -593,7 +638,8 @@ static int rb_cpu_notify(struct notifier_block *self,   * when the buffer wraps. If this flag is not set, the buffer will   * drop data when the tail hits the head.   */ -struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, +					struct lock_class_key *key)  {  	struct ring_buffer *buffer;  	int bsize; @@ -616,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)  	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);  	buffer->flags = flags;  	buffer->clock = trace_clock_local; +	buffer->reader_lock_key = key;  	/* need at least two pages */  	if (buffer->pages == 1) @@ -673,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)  	kfree(buffer);  	return NULL;  } -EXPORT_SYMBOL_GPL(ring_buffer_alloc); +EXPORT_SYMBOL_GPL(__ring_buffer_alloc);  /**   * ring_buffer_free - free a ring buffer. @@ -947,31 +994,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)  	return rb_page_commit(cpu_buffer->head_page);  } -/* - * When the tail hits the head and the buffer is in overwrite mode, - * the head jumps to the next page and all content on the previous - * page is discarded. But before doing so, we update the overrun - * variable of the buffer. 
- */ -static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) -{ -	struct ring_buffer_event *event; -	unsigned long head; - -	for (head = 0; head < rb_head_size(cpu_buffer); -	     head += rb_event_length(event)) { - -		event = __rb_page_index(cpu_buffer->head_page, head); -		if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) -			return; -		/* Only count data entries */ -		if (event->type != RINGBUF_TYPE_DATA) -			continue; -		cpu_buffer->overrun++; -		cpu_buffer->entries--; -	} -} -  static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,  			       struct buffer_page **bpage)  { @@ -991,7 +1013,7 @@ rb_event_index(struct ring_buffer_event *event)  	return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);  } -static int +static inline int  rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,  	     struct ring_buffer_event *event)  { @@ -1110,28 +1132,21 @@ static void  rb_update_event(struct ring_buffer_event *event,  			 unsigned type, unsigned length)  { -	event->type = type; +	event->type_len = type;  	switch (type) {  	case RINGBUF_TYPE_PADDING: -		break; -  	case RINGBUF_TYPE_TIME_EXTEND: -		event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); -		break; -  	case RINGBUF_TYPE_TIME_STAMP: -		event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);  		break; -	case RINGBUF_TYPE_DATA: +	case 0:  		length -= RB_EVNT_HDR_SIZE; -		if (length > RB_MAX_SMALL_DATA) { -			event->len = 0; +		if (length > RB_MAX_SMALL_DATA)  			event->array[0] = length; -		} else -			event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); +		else +			event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);  		break;  	default:  		BUG(); @@ -1155,131 +1170,156 @@ static unsigned rb_calculate_event_length(unsigned length)  	return length;  } +  static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, -		  unsigned type, unsigned long length, u64 *ts) +rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, +	     unsigned long length, unsigned long tail, +	     struct buffer_page *commit_page, +	     struct buffer_page *tail_page, u64 *ts)  { -	struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; -	unsigned long tail, write; +	struct buffer_page *next_page, *head_page, *reader_page;  	struct ring_buffer *buffer = cpu_buffer->buffer;  	struct ring_buffer_event *event; -	unsigned long flags;  	bool lock_taken = false; +	unsigned long flags; -	commit_page = cpu_buffer->commit_page; -	/* we just need to protect against interrupts */ -	barrier(); -	tail_page = cpu_buffer->tail_page; -	write = local_add_return(length, &tail_page->write); -	tail = write - length; +	next_page = tail_page; -	/* See if we shot pass the end of this buffer page */ -	if (write > BUF_PAGE_SIZE) { -		struct buffer_page *next_page = tail_page; +	local_irq_save(flags); +	/* +	 * Since the write to the buffer is still not +	 * fully lockless, we must be careful with NMIs. +	 * The locks in the writers are taken when a write +	 * crosses to a new page. The locks protect against +	 * races with the readers (this will soon be fixed +	 * with a lockless solution). +	 * +	 * Because we can not protect against NMIs, and we +	 * want to keep traces reentrant, we need to manage +	 * what happens when we are in an NMI. +	 * +	 * NMIs can happen after we take the lock. +	 * If we are in an NMI, only take the lock +	 * if it is not already taken. Otherwise +	 * simply fail. 
+	 */ +	if (unlikely(in_nmi())) { +		if (!__raw_spin_trylock(&cpu_buffer->lock)) { +			cpu_buffer->nmi_dropped++; +			goto out_reset; +		} +	} else +		__raw_spin_lock(&cpu_buffer->lock); -		local_irq_save(flags); -		/* -		 * Since the write to the buffer is still not -		 * fully lockless, we must be careful with NMIs. -		 * The locks in the writers are taken when a write -		 * crosses to a new page. The locks protect against -		 * races with the readers (this will soon be fixed -		 * with a lockless solution). -		 * -		 * Because we can not protect against NMIs, and we -		 * want to keep traces reentrant, we need to manage -		 * what happens when we are in an NMI. -		 * -		 * NMIs can happen after we take the lock. -		 * If we are in an NMI, only take the lock -		 * if it is not already taken. Otherwise -		 * simply fail. -		 */ -		if (unlikely(in_nmi())) { -			if (!__raw_spin_trylock(&cpu_buffer->lock)) -				goto out_reset; -		} else -			__raw_spin_lock(&cpu_buffer->lock); +	lock_taken = true; -		lock_taken = true; +	rb_inc_page(cpu_buffer, &next_page); -		rb_inc_page(cpu_buffer, &next_page); +	head_page = cpu_buffer->head_page; +	reader_page = cpu_buffer->reader_page; -		head_page = cpu_buffer->head_page; -		reader_page = cpu_buffer->reader_page; +	/* we grabbed the lock before incrementing */ +	if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) +		goto out_reset; -		/* we grabbed the lock before incrementing */ -		if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) -			goto out_reset; +	/* +	 * If for some reason, we had an interrupt storm that made +	 * it all the way around the buffer, bail, and warn +	 * about it. +	 */ +	if (unlikely(next_page == commit_page)) { +		cpu_buffer->commit_overrun++; +		goto out_reset; +	} -		/* -		 * If for some reason, we had an interrupt storm that made -		 * it all the way around the buffer, bail, and warn -		 * about it. -		 */ -		if (unlikely(next_page == commit_page)) { -			WARN_ON_ONCE(1); +	if (next_page == head_page) { +		if (!(buffer->flags & RB_FL_OVERWRITE))  			goto out_reset; + +		/* tail_page has not moved yet? */ +		if (tail_page == cpu_buffer->tail_page) { +			/* count overflows */ +			cpu_buffer->overrun += +				local_read(&head_page->entries); + +			rb_inc_page(cpu_buffer, &head_page); +			cpu_buffer->head_page = head_page; +			cpu_buffer->head_page->read = 0;  		} +	} -		if (next_page == head_page) { -			if (!(buffer->flags & RB_FL_OVERWRITE)) -				goto out_reset; +	/* +	 * If the tail page is still the same as what we think +	 * it is, then it is up to us to update the tail +	 * pointer. +	 */ +	if (tail_page == cpu_buffer->tail_page) { +		local_set(&next_page->write, 0); +		local_set(&next_page->entries, 0); +		local_set(&next_page->page->commit, 0); +		cpu_buffer->tail_page = next_page; -			/* tail_page has not moved yet? */ -			if (tail_page == cpu_buffer->tail_page) { -				/* count overflows */ -				rb_update_overflow(cpu_buffer); +		/* reread the time stamp */ +		*ts = rb_time_stamp(buffer, cpu_buffer->cpu); +		cpu_buffer->tail_page->page->time_stamp = *ts; +	} -				rb_inc_page(cpu_buffer, &head_page); -				cpu_buffer->head_page = head_page; -				cpu_buffer->head_page->read = 0; -			} -		} +	/* +	 * The actual tail page has moved forward. 
+	 */ +	if (tail < BUF_PAGE_SIZE) { +		/* Mark the rest of the page with padding */ +		event = __rb_page_index(tail_page, tail); +		rb_event_set_padding(event); +	} -		/* -		 * If the tail page is still the same as what we think -		 * it is, then it is up to us to update the tail -		 * pointer. -		 */ -		if (tail_page == cpu_buffer->tail_page) { -			local_set(&next_page->write, 0); -			local_set(&next_page->page->commit, 0); -			cpu_buffer->tail_page = next_page; +	/* Set the write back to the previous setting */ +	local_sub(length, &tail_page->write); -			/* reread the time stamp */ -			*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); -			cpu_buffer->tail_page->page->time_stamp = *ts; -		} +	/* +	 * If this was a commit entry that failed, +	 * increment that too +	 */ +	if (tail_page == cpu_buffer->commit_page && +	    tail == rb_commit_index(cpu_buffer)) { +		rb_set_commit_to_write(cpu_buffer); +	} -		/* -		 * The actual tail page has moved forward. -		 */ -		if (tail < BUF_PAGE_SIZE) { -			/* Mark the rest of the page with padding */ -			event = __rb_page_index(tail_page, tail); -			rb_event_set_padding(event); -		} +	__raw_spin_unlock(&cpu_buffer->lock); +	local_irq_restore(flags); -		if (tail <= BUF_PAGE_SIZE) -			/* Set the write back to the previous setting */ -			local_set(&tail_page->write, tail); +	/* fail and let the caller try again */ +	return ERR_PTR(-EAGAIN); -		/* -		 * If this was a commit entry that failed, -		 * increment that too -		 */ -		if (tail_page == cpu_buffer->commit_page && -		    tail == rb_commit_index(cpu_buffer)) { -			rb_set_commit_to_write(cpu_buffer); -		} + out_reset: +	/* reset write */ +	local_sub(length, &tail_page->write); +	if (likely(lock_taken))  		__raw_spin_unlock(&cpu_buffer->lock); -		local_irq_restore(flags); +	local_irq_restore(flags); +	return NULL; +} -		/* fail and let the caller try again */ -		return ERR_PTR(-EAGAIN); -	} +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, +		  unsigned type, unsigned long length, u64 *ts) +{ +	struct buffer_page *tail_page, *commit_page; +	struct ring_buffer_event *event; +	unsigned long tail, write; + +	commit_page = cpu_buffer->commit_page; +	/* we just need to protect against interrupts */ +	barrier(); +	tail_page = cpu_buffer->tail_page; +	write = local_add_return(length, &tail_page->write); +	tail = write - length; + +	/* See if we shot pass the end of this buffer page */ +	if (write > BUF_PAGE_SIZE) +		return rb_move_tail(cpu_buffer, length, tail, +				    commit_page, tail_page, ts);  	/* We reserved something on the buffer */ @@ -1289,6 +1329,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,  	event = __rb_page_index(tail_page, tail);  	rb_update_event(event, type, length); +	/* The passed in type is zero for DATA */ +	if (likely(!type)) +		local_inc(&tail_page->entries); +  	/*  	 * If this is a commit and the tail is zero, then update  	 * this page's time stamp. 
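After this refactor the fast path left in __rb_reserve_next() is a single atomic add on the tail page's write index; only a reservation that runs past BUF_PAGE_SIZE drops into the new rb_move_tail() slow path. Below is a userspace sketch of that fast path under stated simplifications: C11 atomics stand in for local_add_return(), and the slow path is reduced to returning -1.

#include <stdatomic.h>
#include <stdio.h>

#define BUF_PAGE_SIZE 4096UL

static atomic_ulong page_write;         /* next free offset on the tail page */

/* returns the offset of the reserved space, or -1 if the page is full */
static long reserve(unsigned long length)
{
        unsigned long write = atomic_fetch_add(&page_write, length) + length;
        unsigned long tail = write - length;

        if (write > BUF_PAGE_SIZE)
                return -1;      /* real code resets the index, pads, moves the tail */
        return (long)tail;
}

int main(void)
{
        printf("first reservation at offset:  %ld\n", reserve(4000));  /* 0 */
        printf("second reservation at offset: %ld\n", reserve(200));   /* -1 */
        return 0;
}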
@@ -1297,16 +1341,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,  		cpu_buffer->commit_page->page->time_stamp = *ts;  	return event; +} - out_reset: -	/* reset write */ -	if (tail <= BUF_PAGE_SIZE) -		local_set(&tail_page->write, tail); +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, +		  struct ring_buffer_event *event) +{ +	unsigned long new_index, old_index; +	struct buffer_page *bpage; +	unsigned long index; +	unsigned long addr; -	if (likely(lock_taken)) -		__raw_spin_unlock(&cpu_buffer->lock); -	local_irq_restore(flags); -	return NULL; +	new_index = rb_event_index(event); +	old_index = new_index + rb_event_length(event); +	addr = (unsigned long)event; +	addr &= PAGE_MASK; + +	bpage = cpu_buffer->tail_page; + +	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { +		/* +		 * This is on the tail page. It is possible that +		 * a write could come in and move the tail page +		 * and write to the next page. That is fine +		 * because we just shorten what is on this page. +		 */ +		index = local_cmpxchg(&bpage->write, old_index, new_index); +		if (index == old_index) +			return 1; +	} + +	/* could not discard */ +	return 0;  }  static int @@ -1351,16 +1417,23 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,  			event->array[0] = *delta >> TS_SHIFT;  		} else {  			cpu_buffer->commit_page->page->time_stamp = *ts; -			event->time_delta = 0; -			event->array[0] = 0; +			/* try to discard, since we do not need this */ +			if (!rb_try_to_discard(cpu_buffer, event)) { +				/* nope, just zero it */ +				event->time_delta = 0; +				event->array[0] = 0; +			}  		}  		cpu_buffer->write_stamp = *ts;  		/* let the caller know this was the commit */  		ret = 1;  	} else { -		/* Darn, this is just wasted space */ -		event->time_delta = 0; -		event->array[0] = 0; +		/* Try to discard the event */ +		if (!rb_try_to_discard(cpu_buffer, event)) { +			/* Darn, this is just wasted space */ +			event->time_delta = 0; +			event->array[0] = 0; +		}  		ret = 0;  	} @@ -1371,13 +1444,14 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,  static struct ring_buffer_event *  rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, -		      unsigned type, unsigned long length) +		      unsigned long length)  {  	struct ring_buffer_event *event; -	u64 ts, delta; +	u64 ts, delta = 0;  	int commit = 0;  	int nr_loops = 0; +	length = rb_calculate_event_length(length);   again:  	/*  	 * We allow for interrupts to reenter here and do a trace. @@ -1391,7 +1465,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,  	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))  		return NULL; -	ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); +	ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);  	/*  	 * Only the first commit can update the timestamp. @@ -1401,23 +1475,24 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,  	 * also be made. But only the entry that did the actual  	 * commit will be something other than zero.  	 
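rb_try_to_discard(), added above, reclaims the space of a just-reserved event by cmpxchg-ing the tail page's write index from the event's end back to its start; if a later writer already advanced the index, the cmpxchg fails and the event must be padded out instead. A userspace sketch of that roll-back follows, with C11 atomics standing in for local_cmpxchg() and made-up page offsets.

#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong page_write;         /* next free offset on the tail page */

/* give back [event_start, event_end) only if it is still the last reservation */
static int try_to_discard(unsigned long event_start, unsigned long event_end)
{
        unsigned long expected = event_end;

        /* succeeds only if nobody reserved past our event in the meantime */
        return atomic_compare_exchange_strong(&page_write, &expected, event_start);
}

int main(void)
{
        atomic_store(&page_write, 64);          /* pretend our event ends at 64 */
        printf("discard: %d\n", try_to_discard(32, 64));  /* 1: rolled back to 32 */

        atomic_store(&page_write, 128);         /* someone wrote after us */
        printf("discard: %d\n", try_to_discard(32, 64));  /* 0: must pad instead */
        return 0;
}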
*/ -	if (cpu_buffer->tail_page == cpu_buffer->commit_page && -	    rb_page_write(cpu_buffer->tail_page) == -	    rb_commit_index(cpu_buffer)) { +	if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && +		   rb_page_write(cpu_buffer->tail_page) == +		   rb_commit_index(cpu_buffer))) { +		u64 diff; -		delta = ts - cpu_buffer->write_stamp; +		diff = ts - cpu_buffer->write_stamp; -		/* make sure this delta is calculated here */ +		/* make sure this diff is calculated here */  		barrier();  		/* Did the write stamp get updated already? */  		if (unlikely(ts < cpu_buffer->write_stamp)) -			delta = 0; +			goto get_event; -		if (test_time_stamp(delta)) { +		delta = diff; +		if (unlikely(test_time_stamp(delta))) {  			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); -  			if (commit == -EBUSY)  				return NULL; @@ -1426,12 +1501,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,  			RB_WARN_ON(cpu_buffer, commit < 0);  		} -	} else -		/* Non commits have zero deltas */ -		delta = 0; +	} -	event = __rb_reserve_next(cpu_buffer, type, length, &ts); -	if (PTR_ERR(event) == -EAGAIN) + get_event: +	event = __rb_reserve_next(cpu_buffer, 0, length, &ts); +	if (unlikely(PTR_ERR(event) == -EAGAIN))  		goto again;  	if (!event) { @@ -1448,7 +1522,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,  	 * If the timestamp was commited, make the commit our entry  	 * now so that we will update it when needed.  	 */ -	if (commit) +	if (unlikely(commit))  		rb_set_commit_event(cpu_buffer, event);  	else if (!rb_is_commit(cpu_buffer, event))  		delta = 0; @@ -1458,6 +1532,36 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,  	return event;  } +#define TRACE_RECURSIVE_DEPTH 16 + +static int trace_recursive_lock(void) +{ +	current->trace_recursion++; + +	if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) +		return 0; + +	/* Disable all tracing before we do anything else */ +	tracing_off_permanent(); + +	printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" +		    "HC[%lu]:SC[%lu]:NMI[%lu]\n", +		    current->trace_recursion, +		    hardirq_count() >> HARDIRQ_SHIFT, +		    softirq_count() >> SOFTIRQ_SHIFT, +		    in_nmi()); + +	WARN_ON_ONCE(1); +	return -1; +} + +static void trace_recursive_unlock(void) +{ +	WARN_ON_ONCE(!current->trace_recursion); + +	current->trace_recursion--; +} +  static DEFINE_PER_CPU(int, rb_need_resched);  /** @@ -1491,6 +1595,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)  	/* If we are tracing schedule, we don't want to recurse */  	resched = ftrace_preempt_disable(); +	if (trace_recursive_lock()) +		goto out_nocheck; +  	cpu = raw_smp_processor_id();  	if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -1501,11 +1608,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)  	if (atomic_read(&cpu_buffer->record_disabled))  		goto out; -	length = rb_calculate_event_length(length); -	if (length > BUF_PAGE_SIZE) +	if (length > BUF_MAX_DATA_SIZE)  		goto out; -	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); +	event = rb_reserve_next_event(cpu_buffer, length);  	if (!event)  		goto out; @@ -1520,6 +1626,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)  	return event;   out: +	trace_recursive_unlock(); + + out_nocheck:  	ftrace_preempt_enable(resched);  	return NULL;  } @@ -1528,7 +1637,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);  static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,  		      struct 
ring_buffer_event *event)  { -	cpu_buffer->entries++; +	local_inc(&cpu_buffer->entries);  	/* Only process further if we own the commit */  	if (!rb_is_commit(cpu_buffer, event)) @@ -1558,6 +1667,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,  	rb_commit(cpu_buffer, event); +	trace_recursive_unlock(); +  	/*  	 * Only the last preempt count needs to restore preemption.  	 */ @@ -1570,6 +1681,99 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,  }  EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +static inline void rb_event_discard(struct ring_buffer_event *event) +{ +	/* array[0] holds the actual length for the discarded event */ +	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; +	event->type_len = RINGBUF_TYPE_PADDING; +	/* time delta must be non zero */ +	if (!event->time_delta) +		event->time_delta = 1; +} + +/** + * ring_buffer_event_discard - discard any event in the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ +	rb_event_discard(event); +} +EXPORT_SYMBOL_GPL(ring_buffer_event_discard); + +/** + * ring_buffer_commit_discard - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non committed event to discard + * + * This is similar to ring_buffer_event_discard but must only be + * performed on an event that has not been committed yet. The difference + * is that this will also try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, +				struct ring_buffer_event *event) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	int cpu; + +	/* The event is discarded regardless */ +	rb_event_discard(event); + +	/* +	 * This must only be called if the event has not been +	 * committed yet. Thus we can assume that preemption +	 * is still disabled. +	 */ +	RB_WARN_ON(buffer, preemptible()); + +	cpu = smp_processor_id(); +	cpu_buffer = buffer->buffers[cpu]; + +	if (!rb_try_to_discard(cpu_buffer, event)) +		goto out; + +	/* +	 * The commit is still visible by the reader, so we +	 * must increment entries. +	 */ +	local_inc(&cpu_buffer->entries); + out: +	/* +	 * If a write came in and pushed the tail page +	 * we still need to update the commit pointer +	 * if we were the commit. +	 */ +	if (rb_is_commit(cpu_buffer, event)) +		rb_set_commit_to_write(cpu_buffer); + +	trace_recursive_unlock(); + +	/* +	 * Only the last preempt count needs to restore preemption. +	 */ +	if (preempt_count() == 1) +		ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); +	else +		preempt_enable_no_resched_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); +  /**   * ring_buffer_write - write data to the buffer without reserving   * @buffer: The ring buffer to write to. 
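The new trace_recursive_lock()/trace_recursive_unlock() pair brackets every reserve/commit so that nested tracing from irq, softirq and NMI context cannot recurse without bound. Here is a userspace sketch of that guard; a thread-local counter stands in for current->trace_recursion, and where the kernel would permanently disable tracing and warn, the sketch simply returns an error.

#include <stdio.h>

#define TRACE_RECURSIVE_DEPTH 16

static __thread unsigned long trace_recursion;  /* stands in for current->trace_recursion */

static int trace_recursive_lock(void)
{
        trace_recursion++;
        if (trace_recursion < TRACE_RECURSIVE_DEPTH)
                return 0;
        return -1;      /* too deep: the kernel would tracing_off_permanent() here */
}

static void trace_recursive_unlock(void)
{
        trace_recursion--;
}

int main(void)
{
        int depth = 0;

        while (!trace_recursive_lock())         /* ever deeper nested "reserves" */
                depth++;

        printf("nesting allowed before the guard trips: %d\n", depth);

        while (depth--)                         /* matching "commits" unwind it */
                trace_recursive_unlock();
        return 0;
}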
@@ -1589,7 +1793,6 @@ int ring_buffer_write(struct ring_buffer *buffer,  {  	struct ring_buffer_per_cpu *cpu_buffer;  	struct ring_buffer_event *event; -	unsigned long event_length;  	void *body;  	int ret = -EBUSY;  	int cpu, resched; @@ -1612,9 +1815,10 @@ int ring_buffer_write(struct ring_buffer *buffer,  	if (atomic_read(&cpu_buffer->record_disabled))  		goto out; -	event_length = rb_calculate_event_length(length); -	event = rb_reserve_next_event(cpu_buffer, -				      RINGBUF_TYPE_DATA, event_length); +	if (length > BUF_MAX_DATA_SIZE) +		goto out; + +	event = rb_reserve_next_event(cpu_buffer, length);  	if (!event)  		goto out; @@ -1728,7 +1932,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)  		return 0;  	cpu_buffer = buffer->buffers[cpu]; -	ret = cpu_buffer->entries; +	ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) +		- cpu_buffer->read;  	return ret;  } @@ -1755,6 +1960,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)  EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);  /** + * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	unsigned long ret; + +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return 0; + +	cpu_buffer = buffer->buffers[cpu]; +	ret = cpu_buffer->nmi_dropped; + +	return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	unsigned long ret; + +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return 0; + +	cpu_buffer = buffer->buffers[cpu]; +	ret = cpu_buffer->commit_overrun; + +	return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + +/**   * ring_buffer_entries - get the number of entries in a buffer   * @buffer: The ring buffer   * @@ -1770,7 +2016,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)  	/* if you care about this being correct, lock the buffer */  	for_each_buffer_cpu(buffer, cpu) {  		cpu_buffer = buffer->buffers[cpu]; -		entries += cpu_buffer->entries; +		entries += (local_read(&cpu_buffer->entries) - +			    cpu_buffer->overrun) - cpu_buffer->read;  	}  	return entries; @@ -1862,7 +2109,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,  {  	u64 delta; -	switch (event->type) { +	switch (event->type_len) {  	case RINGBUF_TYPE_PADDING:  		return; @@ -1893,7 +2140,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,  {  	u64 delta; -	switch (event->type) { +	switch (event->type_len) {  	case RINGBUF_TYPE_PADDING:  		return; @@ -1966,6 +2213,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)  	cpu_buffer->reader_page->list.prev = reader->list.prev;  	local_set(&cpu_buffer->reader_page->write, 0); +	local_set(&cpu_buffer->reader_page->entries, 0);  	local_set(&cpu_buffer->reader_page->page->commit, 0);  	/* Make the reader page now replace the head */ @@ -2008,8 +2256,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)  	event = rb_reader_event(cpu_buffer); -	if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) -		
cpu_buffer->entries--; +	if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX +			|| rb_discarded_event(event)) +		cpu_buffer->read++;  	rb_update_read_stamp(cpu_buffer, event); @@ -2031,8 +2280,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)  	 * Check if we are at the end of the buffer.  	 */  	if (iter->head >= rb_page_size(iter->head_page)) { -		if (RB_WARN_ON(buffer, -			       iter->head_page == cpu_buffer->commit_page)) +		/* discarded commits can make the page empty */ +		if (iter->head_page == cpu_buffer->commit_page)  			return;  		rb_inc_iter(iter);  		return; @@ -2075,12 +2324,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  	/*  	 * We repeat when a timestamp is encountered. It is possible  	 * to get multiple timestamps from an interrupt entering just -	 * as one timestamp is about to be written. The max times -	 * that this can happen is the number of nested interrupts we -	 * can have.  Nesting 10 deep of interrupts is clearly -	 * an anomaly. +	 * as one timestamp is about to be written, or from discarded +	 * commits. The most that we can have is the number on a single page.  	 */ -	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) +	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))  		return NULL;  	reader = rb_get_reader_page(cpu_buffer); @@ -2089,7 +2336,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  	event = rb_reader_event(cpu_buffer); -	switch (event->type) { +	switch (event->type_len) {  	case RINGBUF_TYPE_PADDING:  		if (rb_null_event(event))  			RB_WARN_ON(cpu_buffer, 1); @@ -2146,14 +2393,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)   again:  	/* -	 * We repeat when a timestamp is encountered. It is possible -	 * to get multiple timestamps from an interrupt entering just -	 * as one timestamp is about to be written. The max times -	 * that this can happen is the number of nested interrupts we -	 * can have. Nesting 10 deep of interrupts is clearly -	 * an anomaly. +	 * We repeat when a timestamp is encountered. +	 * We can get multiple timestamps by nested interrupts or also +	 * if filtering is on (discarding commits). Since discarding +	 * commits can be frequent we can get a lot of timestamps. +	 * But we limit them by not adding timestamps if they begin +	 * at the start of a page.  	 
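With the accounting moved to the writer side in the hunks above, each per-cpu buffer now tracks entries (events committed), overrun (events lost to overwrite) and read (events consumed), and ring_buffer_entries_cpu() reports their difference. A trivial sketch of that formula, with invented counts:

#include <stdio.h>

int main(void)
{
        unsigned long entries = 1000;   /* events ever committed on this cpu */
        unsigned long overrun = 120;    /* overwritten before being read */
        unsigned long read    = 300;    /* consumed by the reader */

        printf("events left in buffer: %lu\n", entries - overrun - read);
        return 0;
}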
*/ -	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) +	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))  		return NULL;  	if (rb_per_cpu_empty(cpu_buffer)) @@ -2161,7 +2408,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)  	event = rb_iter_head_event(iter); -	switch (event->type) { +	switch (event->type_len) {  	case RINGBUF_TYPE_PADDING:  		if (rb_null_event(event)) {  			rb_inc_iter(iter); @@ -2220,7 +2467,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  	event = rb_buffer_peek(buffer, cpu, ts);  	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -	if (event && event->type == RINGBUF_TYPE_PADDING) { +	if (event && event->type_len == RINGBUF_TYPE_PADDING) {  		cpu_relax();  		goto again;  	} @@ -2248,7 +2495,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)  	event = rb_iter_peek(iter, ts);  	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -	if (event && event->type == RINGBUF_TYPE_PADDING) { +	if (event && event->type_len == RINGBUF_TYPE_PADDING) {  		cpu_relax();  		goto again;  	} @@ -2293,7 +2540,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)   out:  	preempt_enable(); -	if (event && event->type == RINGBUF_TYPE_PADDING) { +	if (event && event->type_len == RINGBUF_TYPE_PADDING) {  		cpu_relax();  		goto again;  	} @@ -2386,7 +2633,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)   out:  	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -	if (event && event->type == RINGBUF_TYPE_PADDING) { +	if (event && event->type_len == RINGBUF_TYPE_PADDING) {  		cpu_relax();  		goto again;  	} @@ -2411,6 +2658,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)  	cpu_buffer->head_page  		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);  	local_set(&cpu_buffer->head_page->write, 0); +	local_set(&cpu_buffer->head_page->entries, 0);  	local_set(&cpu_buffer->head_page->page->commit, 0);  	cpu_buffer->head_page->read = 0; @@ -2420,11 +2668,15 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)  	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);  	local_set(&cpu_buffer->reader_page->write, 0); +	local_set(&cpu_buffer->reader_page->entries, 0);  	local_set(&cpu_buffer->reader_page->page->commit, 0);  	cpu_buffer->reader_page->read = 0; +	cpu_buffer->nmi_dropped = 0; +	cpu_buffer->commit_overrun = 0;  	cpu_buffer->overrun = 0; -	cpu_buffer->entries = 0; +	cpu_buffer->read = 0; +	local_set(&cpu_buffer->entries, 0);  	cpu_buffer->write_stamp = 0;  	cpu_buffer->read_stamp = 0; @@ -2443,6 +2695,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)  	if (!cpumask_test_cpu(cpu, buffer->cpumask))  		return; +	atomic_inc(&cpu_buffer->record_disabled); +  	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);  	__raw_spin_lock(&cpu_buffer->lock); @@ -2452,6 +2706,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)  	__raw_spin_unlock(&cpu_buffer->lock);  	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + +	atomic_dec(&cpu_buffer->record_disabled);  }  EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -2578,28 +2834,6 @@ out:  }  EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); -static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, -			      struct buffer_data_page *bpage, -			      unsigned int offset) -{ -	struct ring_buffer_event *event; -	unsigned long head; - -	__raw_spin_lock(&cpu_buffer->lock); -	for (head = offset; head < local_read(&bpage->commit); -	     head += rb_event_length(event)) { - -		event = 
__rb_data_page_index(bpage, head); -		if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) -			return; -		/* Only count data entries */ -		if (event->type != RINGBUF_TYPE_DATA) -			continue; -		cpu_buffer->entries--; -	} -	__raw_spin_unlock(&cpu_buffer->lock); -} -  /**   * ring_buffer_alloc_read_page - allocate a page to read from buffer   * @buffer: the buffer to allocate for. @@ -2630,6 +2864,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)  	return bpage;  } +EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);  /**   * ring_buffer_free_read_page - free an allocated read page @@ -2642,6 +2877,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)  {  	free_page((unsigned long)data);  } +EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);  /**   * ring_buffer_read_page - extract a page from the ring buffer @@ -2768,16 +3004,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,  		/* we copied everything to the beginning */  		read = 0;  	} else { +		/* update the entry counter */ +		cpu_buffer->read += local_read(&reader->entries); +  		/* swap the pages */  		rb_init_page(bpage);  		bpage = reader->page;  		reader->page = *data_page;  		local_set(&reader->write, 0); +		local_set(&reader->entries, 0);  		reader->read = 0;  		*data_page = bpage; - -		/* update the entry counter */ -		rb_remove_entries(cpu_buffer, bpage, read);  	}  	ret = read; @@ -2787,6 +3024,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,   out:  	return ret;  } +EXPORT_SYMBOL_GPL(ring_buffer_read_page);  static ssize_t  rb_simple_read(struct file *filp, char __user *ubuf, @@ -2845,14 +3083,11 @@ static const struct file_operations rb_simple_fops = {  static __init int rb_init_debugfs(void)  {  	struct dentry *d_tracer; -	struct dentry *entry;  	d_tracer = tracing_init_dentry(); -	entry = debugfs_create_file("tracing_on", 0644, d_tracer, -				    &ring_buffer_flags, &rb_simple_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'tracing_on' entry\n"); +	trace_create_file("tracing_on", 0644, d_tracer, +			    &ring_buffer_flags, &rb_simple_fops);  	return 0;  } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c new file mode 100644 index 00000000000..8d68e149a8b --- /dev/null +++ b/kernel/trace/ring_buffer_benchmark.c @@ -0,0 +1,416 @@ +/* + * ring buffer tester and benchmark + * + * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/ring_buffer.h> +#include <linux/completion.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/time.h> + +struct rb_page { +	u64		ts; +	local_t		commit; +	char		data[4080]; +}; + +/* run time and sleep time in seconds */ +#define RUN_TIME	10 +#define SLEEP_TIME	10 + +/* number of events for writer to wake up the reader */ +static int wakeup_interval = 100; + +static int reader_finish; +static struct completion read_start; +static struct completion read_done; + +static struct ring_buffer *buffer; +static struct task_struct *producer; +static struct task_struct *consumer; +static unsigned long read; + +static int disable_reader; +module_param(disable_reader, uint, 0644); +MODULE_PARM_DESC(disable_reader, "only run producer"); + +static int read_events; + +static int kill_test; + +#define KILL_TEST()				\ +	do {					\ +		if (!kill_test) {		\ +			kill_test = 1;		\ +			WARN_ON(1);		\ +		}				\ +	} while (0) + +enum event_status { +	EVENT_FOUND, +	EVENT_DROPPED, +}; + +static enum event_status read_event(int cpu) +{ +	struct ring_buffer_event *event; +	int 
*entry; +	u64 ts; + +	event = ring_buffer_consume(buffer, cpu, &ts); +	if (!event) +		return EVENT_DROPPED; + +	entry = ring_buffer_event_data(event); +	if (*entry != cpu) { +		KILL_TEST(); +		return EVENT_DROPPED; +	} + +	read++; +	return EVENT_FOUND; +} + +static enum event_status read_page(int cpu) +{ +	struct ring_buffer_event *event; +	struct rb_page *rpage; +	unsigned long commit; +	void *bpage; +	int *entry; +	int ret; +	int inc; +	int i; + +	bpage = ring_buffer_alloc_read_page(buffer); +	if (!bpage) +		return EVENT_DROPPED; + +	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); +	if (ret >= 0) { +		rpage = bpage; +		commit = local_read(&rpage->commit); +		for (i = 0; i < commit && !kill_test; i += inc) { + +			if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { +				KILL_TEST(); +				break; +			} + +			inc = -1; +			event = (void *)&rpage->data[i]; +			switch (event->type_len) { +			case RINGBUF_TYPE_PADDING: +				/* We don't expect any padding */ +				KILL_TEST(); +				break; +			case RINGBUF_TYPE_TIME_EXTEND: +				inc = 8; +				break; +			case 0: +				entry = ring_buffer_event_data(event); +				if (*entry != cpu) { +					KILL_TEST(); +					break; +				} +				read++; +				if (!event->array[0]) { +					KILL_TEST(); +					break; +				} +				inc = event->array[0]; +				break; +			default: +				entry = ring_buffer_event_data(event); +				if (*entry != cpu) { +					KILL_TEST(); +					break; +				} +				read++; +				inc = ((event->type_len + 1) * 4); +			} +			if (kill_test) +				break; + +			if (inc <= 0) { +				KILL_TEST(); +				break; +			} +		} +	} +	ring_buffer_free_read_page(buffer, bpage); + +	if (ret < 0) +		return EVENT_DROPPED; +	return EVENT_FOUND; +} + +static void ring_buffer_consumer(void) +{ +	/* toggle between reading pages and events */ +	read_events ^= 1; + +	read = 0; +	while (!reader_finish && !kill_test) { +		int found; + +		do { +			int cpu; + +			found = 0; +			for_each_online_cpu(cpu) { +				enum event_status stat; + +				if (read_events) +					stat = read_event(cpu); +				else +					stat = read_page(cpu); + +				if (kill_test) +					break; +				if (stat == EVENT_FOUND) +					found = 1; +			} +		} while (found && !kill_test); + +		set_current_state(TASK_INTERRUPTIBLE); +		if (reader_finish) +			break; + +		schedule(); +		__set_current_state(TASK_RUNNING); +	} +	reader_finish = 0; +	complete(&read_done); +} + +static void ring_buffer_producer(void) +{ +	struct timeval start_tv; +	struct timeval end_tv; +	unsigned long long time; +	unsigned long long entries; +	unsigned long long overruns; +	unsigned long missed = 0; +	unsigned long hit = 0; +	unsigned long avg; +	int cnt = 0; + +	/* +	 * Hammer the buffer for 10 secs (this may +	 * make the system stall) +	 */ +	pr_info("Starting ring buffer hammer\n"); +	do_gettimeofday(&start_tv); +	do { +		struct ring_buffer_event *event; +		int *entry; + +		event = ring_buffer_lock_reserve(buffer, 10); +		if (!event) { +			missed++; +		} else { +			hit++; +			entry = ring_buffer_event_data(event); +			*entry = smp_processor_id(); +			ring_buffer_unlock_commit(buffer, event); +		} +		do_gettimeofday(&end_tv); + +		cnt++; +		if (consumer && !(cnt % wakeup_interval)) +			wake_up_process(consumer); + +#ifndef CONFIG_PREEMPT +		/* +		 * If we are a non preempt kernel, the 10 second run will +		 * stop everything while it runs. Instead, we will call +		 * cond_resched and also add any time that was lost by a +		 * rescedule. +		 * +		 * Do a cond resched at the same frequency we would wake up +		 * the reader. 
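When the 10-second hammer ends, ring_buffer_producer() (in the hunk that follows) reduces the elapsed time to milliseconds, derives events per millisecond, and reports an average nanoseconds-per-entry figure. A standalone sketch of that arithmetic with invented sample numbers:

#include <stdio.h>

#define USEC_PER_MSEC 1000ULL
#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
        unsigned long long time_usecs = 10ULL * 1000 * 1000;  /* 10 second run */
        unsigned long hit = 25000000;                         /* successful writes */

        unsigned long long time_msecs = time_usecs / USEC_PER_MSEC;
        unsigned long per_msec = hit / time_msecs;

        printf("entries per millisec: %lu\n", per_msec);
        printf("%llu ns per entry\n", NSEC_PER_MSEC / per_msec);  /* 400 ns here */
        return 0;
}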
+		 */ +		if (cnt % wakeup_interval) +			cond_resched(); +#endif + +	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); +	pr_info("End ring buffer hammer\n"); + +	if (consumer) { +		/* Init both completions here to avoid races */ +		init_completion(&read_start); +		init_completion(&read_done); +		/* the completions must be visible before the finish var */ +		smp_wmb(); +		reader_finish = 1; +		/* finish var visible before waking up the consumer */ +		smp_wmb(); +		wake_up_process(consumer); +		wait_for_completion(&read_done); +	} + +	time = end_tv.tv_sec - start_tv.tv_sec; +	time *= USEC_PER_SEC; +	time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); + +	entries = ring_buffer_entries(buffer); +	overruns = ring_buffer_overruns(buffer); + +	if (kill_test) +		pr_info("ERROR!\n"); +	pr_info("Time:     %lld (usecs)\n", time); +	pr_info("Overruns: %lld\n", overruns); +	if (disable_reader) +		pr_info("Read:     (reader disabled)\n"); +	else +		pr_info("Read:     %ld  (by %s)\n", read, +			read_events ? "events" : "pages"); +	pr_info("Entries:  %lld\n", entries); +	pr_info("Total:    %lld\n", entries + overruns + read); +	pr_info("Missed:   %ld\n", missed); +	pr_info("Hit:      %ld\n", hit); + +	/* Convert time from usecs to millisecs */ +	do_div(time, USEC_PER_MSEC); +	if (time) +		hit /= (long)time; +	else +		pr_info("TIME IS ZERO??\n"); + +	pr_info("Entries per millisec: %ld\n", hit); + +	if (hit) { +		/* Calculate the average time in nanosecs */ +		avg = NSEC_PER_MSEC / hit; +		pr_info("%ld ns per entry\n", avg); +	} + +	if (missed) { +		if (time) +			missed /= (long)time; + +		pr_info("Total iterations per millisec: %ld\n", hit + missed); + +		/* it is possible that hit + missed will overflow and be zero */ +		if (!(hit + missed)) { +			pr_info("hit + missed overflowed and totalled zero!\n"); +			hit--; /* make it non zero */ +		} + +		/* Caculate the average time in nanosecs */ +		avg = NSEC_PER_MSEC / (hit + missed); +		pr_info("%ld ns per entry\n", avg); +	} +} + +static void wait_to_die(void) +{ +	set_current_state(TASK_INTERRUPTIBLE); +	while (!kthread_should_stop()) { +		schedule(); +		set_current_state(TASK_INTERRUPTIBLE); +	} +	__set_current_state(TASK_RUNNING); +} + +static int ring_buffer_consumer_thread(void *arg) +{ +	while (!kthread_should_stop() && !kill_test) { +		complete(&read_start); + +		ring_buffer_consumer(); + +		set_current_state(TASK_INTERRUPTIBLE); +		if (kthread_should_stop() || kill_test) +			break; + +		schedule(); +		__set_current_state(TASK_RUNNING); +	} +	__set_current_state(TASK_RUNNING); + +	if (kill_test) +		wait_to_die(); + +	return 0; +} + +static int ring_buffer_producer_thread(void *arg) +{ +	init_completion(&read_start); + +	while (!kthread_should_stop() && !kill_test) { +		ring_buffer_reset(buffer); + +		if (consumer) { +			smp_wmb(); +			wake_up_process(consumer); +			wait_for_completion(&read_start); +		} + +		ring_buffer_producer(); + +		pr_info("Sleeping for 10 secs\n"); +		set_current_state(TASK_INTERRUPTIBLE); +		schedule_timeout(HZ * SLEEP_TIME); +		__set_current_state(TASK_RUNNING); +	} + +	if (kill_test) +		wait_to_die(); + +	return 0; +} + +static int __init ring_buffer_benchmark_init(void) +{ +	int ret; + +	/* make a one meg buffer in overwite mode */ +	buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); +	if (!buffer) +		return -ENOMEM; + +	if (!disable_reader) { +		consumer = kthread_create(ring_buffer_consumer_thread, +					  NULL, "rb_consumer"); +		ret = PTR_ERR(consumer); +		if (IS_ERR(consumer)) +			
goto out_fail; +	} + +	producer = kthread_run(ring_buffer_producer_thread, +			       NULL, "rb_producer"); +	ret = PTR_ERR(producer); + +	if (IS_ERR(producer)) +		goto out_kill; + +	return 0; + + out_kill: +	if (consumer) +		kthread_stop(consumer); + + out_fail: +	ring_buffer_free(buffer); +	return ret; +} + +static void __exit ring_buffer_benchmark_exit(void) +{ +	kthread_stop(producer); +	if (consumer) +		kthread_stop(consumer); +	ring_buffer_free(buffer); +} + +module_init(ring_buffer_benchmark_init); +module_exit(ring_buffer_benchmark_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("ring_buffer_benchmark"); +MODULE_LICENSE("GPL"); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a884c09006c..8acd9b81a5d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -171,6 +171,13 @@ static struct trace_array	global_trace;  static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +int filter_current_check_discard(struct ftrace_event_call *call, void *rec, +				 struct ring_buffer_event *event) +{ +	return filter_check_discard(call, rec, global_trace.buffer, event); +} +EXPORT_SYMBOL_GPL(filter_current_check_discard); +  cycle_t ftrace_now(int cpu)  {  	u64 ts; @@ -255,7 +262,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);  /* trace_flags holds trace_options default values */  unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | -	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; +	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | +	TRACE_ITER_GRAPH_TIME;  /**   * trace_wake_up - wake up tasks waiting for trace input @@ -317,6 +325,7 @@ static const char *trace_options[] = {  	"latency-format",  	"global-clock",  	"sleep-time", +	"graph-time",  	NULL  }; @@ -402,17 +411,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)  	return cnt;  } -static void -trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ -	int len = s->len >= PAGE_SIZE ? 
PAGE_SIZE - 1 : s->len; - -	s->buffer[len] = 0; -	seq_puts(m, s->buffer); - -	trace_seq_init(s); -} -  /**   * update_max_tr - snapshot all trace buffers from global_trace to max_tr   * @tr: tracer @@ -641,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)  		tracing_reset(tr, cpu);  } +void tracing_reset_current(int cpu) +{ +	tracing_reset(&global_trace, cpu); +} + +void tracing_reset_current_online_cpus(void) +{ +	tracing_reset_online_cpus(&global_trace); +} +  #define SAVED_CMDLINES 128  #define NO_CMDLINE_MAP UINT_MAX  static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; @@ -800,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])  		return;  	} +	preempt_disable();  	__raw_spin_lock(&trace_cmdline_lock);  	map = map_pid_to_cmdline[pid];  	if (map != NO_CMDLINE_MAP) @@ -808,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])  		strcpy(comm, "<...>");  	__raw_spin_unlock(&trace_cmdline_lock); +	preempt_enable();  }  void tracing_record_cmdline(struct task_struct *tsk) @@ -840,7 +850,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,  }  struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, -						    unsigned char type, +						    int type,  						    unsigned long len,  						    unsigned long flags, int pc)  { @@ -883,30 +893,40 @@ void trace_buffer_unlock_commit(struct trace_array *tr,  }  struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len,  				  unsigned long flags, int pc)  {  	return trace_buffer_lock_reserve(&global_trace,  					 type, len, flags, pc);  } +EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);  void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,  					unsigned long flags, int pc)  { -	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); +	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);  } +EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);  void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,  					unsigned long flags, int pc)  { -	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); +	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); +} +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); + +void trace_current_buffer_discard_commit(struct ring_buffer_event *event) +{ +	ring_buffer_discard_commit(global_trace.buffer, event);  } +EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);  void  trace_function(struct trace_array *tr,  	       unsigned long ip, unsigned long parent_ip, unsigned long flags,  	       int pc)  { +	struct ftrace_event_call *call = &event_function;  	struct ring_buffer_event *event;  	struct ftrace_entry *entry; @@ -921,7 +941,9 @@ trace_function(struct trace_array *tr,  	entry	= ring_buffer_event_data(event);  	entry->ip			= ip;  	entry->parent_ip		= parent_ip; -	ring_buffer_unlock_commit(tr->buffer, event); + +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		ring_buffer_unlock_commit(tr->buffer, event);  }  #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -930,6 +952,7 @@ static int __trace_graph_entry(struct trace_array *tr,  				unsigned long flags,  				int pc)  { +	struct ftrace_event_call *call = &event_funcgraph_entry;  	struct ring_buffer_event *event;  	struct ftrace_graph_ent_entry *entry; @@ -942,7 +965,8 @@ static int __trace_graph_entry(struct trace_array *tr,  		return 0;  	entry	= ring_buffer_event_data(event); 
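Every event producer touched in this file gets the same treatment: the event is reserved and filled in as before, but the commit is now gated on the per-event filter, so records rejected by the filter are discarded rather than committed. A condensed sketch of that pattern, reusing only calls visible in this patch (the ftrace_event_call named event_example is a placeholder, not a real event):

	static void example_trace_fn(struct trace_array *tr, unsigned long ip,
				     unsigned long parent_ip,
				     unsigned long flags, int pc)
	{
		struct ftrace_event_call *call = &event_example;	/* placeholder */
		struct ring_buffer_event *event;
		struct ftrace_entry *entry;

		event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry),
						  flags, pc);
		if (!event)
			return;

		entry = ring_buffer_event_data(event);
		entry->ip		= ip;
		entry->parent_ip	= parent_ip;

		/* Filtered-out records are dropped instead of committed. */
		if (!filter_check_discard(call, entry, tr->buffer, event))
			ring_buffer_unlock_commit(tr->buffer, event);
	}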
 	entry->graph_ent			= *trace; -	ring_buffer_unlock_commit(global_trace.buffer, event); +	if (!filter_current_check_discard(call, entry, event)) +		ring_buffer_unlock_commit(global_trace.buffer, event);  	return 1;  } @@ -952,6 +976,7 @@ static void __trace_graph_return(struct trace_array *tr,  				unsigned long flags,  				int pc)  { +	struct ftrace_event_call *call = &event_funcgraph_exit;  	struct ring_buffer_event *event;  	struct ftrace_graph_ret_entry *entry; @@ -964,7 +989,8 @@ static void __trace_graph_return(struct trace_array *tr,  		return;  	entry	= ring_buffer_event_data(event);  	entry->ret				= *trace; -	ring_buffer_unlock_commit(global_trace.buffer, event); +	if (!filter_current_check_discard(call, entry, event)) +		ring_buffer_unlock_commit(global_trace.buffer, event);  }  #endif @@ -982,6 +1008,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,  				 int skip, int pc)  {  #ifdef CONFIG_STACKTRACE +	struct ftrace_event_call *call = &event_kernel_stack;  	struct ring_buffer_event *event;  	struct stack_entry *entry;  	struct stack_trace trace; @@ -999,7 +1026,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,  	trace.entries		= entry->caller;  	save_stack_trace(&trace); -	ring_buffer_unlock_commit(tr->buffer, event); +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		ring_buffer_unlock_commit(tr->buffer, event);  #endif  } @@ -1024,6 +1052,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,  				   unsigned long flags, int pc)  {  #ifdef CONFIG_STACKTRACE +	struct ftrace_event_call *call = &event_user_stack;  	struct ring_buffer_event *event;  	struct userstack_entry *entry;  	struct stack_trace trace; @@ -1045,7 +1074,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,  	trace.entries		= entry->caller;  	save_stack_trace_user(&trace); -	ring_buffer_unlock_commit(tr->buffer, event); +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		ring_buffer_unlock_commit(tr->buffer, event);  #endif  } @@ -1089,6 +1119,7 @@ tracing_sched_switch_trace(struct trace_array *tr,  			   struct task_struct *next,  			   unsigned long flags, int pc)  { +	struct ftrace_event_call *call = &event_context_switch;  	struct ring_buffer_event *event;  	struct ctx_switch_entry *entry; @@ -1104,7 +1135,9 @@ tracing_sched_switch_trace(struct trace_array *tr,  	entry->next_prio		= next->prio;  	entry->next_state		= next->state;  	entry->next_cpu	= task_cpu(next); -	trace_buffer_unlock_commit(tr, event, flags, pc); + +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		trace_buffer_unlock_commit(tr, event, flags, pc);  }  void @@ -1113,6 +1146,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,  			   struct task_struct *curr,  			   unsigned long flags, int pc)  { +	struct ftrace_event_call *call = &event_wakeup;  	struct ring_buffer_event *event;  	struct ctx_switch_entry *entry; @@ -1129,7 +1163,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,  	entry->next_state		= wakee->state;  	entry->next_cpu			= task_cpu(wakee); -	ring_buffer_unlock_commit(tr->buffer, event); +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		ring_buffer_unlock_commit(tr->buffer, event);  	ftrace_trace_stack(tr, flags, 6, pc);  	ftrace_trace_userstack(tr, flags, pc);  } @@ -1230,11 +1265,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;  	static u32 trace_buf[TRACE_BUF_SIZE]; +	struct ftrace_event_call *call = &event_bprint;  	struct ring_buffer_event *event; 
 	struct trace_array *tr = &global_trace;  	struct trace_array_cpu *data;  	struct bprint_entry *entry;  	unsigned long flags; +	int disable;  	int resched;  	int cpu, len = 0, size, pc; @@ -1249,7 +1286,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  	cpu = raw_smp_processor_id();  	data = tr->data[cpu]; -	if (unlikely(atomic_read(&data->disabled))) +	disable = atomic_inc_return(&data->disabled); +	if (unlikely(disable != 1))  		goto out;  	/* Lockdep uses trace_printk for lock tracing */ @@ -1269,13 +1307,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  	entry->fmt			= fmt;  	memcpy(entry->buf, trace_buf, sizeof(u32) * len); -	ring_buffer_unlock_commit(tr->buffer, event); +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		ring_buffer_unlock_commit(tr->buffer, event);  out_unlock:  	__raw_spin_unlock(&trace_buf_lock);  	local_irq_restore(flags);  out: +	atomic_dec_return(&data->disabled);  	ftrace_preempt_enable(resched);  	unpause_graph_tracing(); @@ -1288,12 +1328,14 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)  	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;  	static char trace_buf[TRACE_BUF_SIZE]; +	struct ftrace_event_call *call = &event_print;  	struct ring_buffer_event *event;  	struct trace_array *tr = &global_trace;  	struct trace_array_cpu *data;  	int cpu, len = 0, size, pc;  	struct print_entry *entry;  	unsigned long irq_flags; +	int disable;  	if (tracing_disabled || tracing_selftest_running)  		return 0; @@ -1303,7 +1345,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)  	cpu = raw_smp_processor_id();  	data = tr->data[cpu]; -	if (unlikely(atomic_read(&data->disabled))) +	disable = atomic_inc_return(&data->disabled); +	if (unlikely(disable != 1))  		goto out;  	pause_graph_tracing(); @@ -1323,13 +1366,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)  	memcpy(&entry->buf, trace_buf, len);  	entry->buf[len] = 0; -	ring_buffer_unlock_commit(tr->buffer, event); +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		ring_buffer_unlock_commit(tr->buffer, event);   out_unlock:  	__raw_spin_unlock(&trace_buf_lock);  	raw_local_irq_restore(irq_flags);  	unpause_graph_tracing();   out: +	atomic_dec_return(&data->disabled);  	preempt_enable_notrace();  	return len; @@ -1526,12 +1571,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)  		p = s_next(m, p, &l);  	} +	trace_event_read_lock();  	return p;  }  static void s_stop(struct seq_file *m, void *p)  {  	atomic_dec(&trace_record_cmdline_disabled); +	trace_event_read_unlock();  }  static void print_lat_help_header(struct seq_file *m) @@ -1774,6 +1821,7 @@ static int trace_empty(struct trace_iterator *iter)  	return 1;  } +/*  Called with trace_event_read_lock() held. 
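The comment being added here belongs to a wider locking rule introduced by this series: every path that formats entries for output (s_start()/s_stop(), tracing_read_pipe(), tracing_splice_read_pipe() below) now runs under trace_event_read_lock(), while code that unregisters output handlers, such as trace_module_remove_events() in trace_events.c further down, takes trace_event_mutex for writing. After this patch the pipe reader roughly has this shape (a sketch, not the full loop):

	trace_event_read_lock();
	while (find_next_entry_inc(iter) != NULL) {
		/* must not race with event output unregistration */
		if (print_trace_line(iter) == TRACE_TYPE_PARTIAL_LINE)
			break;
		trace_consume(iter);
	}
	trace_event_read_unlock();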
*/  static enum print_line_t print_trace_line(struct trace_iterator *iter)  {  	enum print_line_t ret; @@ -2380,7 +2428,7 @@ static const char readme_msg[] =  	"# echo print-parent > /debug/tracing/trace_options\n"  	"# echo 1 > /debug/tracing/tracing_enabled\n"  	"# cat /debug/tracing/trace > /tmp/trace.txt\n" -	"echo 0 > /debug/tracing/tracing_enabled\n" +	"# echo 0 > /debug/tracing/tracing_enabled\n"  ;  static ssize_t @@ -2397,6 +2445,56 @@ static const struct file_operations tracing_readme_fops = {  };  static ssize_t +tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, +				size_t cnt, loff_t *ppos) +{ +	char *buf_comm; +	char *file_buf; +	char *buf; +	int len = 0; +	int pid; +	int i; + +	file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); +	if (!file_buf) +		return -ENOMEM; + +	buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); +	if (!buf_comm) { +		kfree(file_buf); +		return -ENOMEM; +	} + +	buf = file_buf; + +	for (i = 0; i < SAVED_CMDLINES; i++) { +		int r; + +		pid = map_cmdline_to_pid[i]; +		if (pid == -1 || pid == NO_CMDLINE_MAP) +			continue; + +		trace_find_cmdline(pid, buf_comm); +		r = sprintf(buf, "%d %s\n", pid, buf_comm); +		buf += r; +		len += r; +	} + +	len = simple_read_from_buffer(ubuf, cnt, ppos, +				      file_buf, len); + +	kfree(file_buf); +	kfree(buf_comm); + +	return len; +} + +static const struct file_operations tracing_saved_cmdlines_fops = { +    .open       = tracing_open_generic, +    .read       = tracing_saved_cmdlines_read, +}; + +static ssize_t  tracing_ctrl_read(struct file *filp, char __user *ubuf,  		  size_t cnt, loff_t *ppos)  { @@ -2728,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)  	/* trace pipe does not show start of buffer */  	cpumask_setall(iter->started); +	if (trace_flags & TRACE_ITER_LATENCY_FMT) +		iter->iter_flags |= TRACE_FILE_LAT_FMT; +  	iter->cpu_file = cpu_file;  	iter->tr = &global_trace;  	mutex_init(&iter->mutex); @@ -2915,6 +3016,7 @@ waitagain:  	       offsetof(struct trace_iterator, seq));  	iter->pos = -1; +	trace_event_read_lock();  	while (find_next_entry_inc(iter) != NULL) {  		enum print_line_t ret;  		int len = iter->seq.len; @@ -2931,6 +3033,7 @@ waitagain:  		if (iter->seq.len >= cnt)  			break;  	} +	trace_event_read_unlock();  	/* Now copy what we have to the user */  	sret = trace_seq_to_user(&iter->seq, ubuf, cnt); @@ -3053,6 +3156,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  		goto out_err;  	} +	trace_event_read_lock(); +  	/* Fill as many pages as possible. 
*/  	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {  		pages[i] = alloc_page(GFP_KERNEL); @@ -3075,6 +3180,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  		trace_seq_init(&iter->seq);  	} +	trace_event_read_unlock();  	mutex_unlock(&iter->mutex);  	spd.nr_pages = i; @@ -3425,7 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		.spd_release	= buffer_spd_release,  	};  	struct buffer_ref *ref; -	int size, i; +	int entries, size, i;  	size_t ret;  	if (*ppos & (PAGE_SIZE - 1)) { @@ -3440,7 +3546,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		len &= PAGE_MASK;  	} -	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { +	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + +	for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {  		struct page *page;  		int r; @@ -3457,7 +3565,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		}  		r = ring_buffer_read_page(ref->buffer, &ref->page, -					  len, info->cpu, 0); +					  len, info->cpu, 1);  		if (r < 0) {  			ring_buffer_free_read_page(ref->buffer,  						   ref->page); @@ -3481,6 +3589,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		spd.partial[i].private = (unsigned long)ref;  		spd.nr_pages++;  		*ppos += PAGE_SIZE; + +		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);  	}  	spd.nr_pages = i; @@ -3508,6 +3618,45 @@ static const struct file_operations tracing_buffers_fops = {  	.llseek		= no_llseek,  }; +static ssize_t +tracing_stats_read(struct file *filp, char __user *ubuf, +		   size_t count, loff_t *ppos) +{ +	unsigned long cpu = (unsigned long)filp->private_data; +	struct trace_array *tr = &global_trace; +	struct trace_seq *s; +	unsigned long cnt; + +	s = kmalloc(sizeof(*s), GFP_ATOMIC); +	if (!s) +		return ENOMEM; + +	trace_seq_init(s); + +	cnt = ring_buffer_entries_cpu(tr->buffer, cpu); +	trace_seq_printf(s, "entries: %ld\n", cnt); + +	cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); +	trace_seq_printf(s, "overrun: %ld\n", cnt); + +	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); +	trace_seq_printf(s, "commit overrun: %ld\n", cnt); + +	cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); +	trace_seq_printf(s, "nmi dropped: %ld\n", cnt); + +	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + +	kfree(s); + +	return count; +} + +static const struct file_operations tracing_stats_fops = { +	.open		= tracing_open_generic, +	.read		= tracing_stats_read, +}; +  #ifdef CONFIG_DYNAMIC_FTRACE  int __weak ftrace_arch_read_dyn_info(char *buf, int size) @@ -3597,7 +3746,7 @@ struct dentry *tracing_dentry_percpu(void)  static void tracing_init_debugfs_percpu(long cpu)  {  	struct dentry *d_percpu = tracing_dentry_percpu(); -	struct dentry *entry, *d_cpu; +	struct dentry *d_cpu;  	/* strlen(cpu) + MAX(log10(cpu)) + '\0' */  	char cpu_dir[7]; @@ -3612,21 +3761,18 @@ static void tracing_init_debugfs_percpu(long cpu)  	}  	/* per cpu trace_pipe */ -	entry = debugfs_create_file("trace_pipe", 0444, d_cpu, -				(void *) cpu, &tracing_pipe_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'trace_pipe' entry\n"); +	trace_create_file("trace_pipe", 0444, d_cpu, +			(void *) cpu, &tracing_pipe_fops);  	/* per cpu trace */ -	entry = debugfs_create_file("trace", 0644, d_cpu, -				(void *) cpu, &tracing_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'trace' entry\n"); +	trace_create_file("trace", 0644, d_cpu, +			(void *) cpu, &tracing_fops); + +	
trace_create_file("trace_pipe_raw", 0444, d_cpu, +			(void *) cpu, &tracing_buffers_fops); -	entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, -				    (void *) cpu, &tracing_buffers_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); +	trace_create_file("stats", 0444, d_cpu, +			(void *) cpu, &tracing_stats_fops);  }  #ifdef CONFIG_FTRACE_SELFTEST @@ -3782,6 +3928,22 @@ static const struct file_operations trace_options_core_fops = {  	.write = trace_options_core_write,  }; +struct dentry *trace_create_file(const char *name, +				 mode_t mode, +				 struct dentry *parent, +				 void *data, +				 const struct file_operations *fops) +{ +	struct dentry *ret; + +	ret = debugfs_create_file(name, mode, parent, data, fops); +	if (!ret) +		pr_warning("Could not create debugfs '%s' entry\n", name); + +	return ret; +} + +  static struct dentry *trace_options_init_dentry(void)  {  	struct dentry *d_tracer; @@ -3809,7 +3971,6 @@ create_trace_option_file(struct trace_option_dentry *topt,  			 struct tracer_opt *opt)  {  	struct dentry *t_options; -	struct dentry *entry;  	t_options = trace_options_init_dentry();  	if (!t_options) @@ -3818,11 +3979,9 @@ create_trace_option_file(struct trace_option_dentry *topt,  	topt->flags = flags;  	topt->opt = opt; -	entry = debugfs_create_file(opt->name, 0644, t_options, topt, +	topt->entry = trace_create_file(opt->name, 0644, t_options, topt,  				    &trace_options_fops); -	topt->entry = entry; -  }  static struct trace_option_dentry * @@ -3877,123 +4036,84 @@ static struct dentry *  create_trace_option_core_file(const char *option, long index)  {  	struct dentry *t_options; -	struct dentry *entry;  	t_options = trace_options_init_dentry();  	if (!t_options)  		return NULL; -	entry = debugfs_create_file(option, 0644, t_options, (void *)index, +	return trace_create_file(option, 0644, t_options, (void *)index,  				    &trace_options_core_fops); - -	return entry;  }  static __init void create_trace_options_dir(void)  {  	struct dentry *t_options; -	struct dentry *entry;  	int i;  	t_options = trace_options_init_dentry();  	if (!t_options)  		return; -	for (i = 0; trace_options[i]; i++) { -		entry = create_trace_option_core_file(trace_options[i], i); -		if (!entry) -			pr_warning("Could not create debugfs %s entry\n", -				   trace_options[i]); -	} +	for (i = 0; trace_options[i]; i++) +		create_trace_option_core_file(trace_options[i], i);  }  static __init int tracer_init_debugfs(void)  {  	struct dentry *d_tracer; -	struct dentry *entry;  	int cpu;  	d_tracer = tracing_init_dentry(); -	entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, -				    &global_trace, &tracing_ctrl_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); +	trace_create_file("tracing_enabled", 0644, d_tracer, +			&global_trace, &tracing_ctrl_fops); -	entry = debugfs_create_file("trace_options", 0644, d_tracer, -				    NULL, &tracing_iter_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'trace_options' entry\n"); +	trace_create_file("trace_options", 0644, d_tracer, +			NULL, &tracing_iter_fops); -	create_trace_options_dir(); +	trace_create_file("tracing_cpumask", 0644, d_tracer, +			NULL, &tracing_cpumask_fops); -	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, -				    NULL, &tracing_cpumask_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); +	trace_create_file("trace", 0644, d_tracer, +			(void *) TRACE_PIPE_ALL_CPU, &tracing_fops); -	
entry = debugfs_create_file("trace", 0644, d_tracer, -				 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'trace' entry\n"); +	trace_create_file("available_tracers", 0444, d_tracer, +			&global_trace, &show_traces_fops); -	entry = debugfs_create_file("available_tracers", 0444, d_tracer, -				    &global_trace, &show_traces_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'available_tracers' entry\n"); +	trace_create_file("current_tracer", 0644, d_tracer, +			&global_trace, &set_tracer_fops); -	entry = debugfs_create_file("current_tracer", 0444, d_tracer, -				    &global_trace, &set_tracer_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'current_tracer' entry\n"); +	trace_create_file("tracing_max_latency", 0644, d_tracer, +			&tracing_max_latency, &tracing_max_lat_fops); -	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, -				    &tracing_max_latency, -				    &tracing_max_lat_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'tracing_max_latency' entry\n"); +	trace_create_file("tracing_thresh", 0644, d_tracer, +			&tracing_thresh, &tracing_max_lat_fops); -	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, -				    &tracing_thresh, &tracing_max_lat_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'tracing_thresh' entry\n"); -	entry = debugfs_create_file("README", 0644, d_tracer, -				    NULL, &tracing_readme_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'README' entry\n"); +	trace_create_file("README", 0444, d_tracer, +			NULL, &tracing_readme_fops); -	entry = debugfs_create_file("trace_pipe", 0444, d_tracer, +	trace_create_file("trace_pipe", 0444, d_tracer,  			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'trace_pipe' entry\n"); -	entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, -				    &global_trace, &tracing_entries_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'buffer_size_kb' entry\n"); +	trace_create_file("buffer_size_kb", 0644, d_tracer, +			&global_trace, &tracing_entries_fops); -	entry = debugfs_create_file("trace_marker", 0220, d_tracer, -				    NULL, &tracing_mark_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'trace_marker' entry\n"); +	trace_create_file("trace_marker", 0220, d_tracer, +			NULL, &tracing_mark_fops); + +	trace_create_file("saved_cmdlines", 0444, d_tracer, +			NULL, &tracing_saved_cmdlines_fops);  #ifdef CONFIG_DYNAMIC_FTRACE -	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, -				    &ftrace_update_tot_cnt, -				    &tracing_dyn_info_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'dyn_ftrace_total_info' entry\n"); +	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, +			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);  #endif  #ifdef CONFIG_SYSPROF_TRACER  	init_tracer_sysprof_debugfs(d_tracer);  #endif +	create_trace_options_dir(); +  	for_each_tracing_cpu(cpu)  		tracing_init_debugfs_percpu(cpu); @@ -4064,7 +4184,8 @@ trace_printk_seq(struct trace_seq *s)  static void __ftrace_dump(bool disable_tracing)  { -	static DEFINE_SPINLOCK(ftrace_dump_lock); +	static raw_spinlock_t ftrace_dump_lock = +		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;  	/* use static because iter can be a bit big for the stack */  	static struct trace_iterator iter;  	unsigned int old_userobj; @@ -4073,7 +4194,8 @@ static void __ftrace_dump(bool 
disable_tracing)  	int cnt = 0, cpu;  	/* only one dump */ -	spin_lock_irqsave(&ftrace_dump_lock, flags); +	local_irq_save(flags); +	__raw_spin_lock(&ftrace_dump_lock);  	if (dump_ran)  		goto out; @@ -4145,7 +4267,8 @@ static void __ftrace_dump(bool disable_tracing)  	}   out: -	spin_unlock_irqrestore(&ftrace_dump_lock, flags); +	__raw_spin_unlock(&ftrace_dump_lock); +	local_irq_restore(flags);  }  /* By default: disable tracing after the dump */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e685ac2b2ba..6e735d4771f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,9 +9,12 @@  #include <linux/mmiotrace.h>  #include <linux/ftrace.h>  #include <trace/boot.h> -#include <trace/kmemtrace.h> +#include <linux/kmemtrace.h>  #include <trace/power.h> +#include <linux/trace_seq.h> +#include <linux/ftrace_event.h> +  enum trace_type {  	__TRACE_FIRST_TYPE = 0, @@ -42,20 +45,6 @@ enum trace_type {  };  /* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter - */ -struct trace_entry { -	unsigned char		type; -	unsigned char		flags; -	unsigned char		preempt_count; -	int			pid; -	int			tgid; -}; - -/*   * Function trace entry - function address and parent function addres:   */  struct ftrace_entry { @@ -263,8 +252,6 @@ struct trace_array_cpu {  	char			comm[TASK_COMM_LEN];  }; -struct trace_iterator; -  /*   * The trace array - an array of per-CPU trace arrays. This is the   * highest level data structure that individual tracers deal with. @@ -339,15 +326,6 @@ extern void __ftrace_bad_type(void);  		__ftrace_bad_type();					\  	} while (0) -/* Return values for print_line callback */ -enum print_line_t { -	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */ -	TRACE_TYPE_HANDLED	= 1, -	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */ -	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */ -}; - -  /*   * An option specific to a tracer. This is a boolean value.   
* The bit is the bit index that sets its value on the @@ -423,60 +401,30 @@ struct tracer {  	struct tracer_stat	*stats;  }; -struct trace_seq { -	unsigned char		buffer[PAGE_SIZE]; -	unsigned int		len; -	unsigned int		readpos; -}; - -static inline void -trace_seq_init(struct trace_seq *s) -{ -	s->len = 0; -	s->readpos = 0; -} -  #define TRACE_PIPE_ALL_CPU	-1 -/* - * Trace iterator - used by printout routines who present trace - * results to users and which routines might sleep, etc: - */ -struct trace_iterator { -	struct trace_array	*tr; -	struct tracer		*trace; -	void			*private; -	int			cpu_file; -	struct mutex		mutex; -	struct ring_buffer_iter	*buffer_iter[NR_CPUS]; - -	/* The below is zeroed out in pipe_read */ -	struct trace_seq	seq; -	struct trace_entry	*ent; -	int			cpu; -	u64			ts; - -	unsigned long		iter_flags; -	loff_t			pos; -	long			idx; - -	cpumask_var_t		started; -}; -  int tracer_init(struct tracer *t, struct trace_array *tr);  int tracing_is_enabled(void);  void trace_wake_up(void);  void tracing_reset(struct trace_array *tr, int cpu);  void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset_current(int cpu); +void tracing_reset_current_online_cpus(void);  int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *trace_create_file(const char *name, +				 mode_t mode, +				 struct dentry *parent, +				 void *data, +				 const struct file_operations *fops); +  struct dentry *tracing_init_dentry(void);  void init_tracer_sysprof_debugfs(struct dentry *d_tracer);  struct ring_buffer_event;  struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, -						    unsigned char type, +						    int type,  						    unsigned long len,  						    unsigned long flags,  						    int pc); @@ -484,14 +432,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,  				struct ring_buffer_event *event,  				unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, -				  unsigned long flags, int pc); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, -					unsigned long flags, int pc); -void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, -					unsigned long flags, int pc); -  struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,  						struct trace_array_cpu *data); @@ -514,7 +454,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,  				struct task_struct *prev,  				struct task_struct *next,  				unsigned long flags, int pc); -void tracing_record_cmdline(struct task_struct *tsk);  void tracing_sched_wakeup_trace(struct trace_array *tr,  				struct task_struct *wakee, @@ -599,6 +538,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,  					       struct trace_array *tr);  extern int trace_selftest_startup_branch(struct tracer *trace,  					 struct trace_array *tr); +extern int trace_selftest_startup_hw_branches(struct tracer *trace, +					      struct trace_array *tr);  #endif /* CONFIG_FTRACE_STARTUP_TEST */  extern void *head_page(struct trace_array_cpu *data); @@ -613,6 +554,8 @@ extern unsigned long trace_flags;  /* Standard output formatting function used for function return traces */  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  extern enum print_line_t print_graph_function(struct trace_iterator *iter); +extern enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);  #ifdef CONFIG_DYNAMIC_FTRACE  /* TODO: make this variable */ @@ -644,7 
+587,6 @@ static inline int ftrace_graph_addr(unsigned long addr)  	return 1;  }  #endif /* CONFIG_DYNAMIC_FTRACE */ -  #else /* CONFIG_FUNCTION_GRAPH_TRACER */  static inline enum print_line_t  print_graph_function(struct trace_iterator *iter) @@ -692,6 +634,7 @@ enum trace_iterator_flags {  	TRACE_ITER_LATENCY_FMT		= 0x40000,  	TRACE_ITER_GLOBAL_CLK		= 0x80000,  	TRACE_ITER_SLEEP_TIME		= 0x100000, +	TRACE_ITER_GRAPH_TIME		= 0x200000,  };  /* @@ -790,103 +733,113 @@ struct ftrace_event_field {  	char			*type;  	int			offset;  	int			size; +	int			is_signed;  }; -struct ftrace_event_call { -	char			*name; -	char			*system; -	struct dentry		*dir; -	int			enabled; -	int			(*regfunc)(void); -	void			(*unregfunc)(void); -	int			id; -	int			(*raw_init)(void); -	int			(*show_format)(struct trace_seq *s); -	int			(*define_fields)(void); -	struct list_head	fields; +struct event_filter { +	int			n_preds;  	struct filter_pred	**preds; - -#ifdef CONFIG_EVENT_PROFILE -	atomic_t	profile_count; -	int		(*profile_enable)(struct ftrace_event_call *); -	void		(*profile_disable)(struct ftrace_event_call *); -#endif +	char			*filter_string;  };  struct event_subsystem {  	struct list_head	list;  	const char		*name;  	struct dentry		*entry; -	struct filter_pred	**preds; +	void			*filter;  }; -#define events_for_each(event)						\ -	for (event = __start_ftrace_events;				\ -	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \ -	     event++) - -#define MAX_FILTER_PRED 8 -  struct filter_pred; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, +				 int val1, int val2);  struct filter_pred {  	filter_pred_fn_t fn;  	u64 val; -	char *str_val; +	char str_val[MAX_FILTER_STR_VAL];  	int str_len;  	char *field_name;  	int offset;  	int not; -	int or; -	int compound; -	int clear; +	int op; +	int pop_n;  }; -int trace_define_field(struct ftrace_event_call *call, char *type, -		       char *name, int offset, int size); -extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct filter_pred **preds, +extern void print_event_filter(struct ftrace_event_call *call,  			       struct trace_seq *s); -extern int filter_parse(char **pbuf, struct filter_pred *pred); -extern int filter_add_pred(struct ftrace_event_call *call, -			   struct filter_pred *pred); -extern void filter_free_preds(struct ftrace_event_call *call); -extern int filter_match_preds(struct ftrace_event_call *call, void *rec); -extern void filter_free_subsystem_preds(struct event_subsystem *system); -extern int filter_add_subsystem_pred(struct event_subsystem *system, -				     struct filter_pred *pred); +extern int apply_event_filter(struct ftrace_event_call *call, +			      char *filter_string); +extern int apply_subsystem_event_filter(struct event_subsystem *system, +					char *filter_string); +extern void print_subsystem_event_filter(struct event_subsystem *system, +					 struct trace_seq *s); -void event_trace_printk(unsigned long ip, const char *fmt, ...); -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; +static inline int +filter_check_discard(struct ftrace_event_call *call, void *rec, +		     struct ring_buffer *buffer, +		     struct ring_buffer_event *event) +{ +	if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { +		ring_buffer_discard_commit(buffer, event); +		return 1; +	} -#define for_each_event(event)						\ -	for 
(event = __start_ftrace_events;				\ -	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \ -	     event++) +	return 0; +} + +#define DEFINE_COMPARISON_PRED(type)					\ +static int filter_pred_##type(struct filter_pred *pred, void *event,	\ +			      int val1, int val2)			\ +{									\ +	type *addr = (type *)(event + pred->offset);			\ +	type val = (type)pred->val;					\ +	int match = 0;							\ +									\ +	switch (pred->op) {						\ +	case OP_LT:							\ +		match = (*addr < val);					\ +		break;							\ +	case OP_LE:							\ +		match = (*addr <= val);					\ +		break;							\ +	case OP_GT:							\ +		match = (*addr > val);					\ +		break;							\ +	case OP_GE:							\ +		match = (*addr >= val);					\ +		break;							\ +	default:							\ +		break;							\ +	}								\ +									\ +	return match;							\ +} + +#define DEFINE_EQUALITY_PRED(size)					\ +static int filter_pred_##size(struct filter_pred *pred, void *event,	\ +			      int val1, int val2)			\ +{									\ +	u##size *addr = (u##size *)(event + pred->offset);		\ +	u##size val = (u##size)pred->val;				\ +	int match;							\ +									\ +	match = (val == *addr) ^ pred->not;				\ +									\ +	return match;							\ +} + +extern struct mutex event_mutex; +extern struct list_head ftrace_events;  extern const char *__start___trace_bprintk_fmt[];  extern const char *__stop___trace_bprintk_fmt[]; -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement optimizing out. - */ -#define event_trace_printk(ip, fmt, args...)				\ -do {									\ -	__trace_printk_check_format(fmt, ##args);			\ -	tracing_record_cmdline(current);				\ -	if (__builtin_constant_p(fmt)) {				\ -		static const char *trace_printk_fmt			\ -		  __attribute__((section("__trace_printk_fmt"))) =	\ -			__builtin_constant_p(fmt) ? 
fmt : NULL;		\ -									\ -		__trace_bprintk(ip, trace_printk_fmt, ##args);		\ -	} else								\ -		__trace_printk(ip, fmt, ##args);			\ -} while (0) +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\ +	extern struct ftrace_event_call event_##call; +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) +#include "trace_event_types.h"  #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7a30fc4c364..a29ef23ffb4 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -9,6 +9,7 @@  #include <linux/debugfs.h>  #include <linux/ftrace.h>  #include <linux/kallsyms.h> +#include <linux/time.h>  #include "trace.h"  #include "trace_output.h" @@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)  	trace_assign_type(field, entry);  	call = &field->boot_call;  	ts = iter->ts; -	nsec_rem = do_div(ts, 1000000000); +	nsec_rem = do_div(ts, NSEC_PER_SEC);  	ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",  			(unsigned long)ts, nsec_rem, call->func, call->caller); @@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)  	trace_assign_type(field, entry);  	init_ret = &field->boot_ret;  	ts = iter->ts; -	nsec_rem = do_div(ts, 1000000000); +	nsec_rem = do_div(ts, NSEC_PER_SEC);  	ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "  			"returned %d after %llu msecs\n", diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8333715e406..7a7a9fd249a 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;  static void  probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  { +	struct ftrace_event_call *call = &event_branch;  	struct trace_array *tr = branch_tracer;  	struct ring_buffer_event *event;  	struct trace_branch *entry; @@ -73,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  	entry->line = f->line;  	entry->correct = val == expect; -	ring_buffer_unlock_commit(tr->buffer, event); +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		ring_buffer_unlock_commit(tr->buffer, event);   out:  	atomic_dec(&tr->data[cpu]->disabled); @@ -271,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)  	return 0;  } -static void *annotated_branch_stat_start(void) +static void *annotated_branch_stat_start(struct tracer_stat *trace)  {  	return __start_annotated_branch_profile;  } @@ -346,7 +348,7 @@ static int all_branch_stat_headers(struct seq_file *m)  	return 0;  } -static void *all_branch_stat_start(void) +static void *all_branch_stat_start(struct tracer_stat *trace)  {  	return __start_branch_profile;  } diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 22cba997077..5b5895afecf 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -10,22 +10,30 @@  int ftrace_profile_enable(int event_id)  {  	struct ftrace_event_call *event; +	int ret = -EINVAL; -	for_each_event(event) { -		if (event->id == event_id) -			return event->profile_enable(event); +	mutex_lock(&event_mutex); +	list_for_each_entry(event, &ftrace_events, list) { +		if (event->id == event_id) { +			ret = event->profile_enable(event); +			break; +		}  	} +	mutex_unlock(&event_mutex); -	return -EINVAL; +	return ret;  }  void ftrace_profile_disable(int event_id)  {  	struct 
ftrace_event_call *event; -	for_each_event(event) { -		if (event->id == event_id) -			return event->profile_disable(event); +	mutex_lock(&event_mutex); +	list_for_each_entry(event, &ftrace_events, list) { +		if (event->id == event_id) { +			event->profile_disable(event); +			break; +		}  	} +	mutex_unlock(&event_mutex);  } - diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fd78bee71dd..5e32e375134 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,  	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")  ); -TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, +TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,  	TRACE_STRUCT(  		TRACE_FIELD(unsigned long, arg1, arg1)  		TRACE_FIELD(unsigned long, arg2, arg2) @@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,  TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,  	TRACE_STRUCT(  		TRACE_FIELD(unsigned int, line, line) -		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) -		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) +		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, +				    TRACE_FUNC_SIZE+1, func) +		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, +				    TRACE_FUNC_SIZE+1, file)  		TRACE_FIELD(char, correct, correct)  	),  	TP_RAW_FMT("%u:%s:%s (%u)") @@ -139,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,  TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,  	TRACE_STRUCT( -		TRACE_FIELD(ktime_t, state_data.stamp, stamp) -		TRACE_FIELD(ktime_t, state_data.end, end) +		TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) +		TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)  		TRACE_FIELD(int, state_data.type, type)  		TRACE_FIELD(int, state_data.state, state)  	), diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 576f4fa2af0..aa08be69a1b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -8,19 +8,25 @@   *   */ +#include <linux/workqueue.h> +#include <linux/spinlock.h> +#include <linux/kthread.h>  #include <linux/debugfs.h>  #include <linux/uaccess.h>  #include <linux/module.h>  #include <linux/ctype.h> +#include <linux/delay.h>  #include "trace_output.h"  #define TRACE_SYSTEM "TRACE_SYSTEM" -static DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_mutex); + +LIST_HEAD(ftrace_events);  int trace_define_field(struct ftrace_event_call *call, char *type, -		       char *name, int offset, int size) +		       char *name, int offset, int size, int is_signed)  {  	struct ftrace_event_field *field; @@ -38,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,  	field->offset = offset;  	field->size = size; +	field->is_signed = is_signed;  	list_add(&field->link, &call->fields);  	return 0; @@ -51,47 +58,94 @@ err:  	return -ENOMEM;  } +EXPORT_SYMBOL_GPL(trace_define_field); -static void ftrace_clear_events(void) -{ -	struct ftrace_event_call *call = (void *)__start_ftrace_events; - +#ifdef CONFIG_MODULES -	while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { +static void trace_destroy_fields(struct ftrace_event_call *call) +{ +	struct ftrace_event_field *field, *next; -		if (call->enabled) { -			call->enabled = 0; -			call->unregfunc(); -		} -		call++; +	list_for_each_entry_safe(field, next, &call->fields, link) { +		list_del(&field->link); 
+		kfree(field->type); +		kfree(field->name); +		kfree(field);  	}  } +#endif /* CONFIG_MODULES */ +  static void ftrace_event_enable_disable(struct ftrace_event_call *call,  					int enable)  { -  	switch (enable) {  	case 0:  		if (call->enabled) {  			call->enabled = 0; +			tracing_stop_cmdline_record();  			call->unregfunc();  		}  		break;  	case 1:  		if (!call->enabled) {  			call->enabled = 1; +			tracing_start_cmdline_record();  			call->regfunc();  		}  		break;  	}  } +static void ftrace_clear_events(void) +{ +	struct ftrace_event_call *call; + +	mutex_lock(&event_mutex); +	list_for_each_entry(call, &ftrace_events, list) { +		ftrace_event_enable_disable(call, 0); +	} +	mutex_unlock(&event_mutex); +} + +/* + * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. + */ +static int __ftrace_set_clr_event(const char *match, const char *sub, +				  const char *event, int set) +{ +	struct ftrace_event_call *call; +	int ret = -EINVAL; + +	mutex_lock(&event_mutex); +	list_for_each_entry(call, &ftrace_events, list) { + +		if (!call->name || !call->regfunc) +			continue; + +		if (match && +		    strcmp(match, call->name) != 0 && +		    strcmp(match, call->system) != 0) +			continue; + +		if (sub && strcmp(sub, call->system) != 0) +			continue; + +		if (event && strcmp(event, call->name) != 0) +			continue; + +		ftrace_event_enable_disable(call, set); + +		ret = 0; +	} +	mutex_unlock(&event_mutex); + +	return ret; +} +  static int ftrace_set_clr_event(char *buf, int set)  { -	struct ftrace_event_call *call = __start_ftrace_events;  	char *event = NULL, *sub = NULL, *match; -	int ret = -EINVAL;  	/*  	 * The buf format can be <subsystem>:<event-name> @@ -117,30 +171,24 @@ static int ftrace_set_clr_event(char *buf, int set)  			event = NULL;  	} -	mutex_lock(&event_mutex); -	for_each_event(call) { - -		if (!call->name || !call->regfunc) -			continue; - -		if (match && -		    strcmp(match, call->name) != 0 && -		    strcmp(match, call->system) != 0) -			continue; - -		if (sub && strcmp(sub, call->system) != 0) -			continue; - -		if (event && strcmp(event, call->name) != 0) -			continue; - -		ftrace_event_enable_disable(call, set); - -		ret = 0; -	} -	mutex_unlock(&event_mutex); +	return __ftrace_set_clr_event(match, sub, event, set); +} -	return ret; +/** + * trace_set_clr_event - enable or disable an event + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @set: 1 to enable, 0 to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_set_clr_event(const char *system, const char *event, int set) +{ +	return __ftrace_set_clr_event(NULL, system, event, set);  }  /* 128 should be much more than enough */ @@ -224,15 +272,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  static void *  t_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = m->private; -	struct ftrace_event_call *next = call; +	struct list_head *list = m->private; +	struct ftrace_event_call *call;  	(*pos)++;  	for (;;) { -		if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) +		if (list == &ftrace_events)  			return NULL; +		call = list_entry(list, struct ftrace_event_call, list); +  		/*  		 * The ftrace subsystem is for showing formats only.  		 * They can not be enabled or disabled via the event files. 
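Besides feeding the debugfs set_event interface, the factored-out __ftrace_set_clr_event() gains a non-static wrapper, trace_set_clr_event(), so other kernel code can flip events without writing to debugfs. A hedged usage sketch (the initcall below is hypothetical and assumes a built-in "sched" event subsystem):

	/* Hypothetical caller: switch on every event in the "sched" subsystem. */
	static int __init example_enable_sched_events(void)
	{
		int ret;

		ret = trace_set_clr_event("sched", NULL, 1);
		if (ret)
			pr_warning("no sched trace events to enable\n");
		return 0;
	}
	late_initcall(example_enable_sched_events);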
@@ -240,45 +290,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos)  		if (call->regfunc)  			break; -		call++; -		next = call; +		list = list->next;  	} -	m->private = ++next; +	m->private = list->next;  	return call;  }  static void *t_start(struct seq_file *m, loff_t *pos)  { +	mutex_lock(&event_mutex); +	if (*pos == 0) +		m->private = ftrace_events.next;  	return t_next(m, NULL, pos);  }  static void *  s_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = m->private; -	struct ftrace_event_call *next; +	struct list_head *list = m->private; +	struct ftrace_event_call *call;  	(*pos)++;   retry: -	if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) +	if (list == &ftrace_events)  		return NULL; +	call = list_entry(list, struct ftrace_event_call, list); +  	if (!call->enabled) { -		call++; +		list = list->next;  		goto retry;  	} -	next = call; -	m->private = ++next; +	m->private = list->next;  	return call;  }  static void *s_start(struct seq_file *m, loff_t *pos)  { +	mutex_lock(&event_mutex); +	if (*pos == 0) +		m->private = ftrace_events.next;  	return s_next(m, NULL, pos);  } @@ -295,12 +351,12 @@ static int t_show(struct seq_file *m, void *v)  static void t_stop(struct seq_file *m, void *p)  { +	mutex_unlock(&event_mutex);  }  static int  ftrace_event_seq_open(struct inode *inode, struct file *file)  { -	int ret;  	const struct seq_operations *seq_ops;  	if ((file->f_mode & FMODE_WRITE) && @@ -308,13 +364,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)  		ftrace_clear_events();  	seq_ops = inode->i_private; -	ret = seq_open(file, seq_ops); -	if (!ret) { -		struct seq_file *m = file->private_data; - -		m->private = __start_ftrace_events; -	} -	return ret; +	return seq_open(file, seq_ops);  }  static ssize_t @@ -374,8 +424,93 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  	return cnt;  } +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, +		   loff_t *ppos) +{ +	const char set_to_char[4] = { '?', '0', '1', 'X' }; +	const char *system = filp->private_data; +	struct ftrace_event_call *call; +	char buf[2]; +	int set = 0; +	int ret; + +	mutex_lock(&event_mutex); +	list_for_each_entry(call, &ftrace_events, list) { +		if (!call->name || !call->regfunc) +			continue; + +		if (system && strcmp(call->system, system) != 0) +			continue; + +		/* +		 * We need to find out if all the events are set +		 * or if all events or cleared, or if we have +		 * a mixture. +		 */ +		set |= (1 << !!call->enabled); + +		/* +		 * If we have a mixture, no need to look further. 
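The summary computed in system_enable_read() packs per-event state into two bits: an enabled event sets bit 1, a disabled one sets bit 0, so after the loop set is 1 (all off), 2 (all on), 3 (mixed) or 0 (no matching events), which is exactly what set_to_char[] maps to '0', '1', 'X' and '?'. A standalone illustration of the encoding (ordinary userspace C, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		const char set_to_char[4] = { '?', '0', '1', 'X' };
		int enabled[] = { 0, 1, 0 };	/* a mixture of event states */
		int set = 0;
		unsigned int i;

		for (i = 0; i < sizeof(enabled) / sizeof(enabled[0]); i++)
			set |= 1 << !!enabled[i];

		printf("%c\n", set_to_char[set]);	/* prints 'X' */
		return 0;
	}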
+		 */ +		if (set == 3) +			break; +	} +	mutex_unlock(&event_mutex); + +	buf[0] = set_to_char[set]; +	buf[1] = '\n'; + +	ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + +	return ret; +} + +static ssize_t +system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, +		    loff_t *ppos) +{ +	const char *system = filp->private_data; +	unsigned long val; +	char buf[64]; +	ssize_t ret; + +	if (cnt >= sizeof(buf)) +		return -EINVAL; + +	if (copy_from_user(&buf, ubuf, cnt)) +		return -EFAULT; + +	buf[cnt] = 0; + +	ret = strict_strtoul(buf, 10, &val); +	if (ret < 0) +		return ret; + +	ret = tracing_update_buffers(); +	if (ret < 0) +		return ret; + +	if (val != 0 && val != 1) +		return -EINVAL; + +	ret = __ftrace_set_clr_event(NULL, system, NULL, val); +	if (ret) +		goto out; + +	ret = cnt; + +out: +	*ppos += cnt; + +	return ret; +} + +extern char *__bad_type_size(void); +  #undef FIELD  #define FIELD(type, name)						\ +	sizeof(type) != sizeof(field.name) ? __bad_type_size() :	\  	#type, "common_" #name, offsetof(typeof(field), name),		\  		sizeof(field.name) @@ -391,7 +526,7 @@ static int trace_write_header(struct trace_seq *s)  				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"  				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"  				"\n", -				FIELD(unsigned char, type), +				FIELD(unsigned short, type),  				FIELD(unsigned char, flags),  				FIELD(unsigned char, preempt_count),  				FIELD(int, pid), @@ -481,7 +616,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,  	trace_seq_init(s); -	filter_print_preds(call->preds, s); +	print_event_filter(call, s);  	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);  	kfree(s); @@ -494,38 +629,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  		   loff_t *ppos)  {  	struct ftrace_event_call *call = filp->private_data; -	char buf[64], *pbuf = buf; -	struct filter_pred *pred; +	char *buf;  	int err; -	if (cnt >= sizeof(buf)) +	if (cnt >= PAGE_SIZE)  		return -EINVAL; -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; -	buf[cnt] = '\0'; - -	pred = kzalloc(sizeof(*pred), GFP_KERNEL); -	if (!pred) +	buf = (char *)__get_free_page(GFP_TEMPORARY); +	if (!buf)  		return -ENOMEM; -	err = filter_parse(&pbuf, pred); -	if (err < 0) { -		filter_free_pred(pred); -		return err; -	} - -	if (pred->clear) { -		filter_free_preds(call); -		filter_free_pred(pred); -		return cnt; +	if (copy_from_user(buf, ubuf, cnt)) { +		free_page((unsigned long) buf); +		return -EFAULT;  	} +	buf[cnt] = '\0'; -	err = filter_add_pred(call, pred); -	if (err < 0) { -		filter_free_pred(pred); +	err = apply_event_filter(call, buf); +	free_page((unsigned long) buf); +	if (err < 0)  		return err; -	}  	*ppos += cnt; @@ -549,7 +672,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,  	trace_seq_init(s); -	filter_print_preds(system->preds, s); +	print_subsystem_event_filter(system, s);  	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);  	kfree(s); @@ -562,45 +685,56 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  		       loff_t *ppos)  {  	struct event_subsystem *system = filp->private_data; -	char buf[64], *pbuf = buf; -	struct filter_pred *pred; +	char *buf;  	int err; -	if (cnt >= sizeof(buf)) +	if (cnt >= PAGE_SIZE)  		return -EINVAL; -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; -	buf[cnt] = '\0'; - -	pred = kzalloc(sizeof(*pred), GFP_KERNEL); -	if (!pred) +	buf = (char *)__get_free_page(GFP_TEMPORARY); +	if (!buf)  		
return -ENOMEM; -	err = filter_parse(&pbuf, pred); -	if (err < 0) { -		filter_free_pred(pred); -		return err; -	} - -	if (pred->clear) { -		filter_free_subsystem_preds(system); -		filter_free_pred(pred); -		return cnt; +	if (copy_from_user(buf, ubuf, cnt)) { +		free_page((unsigned long) buf); +		return -EFAULT;  	} +	buf[cnt] = '\0'; -	err = filter_add_subsystem_pred(system, pred); -	if (err < 0) { -		filter_free_subsystem_preds(system); -		filter_free_pred(pred); +	err = apply_subsystem_event_filter(system, buf); +	free_page((unsigned long) buf); +	if (err < 0)  		return err; -	}  	*ppos += cnt;  	return cnt;  } +static ssize_t +show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ +	int (*func)(struct trace_seq *s) = filp->private_data; +	struct trace_seq *s; +	int r; + +	if (*ppos) +		return 0; + +	s = kmalloc(sizeof(*s), GFP_KERNEL); +	if (!s) +		return -ENOMEM; + +	trace_seq_init(s); + +	func(s); +	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + +	kfree(s); + +	return r; +} +  static const struct seq_operations show_event_seq_ops = {  	.start = t_start,  	.next = t_next, @@ -658,6 +792,17 @@ static const struct file_operations ftrace_subsystem_filter_fops = {  	.write = subsystem_filter_write,  }; +static const struct file_operations ftrace_system_enable_fops = { +	.open = tracing_open_generic, +	.read = system_enable_read, +	.write = system_enable_write, +}; + +static const struct file_operations ftrace_show_header_fops = { +	.open = tracing_open_generic, +	.read = show_header, +}; +  static struct dentry *event_trace_events_dir(void)  {  	static struct dentry *d_tracer; @@ -684,6 +829,7 @@ static struct dentry *  event_subsystem_dir(const char *name, struct dentry *d_events)  {  	struct event_subsystem *system; +	struct dentry *entry;  	/* First see if we did not already create this dir */  	list_for_each_entry(system, &event_subsystems, list) { @@ -707,16 +853,46 @@ event_subsystem_dir(const char *name, struct dentry *d_events)  		return d_events;  	} -	system->name = name; +	system->name = kstrdup(name, GFP_KERNEL); +	if (!system->name) { +		debugfs_remove(system->entry); +		kfree(system); +		return d_events; +	} +  	list_add(&system->list, &event_subsystems); -	system->preds = NULL; +	system->filter = NULL; + +	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); +	if (!system->filter) { +		pr_warning("Could not allocate filter for subsystem " +			   "'%s'\n", name); +		return system->entry; +	} + +	entry = debugfs_create_file("filter", 0644, system->entry, system, +				    &ftrace_subsystem_filter_fops); +	if (!entry) { +		kfree(system->filter); +		system->filter = NULL; +		pr_warning("Could not create debugfs " +			   "'%s/filter' entry\n", name); +	} + +	entry = trace_create_file("enable", 0644, system->entry, +				  (void *)system->name, +				  &ftrace_system_enable_fops);  	return system->entry;  }  static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) +event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, +		 const struct file_operations *id, +		 const struct file_operations *enable, +		 const struct file_operations *filter, +		 const struct file_operations *format)  {  	struct dentry *entry;  	int ret; @@ -725,7 +901,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)  	 * If the trace point header did not define TRACE_SYSTEM  	 * then the system would be called "TRACE_SYSTEM".  	 
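Both filter write handlers above follow the same recipe: refuse writes of PAGE_SIZE or more, copy the user string into a temporary page, NUL-terminate it, and hand it to apply_event_filter() or apply_subsystem_event_filter(). Condensed into a single helper for illustration only (the helper itself is not part of the patch):

	/* Illustrative only: the common copy-in step shared by the two handlers. */
	static int copy_filter_string(const char __user *ubuf, size_t cnt, char **out)
	{
		char *buf;

		if (cnt >= PAGE_SIZE)
			return -EINVAL;

		buf = (char *)__get_free_page(GFP_TEMPORARY);
		if (!buf)
			return -ENOMEM;

		if (copy_from_user(buf, ubuf, cnt)) {
			free_page((unsigned long)buf);
			return -EFAULT;
		}
		buf[cnt] = '\0';

		*out = buf;
		return 0;
	}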
*/ -	if (strcmp(call->system, "TRACE_SYSTEM") != 0) +	if (strcmp(call->system, TRACE_SYSTEM) != 0)  		d_events = event_subsystem_dir(call->system, d_events);  	if (call->raw_init) { @@ -744,21 +920,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)  		return -1;  	} -	if (call->regfunc) { -		entry = debugfs_create_file("enable", 0644, call->dir, call, -					    &ftrace_enable_fops); -		if (!entry) -			pr_warning("Could not create debugfs " -				   "'%s/enable' entry\n", call->name); -	} +	if (call->regfunc) +		entry = trace_create_file("enable", 0644, call->dir, call, +					  enable); -	if (call->id) { -		entry = debugfs_create_file("id", 0444, call->dir, call, -				&ftrace_event_id_fops); -		if (!entry) -			pr_warning("Could not create debugfs '%s/id' entry\n", -					call->name); -	} +	if (call->id) +		entry = trace_create_file("id", 0444, call->dir, call, +					  id);  	if (call->define_fields) {  		ret = call->define_fields(); @@ -767,32 +935,195 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)  				   " events/%s\n", call->name);  			return ret;  		} -		entry = debugfs_create_file("filter", 0644, call->dir, call, -					    &ftrace_event_filter_fops); -		if (!entry) -			pr_warning("Could not create debugfs " -				   "'%s/filter' entry\n", call->name); +		entry = trace_create_file("filter", 0644, call->dir, call, +					  filter);  	}  	/* A trace may not want to export its format */  	if (!call->show_format)  		return 0; -	entry = debugfs_create_file("format", 0444, call->dir, call, -				    &ftrace_event_format_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'%s/format' entry\n", call->name); +	entry = trace_create_file("format", 0444, call->dir, call, +				  format);  	return 0;  } +#define for_each_event(event, start, end)			\ +	for (event = start;					\ +	     (unsigned long)event < (unsigned long)end;		\ +	     event++) + +#ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { +	struct list_head		list; +	struct module			*mod; +	struct file_operations		id; +	struct file_operations		enable; +	struct file_operations		format; +	struct file_operations		filter; +}; + +static struct ftrace_module_file_ops * +trace_create_file_ops(struct module *mod) +{ +	struct ftrace_module_file_ops *file_ops; + +	/* +	 * This is a bit of a PITA. To allow for correct reference +	 * counting, modules must "own" their file_operations. +	 * To do this, we allocate the file operations that will be +	 * used in the event directory. 
+	 */ + +	file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); +	if (!file_ops) +		return NULL; + +	file_ops->mod = mod; + +	file_ops->id = ftrace_event_id_fops; +	file_ops->id.owner = mod; + +	file_ops->enable = ftrace_enable_fops; +	file_ops->enable.owner = mod; + +	file_ops->filter = ftrace_event_filter_fops; +	file_ops->filter.owner = mod; + +	file_ops->format = ftrace_event_format_fops; +	file_ops->format.owner = mod; + +	list_add(&file_ops->list, &ftrace_module_file_list); + +	return file_ops; +} + +static void trace_module_add_events(struct module *mod) +{ +	struct ftrace_module_file_ops *file_ops = NULL; +	struct ftrace_event_call *call, *start, *end; +	struct dentry *d_events; + +	start = mod->trace_events; +	end = mod->trace_events + mod->num_trace_events; + +	if (start == end) +		return; + +	d_events = event_trace_events_dir(); +	if (!d_events) +		return; + +	for_each_event(call, start, end) { +		/* The linker may leave blanks */ +		if (!call->name) +			continue; + +		/* +		 * This module has events, create file ops for this module +		 * if not already done. +		 */ +		if (!file_ops) { +			file_ops = trace_create_file_ops(mod); +			if (!file_ops) +				return; +		} +		call->mod = mod; +		list_add(&call->list, &ftrace_events); +		event_create_dir(call, d_events, +				 &file_ops->id, &file_ops->enable, +				 &file_ops->filter, &file_ops->format); +	} +} + +static void trace_module_remove_events(struct module *mod) +{ +	struct ftrace_module_file_ops *file_ops; +	struct ftrace_event_call *call, *p; +	bool found = false; + +	down_write(&trace_event_mutex); +	list_for_each_entry_safe(call, p, &ftrace_events, list) { +		if (call->mod == mod) { +			found = true; +			ftrace_event_enable_disable(call, 0); +			if (call->event) +				__unregister_ftrace_event(call->event); +			debugfs_remove_recursive(call->dir); +			list_del(&call->list); +			trace_destroy_fields(call); +			destroy_preds(call); +		} +	} + +	/* Now free the file_operations */ +	list_for_each_entry(file_ops, &ftrace_module_file_list, list) { +		if (file_ops->mod == mod) +			break; +	} +	if (&file_ops->list != &ftrace_module_file_list) { +		list_del(&file_ops->list); +		kfree(file_ops); +	} + +	/* +	 * It is safest to reset the ring buffer if the module being unloaded +	 * registered any events. 
+	 */ +	if (found) +		tracing_reset_current_online_cpus(); +	up_write(&trace_event_mutex); +} + +static int trace_module_notify(struct notifier_block *self, +			       unsigned long val, void *data) +{ +	struct module *mod = data; + +	mutex_lock(&event_mutex); +	switch (val) { +	case MODULE_STATE_COMING: +		trace_module_add_events(mod); +		break; +	case MODULE_STATE_GOING: +		trace_module_remove_events(mod); +		break; +	} +	mutex_unlock(&event_mutex); + +	return 0; +} +#else +static int trace_module_notify(struct notifier_block *self, +			       unsigned long val, void *data) +{ +	return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block trace_module_nb = { +	.notifier_call = trace_module_notify, +	.priority = 0, +}; + +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; +  static __init int event_trace_init(void)  { -	struct ftrace_event_call *call = __start_ftrace_events; +	struct ftrace_event_call *call;  	struct dentry *d_tracer;  	struct dentry *entry;  	struct dentry *d_events; +	int ret;  	d_tracer = tracing_init_dentry();  	if (!d_tracer) @@ -816,13 +1147,243 @@ static __init int event_trace_init(void)  	if (!d_events)  		return 0; -	for_each_event(call) { +	/* ring buffer internal formats */ +	trace_create_file("header_page", 0444, d_events, +			  ring_buffer_print_page_header, +			  &ftrace_show_header_fops); + +	trace_create_file("header_event", 0444, d_events, +			  ring_buffer_print_entry_header, +			  &ftrace_show_header_fops); + +	trace_create_file("enable", 0644, d_events, +			  NULL, &ftrace_system_enable_fops); + +	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {  		/* The linker may leave blanks */  		if (!call->name)  			continue; -		event_create_dir(call, d_events); +		list_add(&call->list, &ftrace_events); +		event_create_dir(call, d_events, &ftrace_event_id_fops, +				 &ftrace_enable_fops, &ftrace_event_filter_fops, +				 &ftrace_event_format_fops);  	} +	ret = register_module_notifier(&trace_module_nb); +	if (ret) +		pr_warning("Failed to register trace events module notifier\n"); +  	return 0;  }  fs_initcall(event_trace_init); + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_SPINLOCK(test_spinlock_irq); +static DEFINE_MUTEX(test_mutex); + +static __init void test_work(struct work_struct *dummy) +{ +	spin_lock(&test_spinlock); +	spin_lock_irq(&test_spinlock_irq); +	udelay(1); +	spin_unlock_irq(&test_spinlock_irq); +	spin_unlock(&test_spinlock); + +	mutex_lock(&test_mutex); +	msleep(1); +	mutex_unlock(&test_mutex); +} + +static __init int event_test_thread(void *unused) +{ +	void *test_malloc; + +	test_malloc = kmalloc(1234, GFP_KERNEL); +	if (!test_malloc) +		pr_info("failed to kmalloc\n"); + +	schedule_on_each_cpu(test_work); + +	kfree(test_malloc); + +	set_current_state(TASK_INTERRUPTIBLE); +	while (!kthread_should_stop()) +		schedule(); + +	return 0; +} + +/* + * Do various things that may trigger events. + */ +static __init void event_test_stuff(void) +{ +	struct task_struct *test_thread; + +	test_thread = kthread_run(event_test_thread, NULL, "test-events"); +	msleep(1); +	kthread_stop(test_thread); +} + +/* + * For every trace event defined, we will test each trace point separately, + * and then by groups, and finally all trace points. 
+ */ +static __init void event_trace_self_tests(void) +{ +	struct ftrace_event_call *call; +	struct event_subsystem *system; +	int ret; + +	pr_info("Running tests on trace events:\n"); + +	list_for_each_entry(call, &ftrace_events, list) { + +		/* Only test those that have a regfunc */ +		if (!call->regfunc) +			continue; + +		pr_info("Testing event %s: ", call->name); + +		/* +		 * If an event is already enabled, someone is using +		 * it and the self test should not be on. +		 */ +		if (call->enabled) { +			pr_warning("Enabled event during self test!\n"); +			WARN_ON_ONCE(1); +			continue; +		} + +		ftrace_event_enable_disable(call, 1); +		event_test_stuff(); +		ftrace_event_enable_disable(call, 0); + +		pr_cont("OK\n"); +	} + +	/* Now test at the sub system level */ + +	pr_info("Running tests on trace event systems:\n"); + +	list_for_each_entry(system, &event_subsystems, list) { + +		/* the ftrace system is special, skip it */ +		if (strcmp(system->name, "ftrace") == 0) +			continue; + +		pr_info("Testing event system %s: ", system->name); + +		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); +		if (WARN_ON_ONCE(ret)) { +			pr_warning("error enabling system %s\n", +				   system->name); +			continue; +		} + +		event_test_stuff(); + +		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); +		if (WARN_ON_ONCE(ret)) +			pr_warning("error disabling system %s\n", +				   system->name); + +		pr_cont("OK\n"); +	} + +	/* Test with all events enabled */ + +	pr_info("Running tests on all trace events:\n"); +	pr_info("Testing all events: "); + +	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); +	if (WARN_ON_ONCE(ret)) { +		pr_warning("error enabling all events\n"); +		return; +	} + +	event_test_stuff(); + +	/* reset sysname */ +	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); +	if (WARN_ON_ONCE(ret)) { +		pr_warning("error disabling all events\n"); +		return; +	} + +	pr_cont("OK\n"); +} + +#ifdef CONFIG_FUNCTION_TRACER + +static DEFINE_PER_CPU(atomic_t, test_event_disable); + +static void +function_test_events_call(unsigned long ip, unsigned long parent_ip) +{ +	struct ring_buffer_event *event; +	struct ftrace_entry *entry; +	unsigned long flags; +	long disabled; +	int resched; +	int cpu; +	int pc; + +	pc = preempt_count(); +	resched = ftrace_preempt_disable(); +	cpu = raw_smp_processor_id(); +	disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + +	if (disabled != 1) +		goto out; + +	local_save_flags(flags); + +	event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), +						  flags, pc); +	if (!event) +		goto out; +	entry	= ring_buffer_event_data(event); +	entry->ip			= ip; +	entry->parent_ip		= parent_ip; + +	trace_nowake_buffer_unlock_commit(event, flags, pc); + + out: +	atomic_dec(&per_cpu(test_event_disable, cpu)); +	ftrace_preempt_enable(resched); +} + +static struct ftrace_ops trace_ops __initdata  = +{ +	.func = function_test_events_call, +}; + +static __init void event_trace_self_test_with_function(void) +{ +	register_ftrace_function(&trace_ops); +	pr_info("Running tests again, along with the function tracer\n"); +	event_trace_self_tests(); +	unregister_ftrace_function(&trace_ops); +} +#else +static __init void event_trace_self_test_with_function(void) +{ +} +#endif + +static __init int event_trace_self_tests_init(void) +{ + +	event_trace_self_tests(); + +	event_trace_self_test_with_function(); + +	return 0; +} + +late_initcall(event_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_events_filter.c 
b/kernel/trace/trace_events_filter.c index e03cbf1e38f..db6e54bdb59 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,57 +22,164 @@  #include <linux/uaccess.h>  #include <linux/module.h>  #include <linux/ctype.h> +#include <linux/mutex.h>  #include "trace.h"  #include "trace_output.h" -static int filter_pred_64(struct filter_pred *pred, void *event) +static DEFINE_MUTEX(filter_mutex); + +enum filter_op_ids  { -	u64 *addr = (u64 *)(event + pred->offset); -	u64 val = (u64)pred->val; -	int match; +	OP_OR, +	OP_AND, +	OP_NE, +	OP_EQ, +	OP_LT, +	OP_LE, +	OP_GT, +	OP_GE, +	OP_NONE, +	OP_OPEN_PAREN, +}; -	match = (val == *addr) ^ pred->not; +struct filter_op { +	int id; +	char *string; +	int precedence; +}; -	return match; -} +static struct filter_op filter_ops[] = { +	{ OP_OR, "||", 1 }, +	{ OP_AND, "&&", 2 }, +	{ OP_NE, "!=", 4 }, +	{ OP_EQ, "==", 4 }, +	{ OP_LT, "<", 5 }, +	{ OP_LE, "<=", 5 }, +	{ OP_GT, ">", 5 }, +	{ OP_GE, ">=", 5 }, +	{ OP_NONE, "OP_NONE", 0 }, +	{ OP_OPEN_PAREN, "(", 0 }, +}; -static int filter_pred_32(struct filter_pred *pred, void *event) -{ -	u32 *addr = (u32 *)(event + pred->offset); -	u32 val = (u32)pred->val; -	int match; +enum { +	FILT_ERR_NONE, +	FILT_ERR_INVALID_OP, +	FILT_ERR_UNBALANCED_PAREN, +	FILT_ERR_TOO_MANY_OPERANDS, +	FILT_ERR_OPERAND_TOO_LONG, +	FILT_ERR_FIELD_NOT_FOUND, +	FILT_ERR_ILLEGAL_FIELD_OP, +	FILT_ERR_ILLEGAL_INTVAL, +	FILT_ERR_BAD_SUBSYS_FILTER, +	FILT_ERR_TOO_MANY_PREDS, +	FILT_ERR_MISSING_FIELD, +	FILT_ERR_INVALID_FILTER, +}; -	match = (val == *addr) ^ pred->not; +static char *err_text[] = { +	"No error", +	"Invalid operator", +	"Unbalanced parens", +	"Too many operands", +	"Operand too long", +	"Field not found", +	"Illegal operation for field type", +	"Illegal integer value", +	"Couldn't find or set field in one of a subsystem's events", +	"Too many terms in predicate expression", +	"Missing field name and/or value", +	"Meaningless filter expression", +}; -	return match; -} +struct opstack_op { +	int op; +	struct list_head list; +}; -static int filter_pred_16(struct filter_pred *pred, void *event) -{ -	u16 *addr = (u16 *)(event + pred->offset); -	u16 val = (u16)pred->val; -	int match; +struct postfix_elt { +	int op; +	char *operand; +	struct list_head list; +}; -	match = (val == *addr) ^ pred->not; +struct filter_parse_state { +	struct filter_op *ops; +	struct list_head opstack; +	struct list_head postfix; +	int lasterr; +	int lasterr_pos; -	return match; +	struct { +		char *string; +		unsigned int cnt; +		unsigned int tail; +	} infix; + +	struct { +		char string[MAX_FILTER_STR_VAL]; +		int pos; +		unsigned int tail; +	} operand; +}; + +DEFINE_COMPARISON_PRED(s64); +DEFINE_COMPARISON_PRED(u64); +DEFINE_COMPARISON_PRED(s32); +DEFINE_COMPARISON_PRED(u32); +DEFINE_COMPARISON_PRED(s16); +DEFINE_COMPARISON_PRED(u16); +DEFINE_COMPARISON_PRED(s8); +DEFINE_COMPARISON_PRED(u8); + +DEFINE_EQUALITY_PRED(64); +DEFINE_EQUALITY_PRED(32); +DEFINE_EQUALITY_PRED(16); +DEFINE_EQUALITY_PRED(8); + +static int filter_pred_and(struct filter_pred *pred __attribute((unused)), +			   void *event __attribute((unused)), +			   int val1, int val2) +{ +	return val1 && val2;  } -static int filter_pred_8(struct filter_pred *pred, void *event) +static int filter_pred_or(struct filter_pred *pred __attribute((unused)), +			  void *event __attribute((unused)), +			  int val1, int val2)  { -	u8 *addr = (u8 *)(event + pred->offset); -	u8 val = (u8)pred->val; -	int match; +	return val1 || val2; +} + +/* Filter predicate for fixed 
sized arrays of characters */ +static int filter_pred_string(struct filter_pred *pred, void *event, +			      int val1, int val2) +{ +	char *addr = (char *)(event + pred->offset); +	int cmp, match; + +	cmp = strncmp(addr, pred->str_val, pred->str_len); -	match = (val == *addr) ^ pred->not; +	match = (!cmp) ^ pred->not;  	return match;  } -static int filter_pred_string(struct filter_pred *pred, void *event) +/* + * Filter predicate for dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry. + * Also each of these strings have a field in the entry which + * contains its offset from the beginning of the entry. + * We have then first to get this field, dereference it + * and add it to the address of the entry, and at last we have + * the address of the string. + */ +static int filter_pred_strloc(struct filter_pred *pred, void *event, +			      int val1, int val2)  { -	char *addr = (char *)(event + pred->offset); +	int str_loc = *(int *)(event + pred->offset); +	char *addr = (char *)(event + str_loc);  	int cmp, match;  	cmp = strncmp(addr, pred->str_val, pred->str_len); @@ -82,59 +189,130 @@ static int filter_pred_string(struct filter_pred *pred, void *event)  	return match;  } +static int filter_pred_none(struct filter_pred *pred, void *event, +			    int val1, int val2) +{ +	return 0; +} +  /* return 1 if event matches, 0 otherwise (discard) */  int filter_match_preds(struct ftrace_event_call *call, void *rec)  { -	int i, matched, and_failed = 0; +	struct event_filter *filter = call->filter; +	int match, top = 0, val1 = 0, val2 = 0; +	int stack[MAX_FILTER_PRED];  	struct filter_pred *pred; +	int i; -	for (i = 0; i < MAX_FILTER_PRED; i++) { -		if (call->preds[i]) { -			pred = call->preds[i]; -			if (and_failed && !pred->or) -				continue; -			matched = pred->fn(pred, rec); -			if (!matched && !pred->or) { -				and_failed = 1; -				continue; -			} else if (matched && pred->or) -				return 1; -		} else -			break; +	for (i = 0; i < filter->n_preds; i++) { +		pred = filter->preds[i]; +		if (!pred->pop_n) { +			match = pred->fn(pred, rec, val1, val2); +			stack[top++] = match; +			continue; +		} +		if (pred->pop_n > top) { +			WARN_ON_ONCE(1); +			return 0; +		} +		val1 = stack[--top]; +		val2 = stack[--top]; +		match = pred->fn(pred, rec, val1, val2); +		stack[top++] = match;  	} -	if (and_failed) -		return 0; +	return stack[--top]; +} +EXPORT_SYMBOL_GPL(filter_match_preds); -	return 1; +static void parse_error(struct filter_parse_state *ps, int err, int pos) +{ +	ps->lasterr = err; +	ps->lasterr_pos = pos;  } -void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) +static void remove_filter_string(struct event_filter *filter)  { -	char *field_name; -	struct filter_pred *pred; -	int i; +	kfree(filter->filter_string); +	filter->filter_string = NULL; +} -	if (!preds) { -		trace_seq_printf(s, "none\n"); +static int replace_filter_string(struct event_filter *filter, +				 char *filter_string) +{ +	kfree(filter->filter_string); +	filter->filter_string = kstrdup(filter_string, GFP_KERNEL); +	if (!filter->filter_string) +		return -ENOMEM; + +	return 0; +} + +static int append_filter_string(struct event_filter *filter, +				char *string) +{ +	int newlen; +	char *new_filter_string; + +	BUG_ON(!filter->filter_string); +	newlen = strlen(filter->filter_string) + strlen(string) + 1; +	new_filter_string = kmalloc(newlen, GFP_KERNEL); +	if (!new_filter_string) +		return -ENOMEM; + +	strcpy(new_filter_string, filter->filter_string); +	
strcat(new_filter_string, string); +	kfree(filter->filter_string); +	filter->filter_string = new_filter_string; + +	return 0; +} + +static void append_filter_err(struct filter_parse_state *ps, +			      struct event_filter *filter) +{ +	int pos = ps->lasterr_pos; +	char *buf, *pbuf; + +	buf = (char *)__get_free_page(GFP_TEMPORARY); +	if (!buf)  		return; -	} -	for (i = 0; i < MAX_FILTER_PRED; i++) { -		if (preds[i]) { -			pred = preds[i]; -			field_name = pred->field_name; -			if (i) -				trace_seq_printf(s, pred->or ? "|| " : "&& "); -			trace_seq_printf(s, "%s ", field_name); -			trace_seq_printf(s, pred->not ? "!= " : "== "); -			if (pred->str_val) -				trace_seq_printf(s, "%s\n", pred->str_val); -			else -				trace_seq_printf(s, "%llu\n", pred->val); -		} else -			break; -	} +	append_filter_string(filter, "\n"); +	memset(buf, ' ', PAGE_SIZE); +	if (pos > PAGE_SIZE - 128) +		pos = 0; +	buf[pos] = '^'; +	pbuf = &buf[pos] + 1; + +	sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); +	append_filter_string(filter, buf); +	free_page((unsigned long) buf); +} + +void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) +{ +	struct event_filter *filter = call->filter; + +	mutex_lock(&filter_mutex); +	if (filter->filter_string) +		trace_seq_printf(s, "%s\n", filter->filter_string); +	else +		trace_seq_printf(s, "none\n"); +	mutex_unlock(&filter_mutex); +} + +void print_subsystem_event_filter(struct event_subsystem *system, +				  struct trace_seq *s) +{ +	struct event_filter *filter = system->filter; + +	mutex_lock(&filter_mutex); +	if (filter->filter_string) +		trace_seq_printf(s, "%s\n", filter->filter_string); +	else +		trace_seq_printf(s, "none\n"); +	mutex_unlock(&filter_mutex);  }  static struct ftrace_event_field * @@ -150,284 +328,828 @@ find_event_field(struct ftrace_event_call *call, char *name)  	return NULL;  } -void filter_free_pred(struct filter_pred *pred) +static void filter_free_pred(struct filter_pred *pred)  {  	if (!pred)  		return;  	kfree(pred->field_name); -	kfree(pred->str_val);  	kfree(pred);  } -void filter_free_preds(struct ftrace_event_call *call) +static void filter_clear_pred(struct filter_pred *pred)  { -	int i; +	kfree(pred->field_name); +	pred->field_name = NULL; +	pred->str_len = 0; +} -	if (call->preds) { -		for (i = 0; i < MAX_FILTER_PRED; i++) -			filter_free_pred(call->preds[i]); -		kfree(call->preds); -		call->preds = NULL; +static int filter_set_pred(struct filter_pred *dest, +			   struct filter_pred *src, +			   filter_pred_fn_t fn) +{ +	*dest = *src; +	if (src->field_name) { +		dest->field_name = kstrdup(src->field_name, GFP_KERNEL); +		if (!dest->field_name) +			return -ENOMEM;  	} +	dest->fn = fn; + +	return 0;  } -void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_disable_preds(struct ftrace_event_call *call)  { -	struct ftrace_event_call *call = __start_ftrace_events; +	struct event_filter *filter = call->filter;  	int i; -	if (system->preds) { -		for (i = 0; i < MAX_FILTER_PRED; i++) -			filter_free_pred(system->preds[i]); -		kfree(system->preds); -		system->preds = NULL; -	} +	call->filter_active = 0; +	filter->n_preds = 0; -	events_for_each(call) { -		if (!call->name || !call->regfunc) -			continue; +	for (i = 0; i < MAX_FILTER_PRED; i++) +		filter->preds[i]->fn = filter_pred_none; +} -		if (!strcmp(call->system, system->name)) -			filter_free_preds(call); +void destroy_preds(struct ftrace_event_call *call) +{ +	struct event_filter *filter = call->filter; +	int i; + +	for (i = 0; i < 
MAX_FILTER_PRED; i++) { +		if (filter->preds[i]) +			filter_free_pred(filter->preds[i]);  	} +	kfree(filter->preds); +	kfree(filter); +	call->filter = NULL;  } -static int __filter_add_pred(struct ftrace_event_call *call, -			     struct filter_pred *pred) +int init_preds(struct ftrace_event_call *call)  { +	struct event_filter *filter; +	struct filter_pred *pred;  	int i; -	if (call->preds && !pred->compound) -		filter_free_preds(call); +	filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); +	if (!call->filter) +		return -ENOMEM; + +	call->filter_active = 0; +	filter->n_preds = 0; -	if (!call->preds) { -		call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), -				      GFP_KERNEL); -		if (!call->preds) -			return -ENOMEM; -	} +	filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); +	if (!filter->preds) +		goto oom;  	for (i = 0; i < MAX_FILTER_PRED; i++) { -		if (!call->preds[i]) { -			call->preds[i] = pred; -			return 0; +		pred = kzalloc(sizeof(*pred), GFP_KERNEL); +		if (!pred) +			goto oom; +		pred->fn = filter_pred_none; +		filter->preds[i] = pred; +	} + +	return 0; + +oom: +	destroy_preds(call); + +	return -ENOMEM; +} +EXPORT_SYMBOL_GPL(init_preds); + +static void filter_free_subsystem_preds(struct event_subsystem *system) +{ +	struct event_filter *filter = system->filter; +	struct ftrace_event_call *call; +	int i; + +	if (filter->n_preds) { +		for (i = 0; i < filter->n_preds; i++) +			filter_free_pred(filter->preds[i]); +		kfree(filter->preds); +		filter->preds = NULL; +		filter->n_preds = 0; +	} + +	mutex_lock(&event_mutex); +	list_for_each_entry(call, &ftrace_events, list) { +		if (!call->define_fields) +			continue; + +		if (!strcmp(call->system, system->name)) { +			filter_disable_preds(call); +			remove_filter_string(call->filter);  		}  	} +	mutex_unlock(&event_mutex); +} + +static int filter_add_pred_fn(struct filter_parse_state *ps, +			      struct ftrace_event_call *call, +			      struct filter_pred *pred, +			      filter_pred_fn_t fn) +{ +	struct event_filter *filter = call->filter; +	int idx, err; + +	if (filter->n_preds == MAX_FILTER_PRED) { +		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); +		return -ENOSPC; +	} + +	idx = filter->n_preds; +	filter_clear_pred(filter->preds[idx]); +	err = filter_set_pred(filter->preds[idx], pred, fn); +	if (err) +		return err; -	return -ENOSPC; +	filter->n_preds++; +	call->filter_active = 1; + +	return 0;  } +enum { +	FILTER_STATIC_STRING = 1, +	FILTER_DYN_STRING +}; +  static int is_string_field(const char *type)  { +	if (strstr(type, "__data_loc") && strstr(type, "char")) +		return FILTER_DYN_STRING; +  	if (strchr(type, '[') && strstr(type, "char")) -		return 1; +		return FILTER_STATIC_STRING;  	return 0;  } -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +static int is_legal_op(struct ftrace_event_field *field, int op)  { -	struct ftrace_event_field *field; - -	field = find_event_field(call, pred->field_name); -	if (!field) -		return -EINVAL; +	if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) +		return 0; -	pred->offset = field->offset; +	return 1; +} -	if (is_string_field(field->type)) { -		if (!pred->str_val) -			return -EINVAL; -		pred->fn = filter_pred_string; -		pred->str_len = field->size; -		return __filter_add_pred(call, pred); -	} else { -		if (pred->str_val) -			return -EINVAL; -	} +static filter_pred_fn_t select_comparison_fn(int op, int field_size, +					     int field_is_signed) +{ +	filter_pred_fn_t fn = NULL; -	switch (field->size) { +	switch 
(field_size) {  	case 8: -		pred->fn = filter_pred_64; +		if (op == OP_EQ || op == OP_NE) +			fn = filter_pred_64; +		else if (field_is_signed) +			fn = filter_pred_s64; +		else +			fn = filter_pred_u64;  		break;  	case 4: -		pred->fn = filter_pred_32; +		if (op == OP_EQ || op == OP_NE) +			fn = filter_pred_32; +		else if (field_is_signed) +			fn = filter_pred_s32; +		else +			fn = filter_pred_u32;  		break;  	case 2: -		pred->fn = filter_pred_16; +		if (op == OP_EQ || op == OP_NE) +			fn = filter_pred_16; +		else if (field_is_signed) +			fn = filter_pred_s16; +		else +			fn = filter_pred_u16;  		break;  	case 1: -		pred->fn = filter_pred_8; +		if (op == OP_EQ || op == OP_NE) +			fn = filter_pred_8; +		else if (field_is_signed) +			fn = filter_pred_s8; +		else +			fn = filter_pred_u8;  		break; -	default: -		return -EINVAL;  	} -	return __filter_add_pred(call, pred); +	return fn;  } -static struct filter_pred *copy_pred(struct filter_pred *pred) +static int filter_add_pred(struct filter_parse_state *ps, +			   struct ftrace_event_call *call, +			   struct filter_pred *pred)  { -	struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); -	if (!new_pred) -		return NULL; +	struct ftrace_event_field *field; +	filter_pred_fn_t fn; +	unsigned long long val; +	int string_type; -	memcpy(new_pred, pred, sizeof(*pred)); +	pred->fn = filter_pred_none; -	if (pred->field_name) { -		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); -		if (!new_pred->field_name) { -			kfree(new_pred); -			return NULL; -		} +	if (pred->op == OP_AND) { +		pred->pop_n = 2; +		return filter_add_pred_fn(ps, call, pred, filter_pred_and); +	} else if (pred->op == OP_OR) { +		pred->pop_n = 2; +		return filter_add_pred_fn(ps, call, pred, filter_pred_or); +	} + +	field = find_event_field(call, pred->field_name); +	if (!field) { +		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); +		return -EINVAL; +	} + +	pred->offset = field->offset; + +	if (!is_legal_op(field, pred->op)) { +		parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); +		return -EINVAL;  	} -	if (pred->str_val) { -		new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); -		if (!new_pred->str_val) { -			filter_free_pred(new_pred); -			return NULL; +	string_type = is_string_field(field->type); +	if (string_type) { +		if (string_type == FILTER_STATIC_STRING) +			fn = filter_pred_string; +		else +			fn = filter_pred_strloc; +		pred->str_len = field->size; +		if (pred->op == OP_NE) +			pred->not = 1; +		return filter_add_pred_fn(ps, call, pred, fn); +	} else { +		if (strict_strtoull(pred->str_val, 0, &val)) { +			parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); +			return -EINVAL;  		} +		pred->val = val;  	} -	return new_pred; +	fn = select_comparison_fn(pred->op, field->size, field->is_signed); +	if (!fn) { +		parse_error(ps, FILT_ERR_INVALID_OP, 0); +		return -EINVAL; +	} + +	if (pred->op == OP_NE) +		pred->not = 1; + +	return filter_add_pred_fn(ps, call, pred, fn);  } -int filter_add_subsystem_pred(struct event_subsystem *system, -			      struct filter_pred *pred) +static int filter_add_subsystem_pred(struct filter_parse_state *ps, +				     struct event_subsystem *system, +				     struct filter_pred *pred, +				     char *filter_string)  { -	struct ftrace_event_call *call = __start_ftrace_events; -	struct filter_pred *event_pred; -	int i; +	struct event_filter *filter = system->filter; +	struct ftrace_event_call *call; +	int err = 0; -	if (system->preds && !pred->compound) -		filter_free_subsystem_preds(system); - -	if (!system->preds) { -		system->preds = 
kzalloc(MAX_FILTER_PRED * sizeof(pred), +	if (!filter->preds) { +		filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),  					GFP_KERNEL); -		if (!system->preds) + +		if (!filter->preds)  			return -ENOMEM;  	} -	for (i = 0; i < MAX_FILTER_PRED; i++) { -		if (!system->preds[i]) { -			system->preds[i] = pred; -			break; -		} +	if (filter->n_preds == MAX_FILTER_PRED) { +		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); +		return -ENOSPC;  	} -	if (i == MAX_FILTER_PRED) -		return -ENOSPC; +	filter->preds[filter->n_preds] = pred; +	filter->n_preds++; -	events_for_each(call) { -		int err; +	mutex_lock(&event_mutex); +	list_for_each_entry(call, &ftrace_events, list) { -		if (!call->name || !call->regfunc) +		if (!call->define_fields)  			continue;  		if (strcmp(call->system, system->name))  			continue; -		if (!find_event_field(call, pred->field_name)) -			continue; +		err = filter_add_pred(ps, call, pred); +		if (err) { +			mutex_unlock(&event_mutex); +			filter_free_subsystem_preds(system); +			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); +			goto out; +		} +		replace_filter_string(call->filter, filter_string); +	} +	mutex_unlock(&event_mutex); +out: +	return err; +} -		event_pred = copy_pred(pred); -		if (!event_pred) -			goto oom; +static void parse_init(struct filter_parse_state *ps, +		       struct filter_op *ops, +		       char *infix_string) +{ +	memset(ps, '\0', sizeof(*ps)); -		err = filter_add_pred(call, event_pred); -		if (err) -			filter_free_pred(event_pred); -		if (err == -ENOMEM) -			goto oom; +	ps->infix.string = infix_string; +	ps->infix.cnt = strlen(infix_string); +	ps->ops = ops; + +	INIT_LIST_HEAD(&ps->opstack); +	INIT_LIST_HEAD(&ps->postfix); +} + +static char infix_next(struct filter_parse_state *ps) +{ +	ps->infix.cnt--; + +	return ps->infix.string[ps->infix.tail++]; +} + +static char infix_peek(struct filter_parse_state *ps) +{ +	if (ps->infix.tail == strlen(ps->infix.string)) +		return 0; + +	return ps->infix.string[ps->infix.tail]; +} + +static void infix_advance(struct filter_parse_state *ps) +{ +	ps->infix.cnt--; +	ps->infix.tail++; +} + +static inline int is_precedence_lower(struct filter_parse_state *ps, +				      int a, int b) +{ +	return ps->ops[a].precedence < ps->ops[b].precedence; +} + +static inline int is_op_char(struct filter_parse_state *ps, char c) +{ +	int i; + +	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { +		if (ps->ops[i].string[0] == c) +			return 1;  	}  	return 0; +} -oom: -	system->preds[i] = NULL; -	return -ENOMEM; +static int infix_get_op(struct filter_parse_state *ps, char firstc) +{ +	char nextc = infix_peek(ps); +	char opstr[3]; +	int i; + +	opstr[0] = firstc; +	opstr[1] = nextc; +	opstr[2] = '\0'; + +	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { +		if (!strcmp(opstr, ps->ops[i].string)) { +			infix_advance(ps); +			return ps->ops[i].id; +		} +	} + +	opstr[1] = '\0'; + +	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { +		if (!strcmp(opstr, ps->ops[i].string)) +			return ps->ops[i].id; +	} + +	return OP_NONE;  } -int filter_parse(char **pbuf, struct filter_pred *pred) +static inline void clear_operand_string(struct filter_parse_state *ps)  { -	char *tmp, *tok, *val_str = NULL; -	int tok_n = 0; +	memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); +	ps->operand.tail = 0; +} -	/* field ==/!= number, or/and field ==/!= number, number */ -	while ((tok = strsep(pbuf, " \n"))) { -		if (tok_n == 0) { -			if (!strcmp(tok, "0")) { -				pred->clear = 1; -				return 0; -			} else if (!strcmp(tok, "&&")) { -				
pred->or = 0; -				pred->compound = 1; -			} else if (!strcmp(tok, "||")) { -				pred->or = 1; -				pred->compound = 1; -			} else -				pred->field_name = tok; -			tok_n = 1; +static inline int append_operand_char(struct filter_parse_state *ps, char c) +{ +	if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) +		return -EINVAL; + +	ps->operand.string[ps->operand.tail++] = c; + +	return 0; +} + +static int filter_opstack_push(struct filter_parse_state *ps, int op) +{ +	struct opstack_op *opstack_op; + +	opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); +	if (!opstack_op) +		return -ENOMEM; + +	opstack_op->op = op; +	list_add(&opstack_op->list, &ps->opstack); + +	return 0; +} + +static int filter_opstack_empty(struct filter_parse_state *ps) +{ +	return list_empty(&ps->opstack); +} + +static int filter_opstack_top(struct filter_parse_state *ps) +{ +	struct opstack_op *opstack_op; + +	if (filter_opstack_empty(ps)) +		return OP_NONE; + +	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + +	return opstack_op->op; +} + +static int filter_opstack_pop(struct filter_parse_state *ps) +{ +	struct opstack_op *opstack_op; +	int op; + +	if (filter_opstack_empty(ps)) +		return OP_NONE; + +	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); +	op = opstack_op->op; +	list_del(&opstack_op->list); + +	kfree(opstack_op); + +	return op; +} + +static void filter_opstack_clear(struct filter_parse_state *ps) +{ +	while (!filter_opstack_empty(ps)) +		filter_opstack_pop(ps); +} + +static char *curr_operand(struct filter_parse_state *ps) +{ +	return ps->operand.string; +} + +static int postfix_append_operand(struct filter_parse_state *ps, char *operand) +{ +	struct postfix_elt *elt; + +	elt = kmalloc(sizeof(*elt), GFP_KERNEL); +	if (!elt) +		return -ENOMEM; + +	elt->op = OP_NONE; +	elt->operand = kstrdup(operand, GFP_KERNEL); +	if (!elt->operand) { +		kfree(elt); +		return -ENOMEM; +	} + +	list_add_tail(&elt->list, &ps->postfix); + +	return 0; +} + +static int postfix_append_op(struct filter_parse_state *ps, int op) +{ +	struct postfix_elt *elt; + +	elt = kmalloc(sizeof(*elt), GFP_KERNEL); +	if (!elt) +		return -ENOMEM; + +	elt->op = op; +	elt->operand = NULL; + +	list_add_tail(&elt->list, &ps->postfix); + +	return 0; +} + +static void postfix_clear(struct filter_parse_state *ps) +{ +	struct postfix_elt *elt; + +	while (!list_empty(&ps->postfix)) { +		elt = list_first_entry(&ps->postfix, struct postfix_elt, list); +		kfree(elt->operand); +		list_del(&elt->list); +	} +} + +static int filter_parse(struct filter_parse_state *ps) +{ +	int in_string = 0; +	int op, top_op; +	char ch; + +	while ((ch = infix_next(ps))) { +		if (ch == '"') { +			in_string ^= 1;  			continue;  		} -		if (tok_n == 1) { -			if (!pred->field_name) -				pred->field_name = tok; -			else if (!strcmp(tok, "!=")) -				pred->not = 1; -			else if (!strcmp(tok, "==")) -				pred->not = 0; -			else { -				pred->field_name = NULL; + +		if (in_string) +			goto parse_operand; + +		if (isspace(ch)) +			continue; + +		if (is_op_char(ps, ch)) { +			op = infix_get_op(ps, ch); +			if (op == OP_NONE) { +				parse_error(ps, FILT_ERR_INVALID_OP, 0);  				return -EINVAL;  			} -			tok_n = 2; + +			if (strlen(curr_operand(ps))) { +				postfix_append_operand(ps, curr_operand(ps)); +				clear_operand_string(ps); +			} + +			while (!filter_opstack_empty(ps)) { +				top_op = filter_opstack_top(ps); +				if (!is_precedence_lower(ps, top_op, op)) { +					top_op = filter_opstack_pop(ps); +					postfix_append_op(ps, top_op); +					
continue; +				} +				break; +			} + +			filter_opstack_push(ps, op);  			continue;  		} -		if (tok_n == 2) { -			if (pred->compound) { -				if (!strcmp(tok, "!=")) -					pred->not = 1; -				else if (!strcmp(tok, "==")) -					pred->not = 0; -				else { -					pred->field_name = NULL; -					return -EINVAL; -				} -			} else { -				val_str = tok; -				break; /* done */ + +		if (ch == '(') { +			filter_opstack_push(ps, OP_OPEN_PAREN); +			continue; +		} + +		if (ch == ')') { +			if (strlen(curr_operand(ps))) { +				postfix_append_operand(ps, curr_operand(ps)); +				clear_operand_string(ps); +			} + +			top_op = filter_opstack_pop(ps); +			while (top_op != OP_NONE) { +				if (top_op == OP_OPEN_PAREN) +					break; +				postfix_append_op(ps, top_op); +				top_op = filter_opstack_pop(ps); +			} +			if (top_op == OP_NONE) { +				parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); +				return -EINVAL;  			} -			tok_n = 3;  			continue;  		} -		if (tok_n == 3) { -			val_str = tok; -			break; /* done */ +parse_operand: +		if (append_operand_char(ps, ch)) { +			parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); +			return -EINVAL;  		}  	} -	if (!val_str) { -		pred->field_name = NULL; -		return -EINVAL; +	if (strlen(curr_operand(ps))) +		postfix_append_operand(ps, curr_operand(ps)); + +	while (!filter_opstack_empty(ps)) { +		top_op = filter_opstack_pop(ps); +		if (top_op == OP_NONE) +			break; +		if (top_op == OP_OPEN_PAREN) { +			parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); +			return -EINVAL; +		} +		postfix_append_op(ps, top_op);  	} -	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); -	if (!pred->field_name) -		return -ENOMEM; +	return 0; +} -	pred->val = simple_strtoull(val_str, &tmp, 0); -	if (tmp == val_str) { -		pred->str_val = kstrdup(val_str, GFP_KERNEL); -		if (!pred->str_val) -			return -ENOMEM; -	} else if (*tmp != '\0') +static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +{ +	struct filter_pred *pred; + +	pred = kzalloc(sizeof(*pred), GFP_KERNEL); +	if (!pred) +		return NULL; + +	pred->field_name = kstrdup(operand1, GFP_KERNEL); +	if (!pred->field_name) { +		kfree(pred); +		return NULL; +	} + +	strcpy(pred->str_val, operand2); +	pred->str_len = strlen(operand2); + +	pred->op = op; + +	return pred; +} + +static struct filter_pred *create_logical_pred(int op) +{ +	struct filter_pred *pred; + +	pred = kzalloc(sizeof(*pred), GFP_KERNEL); +	if (!pred) +		return NULL; + +	pred->op = op; + +	return pred; +} + +static int check_preds(struct filter_parse_state *ps) +{ +	int n_normal_preds = 0, n_logical_preds = 0; +	struct postfix_elt *elt; + +	list_for_each_entry(elt, &ps->postfix, list) { +		if (elt->op == OP_NONE) +			continue; + +		if (elt->op == OP_AND || elt->op == OP_OR) { +			n_logical_preds++; +			continue; +		} +		n_normal_preds++; +	} + +	if (!n_normal_preds || n_logical_preds >= n_normal_preds) { +		parse_error(ps, FILT_ERR_INVALID_FILTER, 0);  		return -EINVAL; +	} + +	return 0; +} + +static int replace_preds(struct event_subsystem *system, +			 struct ftrace_event_call *call, +			 struct filter_parse_state *ps, +			 char *filter_string) +{ +	char *operand1 = NULL, *operand2 = NULL; +	struct filter_pred *pred; +	struct postfix_elt *elt; +	int err; + +	err = check_preds(ps); +	if (err) +		return err; + +	list_for_each_entry(elt, &ps->postfix, list) { +		if (elt->op == OP_NONE) { +			if (!operand1) +				operand1 = elt->operand; +			else if (!operand2) +				operand2 = elt->operand; +			else { +				parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); +				return 
-EINVAL; +			} +			continue; +		} + +		if (elt->op == OP_AND || elt->op == OP_OR) { +			pred = create_logical_pred(elt->op); +			if (call) { +				err = filter_add_pred(ps, call, pred); +				filter_free_pred(pred); +			} else +				err = filter_add_subsystem_pred(ps, system, +							pred, filter_string); +			if (err) +				return err; + +			operand1 = operand2 = NULL; +			continue; +		} + +		if (!operand1 || !operand2) { +			parse_error(ps, FILT_ERR_MISSING_FIELD, 0); +			return -EINVAL; +		} + +		pred = create_pred(elt->op, operand1, operand2); +		if (call) { +			err = filter_add_pred(ps, call, pred); +			filter_free_pred(pred); +		} else +			err = filter_add_subsystem_pred(ps, system, pred, +							filter_string); +		if (err) +			return err; + +		operand1 = operand2 = NULL; +	}  	return 0;  } +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ +	int err; + +	struct filter_parse_state *ps; + +	mutex_lock(&filter_mutex); + +	if (!strcmp(strstrip(filter_string), "0")) { +		filter_disable_preds(call); +		remove_filter_string(call->filter); +		mutex_unlock(&filter_mutex); +		return 0; +	} + +	err = -ENOMEM; +	ps = kzalloc(sizeof(*ps), GFP_KERNEL); +	if (!ps) +		goto out_unlock; + +	filter_disable_preds(call); +	replace_filter_string(call->filter, filter_string); + +	parse_init(ps, filter_ops, filter_string); +	err = filter_parse(ps); +	if (err) { +		append_filter_err(ps, call->filter); +		goto out; +	} + +	err = replace_preds(NULL, call, ps, filter_string); +	if (err) +		append_filter_err(ps, call->filter); + +out: +	filter_opstack_clear(ps); +	postfix_clear(ps); +	kfree(ps); +out_unlock: +	mutex_unlock(&filter_mutex); + +	return err; +} + +int apply_subsystem_event_filter(struct event_subsystem *system, +				 char *filter_string) +{ +	int err; + +	struct filter_parse_state *ps; + +	mutex_lock(&filter_mutex); + +	if (!strcmp(strstrip(filter_string), "0")) { +		filter_free_subsystem_preds(system); +		remove_filter_string(system->filter); +		mutex_unlock(&filter_mutex); +		return 0; +	} + +	err = -ENOMEM; +	ps = kzalloc(sizeof(*ps), GFP_KERNEL); +	if (!ps) +		goto out_unlock; + +	filter_free_subsystem_preds(system); +	replace_filter_string(system->filter, filter_string); + +	parse_init(ps, filter_ops, filter_string); +	err = filter_parse(ps); +	if (err) { +		append_filter_err(ps, system->filter); +		goto out; +	} + +	err = replace_preds(system, NULL, ps, filter_string); +	if (err) +		append_filter_err(ps, system->filter); + +out: +	filter_opstack_clear(ps); +	postfix_clear(ps); +	kfree(ps); +out_unlock: +	mutex_unlock(&filter_mutex); + +	return err; +} diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h deleted file mode 100644 index 38985f9b379..00000000000 --- a/kernel/trace/trace_events_stage_1.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Stage 1 of the trace events. - * - * Override the macros in <trace/trace_event_types.h> to include the following: - * - * struct ftrace_raw_<call> { - *	struct trace_entry		ent; - *	<type>				<item>; - *	<type2>				<item2>[<len>]; - *	[...] - * }; - * - * The <type> <item> is created by the __field(type, item) macro or - * the __array(type2, item2, len) macro. - * We simply do "type item;", and that will create the fields - * in the structure. - */ - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - -#undef __array -#define __array(type, item, len)	type	item[len]; - -#undef __field -#define __field(type, item)		type	item; - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) 
args - -#undef TRACE_EVENT -#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\ -	struct ftrace_raw_##name {				\ -		struct trace_entry	ent;			\ -		tstruct						\ -	};							\ -	static struct ftrace_event_call event_##name - -#include <trace/trace_event_types.h> diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h deleted file mode 100644 index d363c6672c6..00000000000 --- a/kernel/trace/trace_events_stage_2.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Stage 2 of the trace events. - * - * Override the macros in <trace/trace_event_types.h> to include the following: - * - * enum print_line_t - * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags) - * { - *	struct trace_seq *s = &iter->seq; - *	struct ftrace_raw_<call> *field; <-- defined in stage 1 - *	struct trace_entry *entry; - *	int ret; - * - *	entry = iter->ent; - * - *	if (entry->type != event_<call>.id) { - *		WARN_ON_ONCE(1); - *		return TRACE_TYPE_UNHANDLED; - *	} - * - *	field = (typeof(field))entry; - * - *	ret = trace_seq_printf(s, <TP_printk> "\n"); - *	if (!ret) - *		return TRACE_TYPE_PARTIAL_LINE; - * - *	return TRACE_TYPE_HANDLED; - * } - * - * This is the method used to print the raw event to the trace - * output format. Note, this is not needed if the data is read - * in binary. - */ - -#undef __entry -#define __entry field - -#undef TP_printk -#define TP_printk(fmt, args...) fmt "\n", args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\ -enum print_line_t							\ -ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\ -{									\ -	struct trace_seq *s = &iter->seq;				\ -	struct ftrace_raw_##call *field;				\ -	struct trace_entry *entry;					\ -	int ret;							\ -									\ -	entry = iter->ent;						\ -									\ -	if (entry->type != event_##call.id) {				\ -		WARN_ON_ONCE(1);					\ -		return TRACE_TYPE_UNHANDLED;				\ -	}								\ -									\ -	field = (typeof(field))entry;					\ -									\ -	ret = trace_seq_printf(s, #call ": " print);			\ -	if (!ret)							\ -		return TRACE_TYPE_PARTIAL_LINE;				\ -									\ -	return TRACE_TYPE_HANDLED;					\ -} -	 -#include <trace/trace_event_types.h> - -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - *	struct ftrace_raw_##call field; - *	int ret; - * - *	ret = trace_seq_printf(s, #type " " #item ";" - *			       " offset:%u; size:%u;\n", - *			       offsetof(struct ftrace_raw_##call, item), - *			       sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef __field -#define __field(type, item)					\ -	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\ -			       "offset:%u;\tsize:%u;\n",		\ -			       (unsigned int)offsetof(typeof(field), item), \ -			       (unsigned int)sizeof(field.item));	\ -	if (!ret)							\ -		return 0; - -#undef __array -#define __array(type, item, len)						\ -	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\ -			       "offset:%u;\tsize:%u;\n",		\ -			       (unsigned int)offsetof(typeof(field), item), \ -			       (unsigned int)sizeof(field.item));	\ -	if (!ret)							\ -		return 0; - -#undef __entry -#define __entry REC - -#undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) 
args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\ -static int								\ -ftrace_format_##call(struct trace_seq *s)				\ -{									\ -	struct ftrace_raw_##call field;					\ -	int ret;							\ -									\ -	tstruct;							\ -									\ -	trace_seq_printf(s, "\nprint fmt: " print);			\ -									\ -	return ret;							\ -} - -#include <trace/trace_event_types.h> - -#undef __field -#define __field(type, item)						\ -	ret = trace_define_field(event_call, #type, #item,		\ -				 offsetof(typeof(field), item),		\ -				 sizeof(field.item));			\ -	if (ret)							\ -		return ret; - -#undef __array -#define __array(type, item, len)					\ -	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\ -				 offsetof(typeof(field), item),		\ -				 sizeof(field.item));			\ -	if (ret)							\ -		return ret; - -#define __common_field(type, item)					\ -	ret = trace_define_field(event_call, #type, "common_" #item,	\ -				 offsetof(typeof(field.ent), item),	\ -				 sizeof(field.ent.item));		\ -	if (ret)							\ -		return ret; - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\ -int									\ -ftrace_define_fields_##call(void)					\ -{									\ -	struct ftrace_raw_##call field;					\ -	struct ftrace_event_call *event_call = &event_##call;		\ -	int ret;							\ -									\ -	__common_field(unsigned char, type);				\ -	__common_field(unsigned char, flags);				\ -	__common_field(unsigned char, preempt_count);			\ -	__common_field(int, pid);					\ -	__common_field(int, tgid);					\ -									\ -	tstruct;							\ -									\ -	return ret;							\ -} - -#include <trace/trace_event_types.h> diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h deleted file mode 100644 index 9d2fa78cecc..00000000000 --- a/kernel/trace/trace_events_stage_3.h +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Stage 3 of the trace events. - * - * Override the macros in <trace/trace_event_types.h> to include the following: - * - * static void ftrace_event_<call>(proto) - * { - *	event_trace_printk(_RET_IP_, "<call>: " <fmt>); - * } - * - * static int ftrace_reg_event_<call>(void) - * { - *	int ret; - * - *	ret = register_trace_<call>(ftrace_event_<call>); - *	if (!ret) - *		pr_info("event trace: Could not activate trace point " - *			"probe to  <call>"); - *	return ret; - * } - * - * static void ftrace_unreg_event_<call>(void) - * { - *	unregister_trace_<call>(ftrace_event_<call>); - * } - * - * For those macros defined with TRACE_FORMAT: - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_<call> = { - *	.name			= "<call>", - *	.regfunc		= ftrace_reg_event_<call>, - *	.unregfunc		= ftrace_unreg_event_<call>, - * } - * - * - * For those macros defined with TRACE_EVENT: - * - * static struct ftrace_event_call event_<call>; - * - * static void ftrace_raw_event_<call>(proto) - * { - *	struct ring_buffer_event *event; - *	struct ftrace_raw_<call> *entry; <-- defined in stage 1 - *	unsigned long irq_flags; - *	int pc; - * - *	local_save_flags(irq_flags); - *	pc = preempt_count(); - * - *	event = trace_current_buffer_lock_reserve(event_<call>.id, - *				  sizeof(struct ftrace_raw_<call>), - *				  irq_flags, pc); - *	if (!event) - *		return; - *	entry	= ring_buffer_event_data(event); - * - *	<assign>;  <-- Here we assign the entries by the __field and - *			__array macros. 
- * - *	trace_current_buffer_unlock_commit(event, irq_flags, pc); - * } - * - * static int ftrace_raw_reg_event_<call>(void) - * { - *	int ret; - * - *	ret = register_trace_<call>(ftrace_raw_event_<call>); - *	if (!ret) - *		pr_info("event trace: Could not activate trace point " - *			"probe to <call>"); - *	return ret; - * } - * - * static void ftrace_unreg_event_<call>(void) - * { - *	unregister_trace_<call>(ftrace_raw_event_<call>); - * } - * - * static struct trace_event ftrace_event_type_<call> = { - *	.trace			= ftrace_raw_output_<call>, <-- stage 2 - * }; - * - * static int ftrace_raw_init_event_<call>(void) - * { - *	int id; - * - *	id = register_ftrace_event(&ftrace_event_type_<call>); - *	if (!id) - *		return -ENODEV; - *	event_<call>.id = id; - *	return 0; - * } - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_<call> = { - *	.name			= "<call>", - *	.system			= "<system>", - *	.raw_init		= ftrace_raw_init_event_<call>, - *	.regfunc		= ftrace_reg_event_<call>, - *	.unregfunc		= ftrace_unreg_event_<call>, - *	.show_format		= ftrace_format_<call>, - * } - * - */ - -#undef TP_FMT -#define TP_FMT(fmt, args...)	fmt "\n", ##args - -#ifdef CONFIG_EVENT_PROFILE -#define _TRACE_PROFILE(call, proto, args)				\ -static void ftrace_profile_##call(proto)				\ -{									\ -	extern void perf_tpcounter_event(int);				\ -	perf_tpcounter_event(event_##call.id);				\ -}									\ -									\ -static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ -{									\ -	int ret = 0;							\ -									\ -	if (!atomic_inc_return(&call->profile_count))			\ -		ret = register_trace_##call(ftrace_profile_##call);	\ -									\ -	return ret;							\ -}									\ -									\ -static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ -{									\ -	if (atomic_add_negative(-1, &call->profile_count))		\ -		unregister_trace_##call(ftrace_profile_##call);		\ -} - -#define _TRACE_PROFILE_INIT(call)					\ -	.profile_count = ATOMIC_INIT(-1),				\ -	.profile_enable = ftrace_profile_enable_##call,			\ -	.profile_disable = ftrace_profile_disable_##call, - -#else -#define _TRACE_PROFILE(call, proto, args) -#define _TRACE_PROFILE_INIT(call) -#endif - -#define _TRACE_FORMAT(call, proto, args, fmt)				\ -static void ftrace_event_##call(proto)					\ -{									\ -	event_trace_printk(_RET_IP_, #call ": " fmt);			\ -}									\ -									\ -static int ftrace_reg_event_##call(void)				\ -{									\ -	int ret;							\ -									\ -	ret = register_trace_##call(ftrace_event_##call);		\ -	if (ret)							\ -		pr_info("event trace: Could not activate trace point "	\ -			"probe to " #call "\n");			\ -	return ret;							\ -}									\ -									\ -static void ftrace_unreg_event_##call(void)				\ -{									\ -	unregister_trace_##call(ftrace_event_##call);			\ -}									\ -									\ -static struct ftrace_event_call event_##call;				\ -									\ -static int ftrace_init_event_##call(void)				\ -{									\ -	int id;								\ -									\ -	id = register_ftrace_event(NULL);				\ -	if (!id)							\ -		return -ENODEV;						\ -	event_##call.id = id;						\ -	return 0;							\ -} - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt)				\ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\ -static struct ftrace_event_call __used					\ -__attribute__((__aligned__(4)))						\ -__attribute__((section("_ftrace_events"))) event_##call = {		\ -	
.name			= #call,				\ -	.system			= __stringify(TRACE_SYSTEM),		\ -	.raw_init		= ftrace_init_event_##call,		\ -	.regfunc		= ftrace_reg_event_##call,		\ -	.unregfunc		= ftrace_unreg_event_##call,		\ -	_TRACE_PROFILE_INIT(call)					\ -} - -#undef __entry -#define __entry entry - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\ -									\ -static struct ftrace_event_call event_##call;				\ -									\ -static void ftrace_raw_event_##call(proto)				\ -{									\ -	struct ftrace_event_call *call = &event_##call;			\ -	struct ring_buffer_event *event;				\ -	struct ftrace_raw_##call *entry;				\ -	unsigned long irq_flags;					\ -	int pc;								\ -									\ -	local_save_flags(irq_flags);					\ -	pc = preempt_count();						\ -									\ -	event = trace_current_buffer_lock_reserve(event_##call.id,	\ -				  sizeof(struct ftrace_raw_##call),	\ -				  irq_flags, pc);			\ -	if (!event)							\ -		return;							\ -	entry	= ring_buffer_event_data(event);			\ -									\ -	assign;								\ -									\ -	if (call->preds && !filter_match_preds(call, entry))		\ -		ring_buffer_event_discard(event);			\ -									\ -	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\ -									\ -}									\ -									\ -static int ftrace_raw_reg_event_##call(void)				\ -{									\ -	int ret;							\ -									\ -	ret = register_trace_##call(ftrace_raw_event_##call);		\ -	if (ret)							\ -		pr_info("event trace: Could not activate trace point "	\ -			"probe to " #call "\n");			\ -	return ret;							\ -}									\ -									\ -static void ftrace_raw_unreg_event_##call(void)				\ -{									\ -	unregister_trace_##call(ftrace_raw_event_##call);		\ -}									\ -									\ -static struct trace_event ftrace_event_type_##call = {			\ -	.trace			= ftrace_raw_output_##call,		\ -};									\ -									\ -static int ftrace_raw_init_event_##call(void)				\ -{									\ -	int id;								\ -									\ -	id = register_ftrace_event(&ftrace_event_type_##call);		\ -	if (!id)							\ -		return -ENODEV;						\ -	event_##call.id = id;						\ -	INIT_LIST_HEAD(&event_##call.fields);				\ -	return 0;							\ -}									\ -									\ -static struct ftrace_event_call __used					\ -__attribute__((__aligned__(4)))						\ -__attribute__((section("_ftrace_events"))) event_##call = {		\ -	.name			= #call,				\ -	.system			= __stringify(TRACE_SYSTEM),		\ -	.raw_init		= ftrace_raw_init_event_##call,		\ -	.regfunc		= ftrace_raw_reg_event_##call,		\ -	.unregfunc		= ftrace_raw_unreg_event_##call,	\ -	.show_format		= ftrace_format_##call,			\ -	.define_fields		= ftrace_define_fields_##call,		\ -	_TRACE_PROFILE_INIT(call)					\ -} - -#include <trace/trace_event_types.h> - -#undef _TRACE_PROFILE -#undef _TRACE_PROFILE_INIT - diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 07a22c33ebf..d06cf898dc8 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -19,8 +19,12 @@  #undef TRACE_STRUCT  #define TRACE_STRUCT(args...) 
args +extern void __bad_type_size(void); +  #undef TRACE_FIELD  #define TRACE_FIELD(type, item, assign)					\ +	if (sizeof(type) != sizeof(field.item))				\ +		__bad_type_size();					\  	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\  			       "offset:%u;\tsize:%u;\n",		\  			       (unsigned int)offsetof(typeof(field), item), \ @@ -30,7 +34,7 @@  #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)			\  	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\  			       "offset:%u;\tsize:%u;\n",		\  			       (unsigned int)offsetof(typeof(field), item), \ @@ -46,6 +50,9 @@  	if (!ret)							\  		return 0; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\ +	TRACE_FIELD(type, item, assign)  #undef TP_RAW_FMT  #define TP_RAW_FMT(args...) args @@ -65,6 +72,22 @@ ftrace_format_##call(struct trace_seq *s)				\  	return ret;							\  } +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\ +				    tpfmt)				\ +static int								\ +ftrace_format_##call(struct trace_seq *s)				\ +{									\ +	struct args field;						\ +	int ret;							\ +									\ +	tstruct;							\ +									\ +	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\ +									\ +	return ret;							\ +} +  #include "trace_event_types.h"  #undef TRACE_ZERO_CHAR @@ -78,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s)				\  #define TRACE_FIELD(type, item, assign)\  	entry->item = assign; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\ +	TRACE_FIELD(type, item, assign) +  #undef TP_CMD  #define TP_CMD(cmd...)	cmd @@ -85,18 +112,95 @@ ftrace_format_##call(struct trace_seq *s)				\  #define TRACE_ENTRY	entry  #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)	\  	cmd;  #undef TRACE_EVENT_FORMAT  #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\ +int ftrace_define_fields_##call(void);					\ +static int ftrace_raw_init_event_##call(void);				\ +									\ +struct ftrace_event_call __used						\ +__attribute__((__aligned__(4)))						\ +__attribute__((section("_ftrace_events"))) event_##call = {		\ +	.name			= #call,				\ +	.id			= proto,				\ +	.system			= __stringify(TRACE_SYSTEM),		\ +	.raw_init		= ftrace_raw_init_event_##call,		\ +	.show_format		= ftrace_format_##call,			\ +	.define_fields		= ftrace_define_fields_##call,		\ +};									\ +static int ftrace_raw_init_event_##call(void)				\ +{									\ +	INIT_LIST_HEAD(&event_##call.fields);				\ +	init_preds(&event_##call);					\ +	return 0;							\ +}									\ + +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\ +				    tpfmt)				\  									\ -static struct ftrace_event_call __used					\ +struct ftrace_event_call __used						\  __attribute__((__aligned__(4)))						\  __attribute__((section("_ftrace_events"))) event_##call = {		\  	.name			= #call,				\  	.id			= proto,				\  	.system			= __stringify(TRACE_SYSTEM),		\  	.show_format		= ftrace_format_##call,			\ +}; + +#include "trace_event_types.h" + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign)					\ +	ret = trace_define_field(event_call, #type, #item,		\ +				 offsetof(typeof(field), item),		\ +				 sizeof(field.item), is_signed_type(type));	\ +	if (ret)							\ +		return ret; + +#undef TRACE_FIELD_SPECIAL +#define 
TRACE_FIELD_SPECIAL(type, item, len, cmd)			\ +	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\ +				 offsetof(typeof(field), item),		\ +				 sizeof(field.item), 0);		\ +	if (ret)							\ +		return ret; + +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed)			\ +	ret = trace_define_field(event_call, #type, #item,		\ +				 offsetof(typeof(field), item),		\ +				 sizeof(field.item), is_signed);	\ +	if (ret)							\ +		return ret; + +#undef TRACE_FIELD_ZERO_CHAR +#define TRACE_FIELD_ZERO_CHAR(item) + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\ +int									\ +ftrace_define_fields_##call(void)					\ +{									\ +	struct ftrace_event_call *event_call = &event_##call;		\ +	struct args field;						\ +	int ret;							\ +									\ +	__common_field(unsigned char, type, 0);				\ +	__common_field(unsigned char, flags, 0);			\ +	__common_field(unsigned char, preempt_count, 0);		\ +	__common_field(int, pid, 1);					\ +	__common_field(int, tgid, 1);					\ +									\ +	tstruct;							\ +									\ +	return ret;							\  } + +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\ +				    tpfmt) +  #include "trace_event_types.h" diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d28687e7b3a..8b592418d8b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)  	if (!current->ret_stack)  		return -EBUSY; +	/* +	 * We must make sure the ret_stack is tested before we read +	 * anything else. +	 */ +	smp_rmb(); +  	/* The return trace stack is full */  	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {  		atomic_inc(¤t->trace_overrun); @@ -78,13 +84,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)  	current->ret_stack[index].ret = ret;  	current->ret_stack[index].func = func;  	current->ret_stack[index].calltime = calltime; +	current->ret_stack[index].subtime = 0;  	*depth = index;  	return 0;  }  /* Retrieve a function return address to the trace stack on thread info.*/ -void +static void  ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)  {  	int index; @@ -104,9 +111,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)  	trace->calltime = current->ret_stack[index].calltime;  	trace->overrun = atomic_read(¤t->trace_overrun);  	trace->depth = index; -	barrier(); -	current->curr_ret_stack--; -  }  /* @@ -121,6 +125,8 @@ unsigned long ftrace_return_to_handler(void)  	ftrace_pop_return_trace(&trace, &ret);  	trace.rettime = trace_clock_local();  	ftrace_graph_return(&trace); +	barrier(); +	current->curr_ret_stack--;  	if (unlikely(!ret)) {  		ftrace_graph_stop(); @@ -426,8 +432,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,  	return TRACE_TYPE_HANDLED;  } -static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)  {  	unsigned long nsecs_rem = do_div(duration, 1000);  	/* log10(ULONG_MAX) + '\0' */ @@ -464,12 +470,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} +	return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_duration(unsigned long 
long duration, struct trace_seq *s) +{ +	int ret; + +	ret = trace_print_graph_duration(duration, s); +	if (ret != TRACE_TYPE_HANDLED) +		return ret;  	ret = trace_seq_printf(s, "|  ");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; -	return TRACE_TYPE_HANDLED; +	return TRACE_TYPE_HANDLED;  }  /* Case of a leaf function on its call entry */ diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 7bfdf4c2347..ca7d7c4d0c2 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -1,10 +1,9 @@  /* - * h/w branch tracer for x86 based on bts + * h/w branch tracer for x86 based on BTS   *   * Copyright (C) 2008-2009 Intel Corporation.   * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009   */ -#include <linux/spinlock.h>  #include <linux/kallsyms.h>  #include <linux/debugfs.h>  #include <linux/ftrace.h> @@ -15,110 +14,119 @@  #include <asm/ds.h> -#include "trace.h"  #include "trace_output.h" +#include "trace.h" -#define SIZEOF_BTS (1 << 13) +#define BTS_BUFFER_SIZE (1 << 13) -/* - * The tracer lock protects the below per-cpu tracer array. - * It needs to be held to: - * - start tracing on all cpus - * - stop tracing on all cpus - * - start tracing on a single hotplug cpu - * - stop tracing on a single hotplug cpu - * - read the trace from all cpus - * - read the trace from a single cpu - */ -static DEFINE_SPINLOCK(bts_tracer_lock);  static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); +static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);  #define this_tracer per_cpu(tracer, smp_processor_id()) -#define this_buffer per_cpu(buffer, smp_processor_id()) -static int __read_mostly trace_hw_branches_enabled; +static int trace_hw_branches_enabled __read_mostly; +static int trace_hw_branches_suspended __read_mostly;  static struct trace_array *hw_branch_trace __read_mostly; -/* - * Start tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_start_cpu(void *arg) +static void bts_trace_init_cpu(int cpu)  { -	if (this_tracer) -		ds_release_bts(this_tracer); +	per_cpu(tracer, cpu) = +		ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, +				   NULL, (size_t)-1, BTS_KERNEL); -	this_tracer = -		ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, -			       /* ovfl = */ NULL, /* th = */ (size_t)-1, -			       BTS_KERNEL); -	if (IS_ERR(this_tracer)) { -		this_tracer = NULL; -		return; -	} +	if (IS_ERR(per_cpu(tracer, cpu))) +		per_cpu(tracer, cpu) = NULL;  } -static void bts_trace_start(struct trace_array *tr) +static int bts_trace_init(struct trace_array *tr)  { -	spin_lock(&bts_tracer_lock); +	int cpu; -	on_each_cpu(bts_trace_start_cpu, NULL, 1); -	trace_hw_branches_enabled = 1; +	hw_branch_trace = tr; +	trace_hw_branches_enabled = 0; -	spin_unlock(&bts_tracer_lock); +	get_online_cpus(); +	for_each_online_cpu(cpu) { +		bts_trace_init_cpu(cpu); + +		if (likely(per_cpu(tracer, cpu))) +			trace_hw_branches_enabled = 1; +	} +	trace_hw_branches_suspended = 0; +	put_online_cpus(); + +	/* If we could not enable tracing on a single cpu, we fail. */ +	return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;  } -/* - * Stop tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. 
- */ -static void bts_trace_stop_cpu(void *arg) +static void bts_trace_reset(struct trace_array *tr)  { -	if (this_tracer) { -		ds_release_bts(this_tracer); -		this_tracer = NULL; +	int cpu; + +	get_online_cpus(); +	for_each_online_cpu(cpu) { +		if (likely(per_cpu(tracer, cpu))) { +			ds_release_bts(per_cpu(tracer, cpu)); +			per_cpu(tracer, cpu) = NULL; +		}  	} +	trace_hw_branches_enabled = 0; +	trace_hw_branches_suspended = 0; +	put_online_cpus();  } -static void bts_trace_stop(struct trace_array *tr) +static void bts_trace_start(struct trace_array *tr)  { -	spin_lock(&bts_tracer_lock); +	int cpu; -	trace_hw_branches_enabled = 0; -	on_each_cpu(bts_trace_stop_cpu, NULL, 1); +	get_online_cpus(); +	for_each_online_cpu(cpu) +		if (likely(per_cpu(tracer, cpu))) +			ds_resume_bts(per_cpu(tracer, cpu)); +	trace_hw_branches_suspended = 0; +	put_online_cpus(); +} + +static void bts_trace_stop(struct trace_array *tr) +{ +	int cpu; -	spin_unlock(&bts_tracer_lock); +	get_online_cpus(); +	for_each_online_cpu(cpu) +		if (likely(per_cpu(tracer, cpu))) +			ds_suspend_bts(per_cpu(tracer, cpu)); +	trace_hw_branches_suspended = 1; +	put_online_cpus();  }  static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,  				     unsigned long action, void *hcpu)  { -	unsigned int cpu = (unsigned long)hcpu; - -	spin_lock(&bts_tracer_lock); - -	if (!trace_hw_branches_enabled) -		goto out; +	int cpu = (long)hcpu;  	switch (action) {  	case CPU_ONLINE:  	case CPU_DOWN_FAILED: -		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); +		/* The notification is sent with interrupts enabled. */ +		if (trace_hw_branches_enabled) { +			bts_trace_init_cpu(cpu); + +			if (trace_hw_branches_suspended && +			    likely(per_cpu(tracer, cpu))) +				ds_suspend_bts(per_cpu(tracer, cpu)); +		}  		break; +  	case CPU_DOWN_PREPARE: -		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); -		break; +		/* The notification is sent with interrupts enabled. 
*/ +		if (likely(per_cpu(tracer, cpu))) { +			ds_release_bts(per_cpu(tracer, cpu)); +			per_cpu(tracer, cpu) = NULL; +		}  	} - out: -	spin_unlock(&bts_tracer_lock);  	return NOTIFY_DONE;  } @@ -126,20 +134,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {  	.notifier_call = bts_hotcpu_handler  }; -static int bts_trace_init(struct trace_array *tr) -{ -	hw_branch_trace = tr; - -	bts_trace_start(tr); - -	return 0; -} - -static void bts_trace_reset(struct trace_array *tr) -{ -	bts_trace_stop(tr); -} -  static void bts_trace_print_header(struct seq_file *m)  {  	seq_puts(m, "# CPU#        TO  <-  FROM\n"); @@ -147,10 +141,10 @@ static void bts_trace_print_header(struct seq_file *m)  static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)  { +	unsigned long symflags = TRACE_ITER_SYM_OFFSET;  	struct trace_entry *entry = iter->ent;  	struct trace_seq *seq = &iter->seq;  	struct hw_branch_entry *it; -	unsigned long symflags = TRACE_ITER_SYM_OFFSET;  	trace_assign_type(it, entry); @@ -168,6 +162,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)  void trace_hw_branch(u64 from, u64 to)  { +	struct ftrace_event_call *call = &event_hw_branch;  	struct trace_array *tr = hw_branch_trace;  	struct ring_buffer_event *event;  	struct hw_branch_entry *entry; @@ -194,7 +189,8 @@ void trace_hw_branch(u64 from, u64 to)  	entry->ent.type = TRACE_HW_BRANCHES;  	entry->from = from;  	entry->to   = to; -	trace_buffer_unlock_commit(tr, event, 0, 0); +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		trace_buffer_unlock_commit(tr, event, 0, 0);   out:  	atomic_dec(&tr->data[cpu]->disabled); @@ -224,11 +220,11 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)  /*   * Collect the trace on the current cpu and write it into the ftrace buffer.   * - * pre: bts_tracer_lock must be locked + * pre: tracing must be suspended on the current cpu   */  static void trace_bts_cpu(void *arg)  { -	struct trace_array *tr = (struct trace_array *) arg; +	struct trace_array *tr = (struct trace_array *)arg;  	const struct bts_trace *trace;  	unsigned char *at; @@ -241,10 +237,9 @@ static void trace_bts_cpu(void *arg)  	if (unlikely(!this_tracer))  		return; -	ds_suspend_bts(this_tracer);  	trace = ds_read_bts(this_tracer);  	if (!trace) -		goto out; +		return;  	for (at = trace->ds.top; (void *)at < trace->ds.end;  	     at += trace->ds.size) @@ -253,18 +248,27 @@ static void trace_bts_cpu(void *arg)  	for (at = trace->ds.begin; (void *)at < trace->ds.top;  	     at += trace->ds.size)  		trace_bts_at(trace, at); - -out: -	ds_resume_bts(this_tracer);  }  static void trace_bts_prepare(struct trace_iterator *iter)  { -	spin_lock(&bts_tracer_lock); +	int cpu; +	get_online_cpus(); +	for_each_online_cpu(cpu) +		if (likely(per_cpu(tracer, cpu))) +			ds_suspend_bts(per_cpu(tracer, cpu)); +	/* +	 * We need to collect the trace on the respective cpu since ftrace +	 * implicitly adds the record for the current cpu. +	 * Once that is more flexible, we could collect the data from any cpu. 
+	 */  	on_each_cpu(trace_bts_cpu, iter->tr, 1); -	spin_unlock(&bts_tracer_lock); +	for_each_online_cpu(cpu) +		if (likely(per_cpu(tracer, cpu))) +			ds_resume_bts(per_cpu(tracer, cpu)); +	put_online_cpus();  }  static void trace_bts_close(struct trace_iterator *iter) @@ -274,11 +278,11 @@ static void trace_bts_close(struct trace_iterator *iter)  void trace_hw_branch_oops(void)  { -	spin_lock(&bts_tracer_lock); - -	trace_bts_cpu(hw_branch_trace); - -	spin_unlock(&bts_tracer_lock); +	if (this_tracer) { +		ds_suspend_bts_noirq(this_tracer); +		trace_bts_cpu(hw_branch_trace); +		ds_resume_bts_noirq(this_tracer); +	}  }  struct tracer bts_tracer __read_mostly = @@ -291,7 +295,10 @@ struct tracer bts_tracer __read_mostly =  	.start		= bts_trace_start,  	.stop		= bts_trace_stop,  	.open		= trace_bts_prepare, -	.close		= trace_bts_close +	.close		= trace_bts_close, +#ifdef CONFIG_FTRACE_SELFTEST +	.selftest	= trace_selftest_startup_hw_branches, +#endif /* CONFIG_FTRACE_SELFTEST */  };  __init static int init_bts_trace(void) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 8e37fcddd8b..d53b45ed080 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,8 @@  #include <linux/kernel.h>  #include <linux/mmiotrace.h>  #include <linux/pci.h> +#include <linux/time.h> +  #include <asm/atomic.h>  #include "trace.h" @@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)  	struct mmiotrace_rw *rw;  	struct trace_seq *s	= &iter->seq;  	unsigned long long t	= ns2usecs(iter->ts); -	unsigned long usec_rem	= do_div(t, 1000000ULL); +	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);  	unsigned secs		= (unsigned long)t;  	int ret = 1; @@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)  	struct mmiotrace_map *m;  	struct trace_seq *s	= &iter->seq;  	unsigned long long t	= ns2usecs(iter->ts); -	unsigned long usec_rem	= do_div(t, 1000000ULL); +	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);  	unsigned secs		= (unsigned long)t;  	int ret; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 64b54a59c55..7938f3ae93e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,11 +14,25 @@  /* must be a power of 2 */  #define EVENT_HASHSIZE	128 -static DEFINE_MUTEX(trace_event_mutex); +DECLARE_RWSEM(trace_event_mutex); + +DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); +EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); +  static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;  static int next_event_type = __TRACE_LAST_TYPE + 1; +void trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ +	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + +	s->buffer[len] = 0; +	seq_puts(m, s->buffer); + +	trace_seq_init(s); +} +  enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)  {  	struct trace_seq *s = &iter->seq; @@ -84,6 +98,39 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)  	return len;  } +EXPORT_SYMBOL_GPL(trace_seq_printf); + +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. 
+ */ +int +trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ +	int len = (PAGE_SIZE - 1) - s->len; +	int ret; + +	if (!len) +		return 0; + +	ret = vsnprintf(s->buffer + s->len, len, fmt, args); + +	/* If we can't write it all, don't bother writing anything */ +	if (ret >= len) +		return 0; + +	s->len += ret; + +	return len; +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf);  int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)  { @@ -201,6 +248,67 @@ int trace_seq_path(struct trace_seq *s, struct path *path)  	return 0;  } +const char * +ftrace_print_flags_seq(struct trace_seq *p, const char *delim, +		       unsigned long flags, +		       const struct trace_print_flags *flag_array) +{ +	unsigned long mask; +	const char *str; +	const char *ret = p->buffer + p->len; +	int i; + +	for (i = 0;  flag_array[i].name && flags; i++) { + +		mask = flag_array[i].mask; +		if ((flags & mask) != mask) +			continue; + +		str = flag_array[i].name; +		flags &= ~mask; +		if (p->len && delim) +			trace_seq_puts(p, delim); +		trace_seq_puts(p, str); +	} + +	/* check for left over flags */ +	if (flags) { +		if (p->len && delim) +			trace_seq_puts(p, delim); +		trace_seq_printf(p, "0x%lx", flags); +	} + +	trace_seq_putc(p, 0); + +	return ret; +} +EXPORT_SYMBOL(ftrace_print_flags_seq); + +const char * +ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, +			 const struct trace_print_flags *symbol_array) +{ +	int i; +	const char *ret = p->buffer + p->len; + +	for (i = 0;  symbol_array[i].name; i++) { + +		if (val != symbol_array[i].mask) +			continue; + +		trace_seq_puts(p, symbol_array[i].name); +		break; +	} + +	if (!p->len) +		trace_seq_printf(p, "0x%lx", val); +		 +	trace_seq_putc(p, 0); + +	return ret; +} +EXPORT_SYMBOL(ftrace_print_symbols_seq); +  #ifdef CONFIG_KRETPROBES  static inline const char *kretprobed(const char *name)  { @@ -311,17 +419,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,  		if (ip == ULONG_MAX || !ret)  			break; -		if (i && ret) -			ret = trace_seq_puts(s, " <- "); +		if (ret) +			ret = trace_seq_puts(s, " => ");  		if (!ip) {  			if (ret)  				ret = trace_seq_puts(s, "??"); +			if (ret) +				ret = trace_seq_puts(s, "\n");  			continue;  		}  		if (!ret)  			break;  		if (ret)  			ret = seq_print_user_ip(s, mm, ip, sym_flags); +		ret = trace_seq_puts(s, "\n");  	}  	if (mm) @@ -455,6 +566,7 @@ static int task_state_char(unsigned long state)   * @type: the type of event to look for   *   * Returns an event of type @type otherwise NULL + * Called with trace_event_read_lock() held.   */  struct trace_event *ftrace_find_event(int type)  { @@ -464,7 +576,7 @@ struct trace_event *ftrace_find_event(int type)  	key = type & (EVENT_HASHSIZE - 1); -	hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { +	hlist_for_each_entry(event, n, &event_hash[key], node) {  		if (event->type == type)  			return event;  	} @@ -472,6 +584,46 @@ struct trace_event *ftrace_find_event(int type)  	return NULL;  } +static LIST_HEAD(ftrace_event_list); + +static int trace_search_list(struct list_head **list) +{ +	struct trace_event *e; +	int last = __TRACE_LAST_TYPE; + +	if (list_empty(&ftrace_event_list)) { +		*list = &ftrace_event_list; +		return last + 1; +	} + +	/* +	 * We used up all possible max events, +	 * lets see if somebody freed one. +	 */ +	list_for_each_entry(e, &ftrace_event_list, list) { +		if (e->type != last + 1) +			break; +		last++; +	} + +	/* Did we used up all 65 thousand events??? 
*/ +	if ((last + 1) > FTRACE_MAX_EVENT) +		return 0; + +	*list = &e->list; +	return last + 1; +} + +void trace_event_read_lock(void) +{ +	down_read(&trace_event_mutex); +} + +void trace_event_read_unlock(void) +{ +	up_read(&trace_event_mutex); +} +  /**   * register_ftrace_event - register output for an event type   * @event: the event type to register @@ -492,22 +644,42 @@ int register_ftrace_event(struct trace_event *event)  	unsigned key;  	int ret = 0; -	mutex_lock(&trace_event_mutex); +	down_write(&trace_event_mutex); -	if (!event) { -		ret = next_event_type++; +	if (WARN_ON(!event))  		goto out; -	} -	if (!event->type) -		event->type = next_event_type++; -	else if (event->type > __TRACE_LAST_TYPE) { +	INIT_LIST_HEAD(&event->list); + +	if (!event->type) { +		struct list_head *list = NULL; + +		if (next_event_type > FTRACE_MAX_EVENT) { + +			event->type = trace_search_list(&list); +			if (!event->type) +				goto out; + +		} else { +			 +			event->type = next_event_type++; +			list = &ftrace_event_list; +		} + +		if (WARN_ON(ftrace_find_event(event->type))) +			goto out; + +		list_add_tail(&event->list, list); + +	} else if (event->type > __TRACE_LAST_TYPE) {  		printk(KERN_WARNING "Need to add type to trace.h\n");  		WARN_ON(1); -	} - -	if (ftrace_find_event(event->type))  		goto out; +	} else { +		/* Is this event already used */ +		if (ftrace_find_event(event->type)) +			goto out; +	}  	if (event->trace == NULL)  		event->trace = trace_nop_print; @@ -520,14 +692,25 @@ int register_ftrace_event(struct trace_event *event)  	key = event->type & (EVENT_HASHSIZE - 1); -	hlist_add_head_rcu(&event->node, &event_hash[key]); +	hlist_add_head(&event->node, &event_hash[key]);  	ret = event->type;   out: -	mutex_unlock(&trace_event_mutex); +	up_write(&trace_event_mutex);  	return ret;  } +EXPORT_SYMBOL_GPL(register_ftrace_event); + +/* + * Used by module code with the trace_event_mutex held for write. 
+ */ +int __unregister_ftrace_event(struct trace_event *event) +{ +	hlist_del(&event->node); +	list_del(&event->list); +	return 0; +}  /**   * unregister_ftrace_event - remove a no longer used event @@ -535,12 +718,13 @@ int register_ftrace_event(struct trace_event *event)   */  int unregister_ftrace_event(struct trace_event *event)  { -	mutex_lock(&trace_event_mutex); -	hlist_del(&event->node); -	mutex_unlock(&trace_event_mutex); +	down_write(&trace_event_mutex); +	__unregister_ftrace_event(event); +	up_write(&trace_event_mutex);  	return 0;  } +EXPORT_SYMBOL_GPL(unregister_ftrace_event);  /*   * Standard events @@ -833,14 +1017,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,  	trace_assign_type(field, iter->ent); +	if (!trace_seq_puts(s, "<stack trace>\n")) +		goto partial;  	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { -		if (i) { -			if (!trace_seq_puts(s, " <= ")) -				goto partial; +		if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) +			break; +		if (!trace_seq_puts(s, " => ")) +			goto partial; -			if (!seq_print_ip_sym(s, field->caller[i], flags)) -				goto partial; -		} +		if (!seq_print_ip_sym(s, field->caller[i], flags)) +			goto partial;  		if (!trace_seq_puts(s, "\n"))  			goto partial;  	} @@ -868,10 +1054,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,  	trace_assign_type(field, iter->ent); -	if (!seq_print_userip_objs(field, s, flags)) +	if (!trace_seq_puts(s, "<user stack trace>\n"))  		goto partial; -	if (!trace_seq_putc(s, '\n')) +	if (!seq_print_userip_objs(field, s, flags))  		goto partial;  	return TRACE_TYPE_HANDLED; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index e0bde39c2dd..d38bec4a9c3 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,41 +1,17 @@  #ifndef __TRACE_EVENTS_H  #define __TRACE_EVENTS_H +#include <linux/trace_seq.h>  #include "trace.h" -typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, -					      int flags); - -struct trace_event { -	struct hlist_node	node; -	int			type; -	trace_print_func	trace; -	trace_print_func	raw; -	trace_print_func	hex; -	trace_print_func	binary; -}; -  extern enum print_line_t  trace_print_bprintk_msg_only(struct trace_iterator *iter);  extern enum print_line_t  trace_print_printk_msg_only(struct trace_iterator *iter); -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
-	__attribute__ ((format (printf, 2, 3))); -extern int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);  extern int  seq_print_ip_sym(struct trace_seq *s, unsigned long ip,  		unsigned long sym_flags); -extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, -				 size_t cnt); -extern int trace_seq_puts(struct trace_seq *s, const char *str); -extern int trace_seq_putc(struct trace_seq *s, unsigned char c); -extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); -extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, -				size_t len); -extern void *trace_seq_reserve(struct trace_seq *s, size_t len); -extern int trace_seq_path(struct trace_seq *s, struct path *path);  extern int seq_print_userip_objs(const struct userstack_entry *entry,  				 struct trace_seq *s, unsigned long sym_flags);  extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, @@ -44,13 +20,17 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,  extern int trace_print_context(struct trace_iterator *iter);  extern int trace_print_lat_context(struct trace_iterator *iter); +extern void trace_event_read_lock(void); +extern void trace_event_read_unlock(void);  extern struct trace_event *ftrace_find_event(int type); -extern int register_ftrace_event(struct trace_event *event); -extern int unregister_ftrace_event(struct trace_event *event);  extern enum print_line_t trace_nop_print(struct trace_iterator *iter,  					 int flags); +/* used by module unregistering */ +extern int __unregister_ftrace_event(struct trace_event *event); +extern struct rw_semaphore trace_event_mutex; +  #define MAX_MEMHEX_BYTES	8  #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1) diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 118439709fb..8a30d9874cd 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type,  static void probe_power_end(struct power_trace *it)  { +	struct ftrace_event_call *call = &event_power;  	struct ring_buffer_event *event;  	struct trace_power *entry;  	struct trace_array_cpu *data; @@ -54,7 +55,8 @@ static void probe_power_end(struct power_trace *it)  		goto out;  	entry	= ring_buffer_event_data(event);  	entry->state_data = *it; -	trace_buffer_unlock_commit(tr, event, 0, 0); +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		trace_buffer_unlock_commit(tr, event, 0, 0);   out:  	preempt_enable();  } @@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it)  static void probe_power_mark(struct power_trace *it, unsigned int type,  				unsigned int level)  { +	struct ftrace_event_call *call = &event_power;  	struct ring_buffer_event *event;  	struct trace_power *entry;  	struct trace_array_cpu *data; @@ -84,7 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,  		goto out;  	entry	= ring_buffer_event_data(event);  	entry->state_data = *it; -	trace_buffer_unlock_commit(tr, event, 0, 0); +	if (!filter_check_discard(call, entry, tr->buffer, event)) +		trace_buffer_unlock_commit(tr, event, 0, 0);   out:  	preempt_enable();  } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index eb81556107f..9bece9687b6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = {  static __init int init_trace_printk_function_export(void)  {  
	struct dentry *d_tracer; -	struct dentry *entry;  	d_tracer = tracing_init_dentry();  	if (!d_tracer)  		return 0; -	entry = debugfs_create_file("printk_formats", 0444, d_tracer, +	trace_create_file("printk_formats", 0444, d_tracer,  				    NULL, &ftrace_formats_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'printk_formats' entry\n");  	return 0;  } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9117cea6f1a..a98106dd979 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -10,7 +10,7 @@  #include <linux/kallsyms.h>  #include <linux/uaccess.h>  #include <linux/ftrace.h> -#include <trace/sched.h> +#include <trace/events/sched.h>  #include "trace.h" @@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,  	int cpu;  	int pc; -	if (!sched_ref || sched_stopped) +	if (unlikely(!sched_ref))  		return;  	tracing_record_cmdline(prev);  	tracing_record_cmdline(next); -	if (!tracer_enabled) +	if (!tracer_enabled || sched_stopped)  		return;  	pc = preempt_count(); @@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)  	unsigned long flags;  	int cpu, pc; -	if (!likely(tracer_enabled)) +	if (unlikely(!sched_ref))  		return; -	pc = preempt_count();  	tracing_record_cmdline(current); -	if (sched_stopped) +	if (!tracer_enabled || sched_stopped)  		return; +	pc = preempt_count();  	local_irq_save(flags);  	cpu = raw_smp_processor_id();  	data = ctx_trace->data[cpu]; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5bc00e8f153..eacb2722517 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@  #include <linux/kallsyms.h>  #include <linux/uaccess.h>  #include <linux/ftrace.h> -#include <trace/sched.h> +#include <trace/events/sched.h>  #include "trace.h" @@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,  	pc = preempt_count(); -	/* The task we are waiting for is waking up */ -	data = wakeup_trace->data[wakeup_cpu]; -  	/* disable local data, not wakeup_cpu data */  	cpu = raw_smp_processor_id();  	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); @@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,  	if (unlikely(!tracer_enabled || next != wakeup_task))  		goto out_unlock; +	/* The task we are waiting for is waking up */ +	data = wakeup_trace->data[wakeup_cpu]; +  	trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);  	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 08f4eb2763d..00dd6485bdd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)  	case TRACE_BRANCH:  	case TRACE_GRAPH_ENT:  	case TRACE_GRAPH_RET: +	case TRACE_HW_BRANCHES:  		return 1;  	}  	return 0; @@ -188,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  #else  # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })  #endif /* CONFIG_DYNAMIC_FTRACE */ +  /*   * Simple verification test of ftrace function tracer.   
* Enable ftrace, sleep 1/10 second, and then read the trace @@ -749,3 +751,59 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)  	return ret;  }  #endif /* CONFIG_BRANCH_TRACER */ + +#ifdef CONFIG_HW_BRANCH_TRACER +int +trace_selftest_startup_hw_branches(struct tracer *trace, +				   struct trace_array *tr) +{ +	struct trace_iterator *iter; +	struct tracer tracer; +	unsigned long count; +	int ret; + +	if (!trace->open) { +		printk(KERN_CONT "missing open function..."); +		return -1; +	} + +	ret = tracer_init(trace, tr); +	if (ret) { +		warn_failed_init_tracer(trace, ret); +		return ret; +	} + +	/* +	 * The hw-branch tracer needs to collect the trace from the various +	 * cpu trace buffers - before tracing is stopped. +	 */ +	iter = kzalloc(sizeof(*iter), GFP_KERNEL); +	if (!iter) +		return -ENOMEM; + +	memcpy(&tracer, trace, sizeof(tracer)); + +	iter->trace = &tracer; +	iter->tr = tr; +	iter->pos = -1; +	mutex_init(&iter->mutex); + +	trace->open(iter); + +	mutex_destroy(&iter->mutex); +	kfree(iter); + +	tracing_stop(); + +	ret = trace_test_buffer(tr, &count); +	trace->reset(tr); +	tracing_start(); + +	if (!ret && !count) { +		printk(KERN_CONT "no entries found.."); +		ret = -1; +	} + +	return ret; +} +#endif /* CONFIG_HW_BRANCH_TRACER */ diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c750f65f966..2d7aebd71db 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)  		seq_printf(m, "        Depth    Size   Location"  			   "    (%d entries)\n"  			   "        -----    ----   --------\n", -			   max_stack_trace.nr_entries); +			   max_stack_trace.nr_entries - 1);  		if (!stack_tracer_enabled && !max_stack_size)  			print_disabled(m); @@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace);  static __init int stack_trace_init(void)  {  	struct dentry *d_tracer; -	struct dentry *entry;  	d_tracer = tracing_init_dentry(); -	entry = debugfs_create_file("stack_max_size", 0644, d_tracer, -				    &max_stack_size, &stack_max_size_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'stack_max_size' entry\n"); +	trace_create_file("stack_max_size", 0644, d_tracer, +			&max_stack_size, &stack_max_size_fops); -	entry = debugfs_create_file("stack_trace", 0444, d_tracer, -				    NULL, &stack_trace_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'stack_trace' entry\n"); +	trace_create_file("stack_trace", 0444, d_tracer, +			NULL, &stack_trace_fops);  	if (stack_tracer_enabled)  		register_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index acdebd771a9..c00643733f4 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -1,7 +1,7 @@  /*   * Infrastructure for statistic tracing (histogram output).   * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>   *   * Based on the code from trace_branch.c which is   * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> @@ -10,22 +10,27 @@  #include <linux/list.h> +#include <linux/rbtree.h>  #include <linux/debugfs.h>  #include "trace_stat.h"  #include "trace.h" -/* List of stat entries from a tracer */ -struct trace_stat_list { -	struct list_head	list; +/* + * List of stat red-black nodes from a tracer + * We use a such tree to sort quickly the stat + * entries from the tracer. 
+ */ +struct stat_node { +	struct rb_node		node;  	void			*stat;  };  /* A stat session is the stats output in one file */ -struct tracer_stat_session { +struct stat_session {  	struct list_head	session_list;  	struct tracer_stat	*ts; -	struct list_head	stat_list; +	struct rb_root		stat_root;  	struct mutex		stat_mutex;  	struct dentry		*file;  }; @@ -37,18 +42,48 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);  /* The root directory for all stat files */  static struct dentry		*stat_dir; +/* + * Iterate through the rbtree using a post order traversal path + * to release the next node. + * It won't necessary release one at each iteration + * but it will at least advance closer to the next one + * to be released. + */ +static struct rb_node *release_next(struct rb_node *node) +{ +	struct stat_node *snode; +	struct rb_node *parent = rb_parent(node); + +	if (node->rb_left) +		return node->rb_left; +	else if (node->rb_right) +		return node->rb_right; +	else { +		if (!parent) +			; +		else if (parent->rb_left == node) +			parent->rb_left = NULL; +		else +			parent->rb_right = NULL; -static void reset_stat_session(struct tracer_stat_session *session) +		snode = container_of(node, struct stat_node, node); +		kfree(snode); + +		return parent; +	} +} + +static void reset_stat_session(struct stat_session *session)  { -	struct trace_stat_list *node, *next; +	struct rb_node *node = session->stat_root.rb_node; -	list_for_each_entry_safe(node, next, &session->stat_list, list) -		kfree(node); +	while (node) +		node = release_next(node); -	INIT_LIST_HEAD(&session->stat_list); +	session->stat_root = RB_ROOT;  } -static void destroy_session(struct tracer_stat_session *session) +static void destroy_session(struct stat_session *session)  {  	debugfs_remove(session->file);  	reset_stat_session(session); @@ -56,25 +91,60 @@ static void destroy_session(struct tracer_stat_session *session)  	kfree(session);  } +typedef int (*cmp_stat_t)(void *, void *); + +static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) +{ +	struct rb_node **new = &(root->rb_node), *parent = NULL; +	struct stat_node *data; + +	data = kzalloc(sizeof(*data), GFP_KERNEL); +	if (!data) +		return -ENOMEM; +	data->stat = stat; + +	/* +	 * Figure out where to put new node +	 * This is a descendent sorting +	 */ +	while (*new) { +		struct stat_node *this; +		int result; + +		this = container_of(*new, struct stat_node, node); +		result = cmp(data->stat, this->stat); + +		parent = *new; +		if (result >= 0) +			new = &((*new)->rb_left); +		else +			new = &((*new)->rb_right); +	} + +	rb_link_node(&data->node, parent, new); +	rb_insert_color(&data->node, root); +	return 0; +} +  /*   * For tracers that don't provide a stat_cmp callback. - * This one will force an immediate insertion on tail of - * the list. + * This one will force an insertion as right-most node + * in the rbtree.   */  static int dummy_cmp(void *p1, void *p2)  { -	return 1; +	return -1;  }  /* - * Initialize the stat list at each trace_stat file opening. + * Initialize the stat rbtree at each trace_stat file opening.   * All of these copies and sorting are required on all opening   * since the stats could have changed between two file sessions.   
*/ -static int stat_seq_init(struct tracer_stat_session *session) +static int stat_seq_init(struct stat_session *session)  { -	struct trace_stat_list *iter_entry, *new_entry;  	struct tracer_stat *ts = session->ts; +	struct rb_root *root = &session->stat_root;  	void *stat;  	int ret = 0;  	int i; @@ -85,29 +155,16 @@ static int stat_seq_init(struct tracer_stat_session *session)  	if (!ts->stat_cmp)  		ts->stat_cmp = dummy_cmp; -	stat = ts->stat_start(); +	stat = ts->stat_start(ts);  	if (!stat)  		goto exit; -	/* -	 * The first entry. Actually this is the second, but the first -	 * one (the stat_list head) is pointless. -	 */ -	new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); -	if (!new_entry) { -		ret = -ENOMEM; +	ret = insert_stat(root, stat, ts->stat_cmp); +	if (ret)  		goto exit; -	} - -	INIT_LIST_HEAD(&new_entry->list); - -	list_add(&new_entry->list, &session->stat_list); - -	new_entry->stat = stat;  	/* -	 * Iterate over the tracer stat entries and store them in a sorted -	 * list. +	 * Iterate over the tracer stat entries and store them in an rbtree.  	 */  	for (i = 1; ; i++) {  		stat = ts->stat_next(stat, i); @@ -116,36 +173,16 @@ static int stat_seq_init(struct tracer_stat_session *session)  		if (!stat)  			break; -		new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); -		if (!new_entry) { -			ret = -ENOMEM; -			goto exit_free_list; -		} - -		INIT_LIST_HEAD(&new_entry->list); -		new_entry->stat = stat; - -		list_for_each_entry_reverse(iter_entry, &session->stat_list, -				list) { - -			/* Insertion with a descendent sorting */ -			if (ts->stat_cmp(iter_entry->stat, -					new_entry->stat) >= 0) { - -				list_add(&new_entry->list, &iter_entry->list); -				break; -			} -		} - -		/* The current larger value */ -		if (list_empty(&new_entry->list)) -			list_add(&new_entry->list, &session->stat_list); +		ret = insert_stat(root, stat, ts->stat_cmp); +		if (ret) +			goto exit_free_rbtree;  	} +  exit:  	mutex_unlock(&session->stat_mutex);  	return ret; -exit_free_list: +exit_free_rbtree:  	reset_stat_session(session);  	mutex_unlock(&session->stat_mutex);  	return ret; @@ -154,38 +191,51 @@ exit_free_list:  static void *stat_seq_start(struct seq_file *s, loff_t *pos)  { -	struct tracer_stat_session *session = s->private; +	struct stat_session *session = s->private; +	struct rb_node *node; +	int i; -	/* Prevent from tracer switch or stat_list modification */ +	/* Prevent from tracer switch or rbtree modification */  	mutex_lock(&session->stat_mutex);  	/* If we are in the beginning of the file, print the headers */ -	if (!*pos && session->ts->stat_headers) +	if (!*pos && session->ts->stat_headers) { +		(*pos)++;  		return SEQ_START_TOKEN; +	} + +	node = rb_first(&session->stat_root); +	for (i = 0; node && i < *pos; i++) +		node = rb_next(node); + +	(*pos)++; -	return seq_list_start(&session->stat_list, *pos); +	return node;  }  static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)  { -	struct tracer_stat_session *session = s->private; +	struct stat_session *session = s->private; +	struct rb_node *node = p; + +	(*pos)++;  	if (p == SEQ_START_TOKEN) -		return seq_list_start(&session->stat_list, *pos); +		return rb_first(&session->stat_root); -	return seq_list_next(p, &session->stat_list, pos); +	return rb_next(node);  }  static void stat_seq_stop(struct seq_file *s, void *p)  { -	struct tracer_stat_session *session = s->private; +	struct stat_session *session = s->private;  	mutex_unlock(&session->stat_mutex);  }  static int stat_seq_show(struct 
seq_file *s, void *v)  { -	struct tracer_stat_session *session = s->private; -	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); +	struct stat_session *session = s->private; +	struct stat_node *l = container_of(v, struct stat_node, node);  	if (v == SEQ_START_TOKEN)  		return session->ts->stat_headers(s); @@ -205,7 +255,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)  {  	int ret; -	struct tracer_stat_session *session = inode->i_private; +	struct stat_session *session = inode->i_private;  	ret = seq_open(file, &trace_stat_seq_ops);  	if (!ret) { @@ -218,11 +268,11 @@ static int tracing_stat_open(struct inode *inode, struct file *file)  }  /* - * Avoid consuming memory with our now useless list. + * Avoid consuming memory with our now useless rbtree.   */  static int tracing_stat_release(struct inode *i, struct file *f)  { -	struct tracer_stat_session *session = i->i_private; +	struct stat_session *session = i->i_private;  	mutex_lock(&session->stat_mutex);  	reset_stat_session(session); @@ -251,7 +301,7 @@ static int tracing_stat_init(void)  	return 0;  } -static int init_stat_file(struct tracer_stat_session *session) +static int init_stat_file(struct stat_session *session)  {  	if (!stat_dir && tracing_stat_init())  		return -ENODEV; @@ -266,7 +316,7 @@ static int init_stat_file(struct tracer_stat_session *session)  int register_stat_tracer(struct tracer_stat *trace)  { -	struct tracer_stat_session *session, *node, *tmp; +	struct stat_session *session, *node;  	int ret;  	if (!trace) @@ -277,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace)  	/* Already registered? */  	mutex_lock(&all_stat_sessions_mutex); -	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { +	list_for_each_entry(node, &all_stat_sessions, session_list) {  		if (node->ts == trace) {  			mutex_unlock(&all_stat_sessions_mutex);  			return -EINVAL; @@ -286,15 +336,13 @@ int register_stat_tracer(struct tracer_stat *trace)  	mutex_unlock(&all_stat_sessions_mutex);  	/* Init the session */ -	session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); +	session = kzalloc(sizeof(*session), GFP_KERNEL);  	if (!session)  		return -ENOMEM;  	session->ts = trace;  	INIT_LIST_HEAD(&session->session_list); -	INIT_LIST_HEAD(&session->stat_list);  	mutex_init(&session->stat_mutex); -	session->file = NULL;  	ret = init_stat_file(session);  	if (ret) { @@ -312,7 +360,7 @@ int register_stat_tracer(struct tracer_stat *trace)  void unregister_stat_tracer(struct tracer_stat *trace)  { -	struct tracer_stat_session *node, *tmp; +	struct stat_session *node, *tmp;  	mutex_lock(&all_stat_sessions_mutex);  	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 202274cf7f3..f3546a2cd82 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -12,7 +12,7 @@ struct tracer_stat {  	/* The name of your stat file */  	const char		*name;  	/* Iteration over statistic entries */ -	void			*(*stat_start)(void); +	void			*(*stat_start)(struct tracer_stat *trace);  	void			*(*stat_next)(void *prev, int idx);  	/* Compare two entries for stats sorting */  	int			(*stat_cmp)(void *p1, void *p2); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 91fd19c2149..e04b76cc238 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = {  void 
init_tracer_sysprof_debugfs(struct dentry *d_tracer)  { -	struct dentry *entry; -	entry = debugfs_create_file("sysprof_sample_period", 0644, +	trace_create_file("sysprof_sample_period", 0644,  			d_tracer, NULL, &sysprof_sample_fops); -	if (entry) -		return; -	pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");  } diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 797201e4a13..97fcea4acce 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -6,7 +6,7 @@   */ -#include <trace/workqueue.h> +#include <trace/events/workqueue.h>  #include <linux/list.h>  #include <linux/percpu.h>  #include "trace_stat.h" @@ -16,8 +16,6 @@  /* A cpu workqueue thread */  struct cpu_workqueue_stats {  	struct list_head            list; -/* Useful to know if we print the cpu headers */ -	bool		            first_entry;  	int		            cpu;  	pid_t			    pid;  /* Can be inserted from interrupt or user context, need to be atomic */ @@ -47,12 +45,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread,  			  struct work_struct *work)  {  	int cpu = cpumask_first(&wq_thread->cpus_allowed); -	struct cpu_workqueue_stats *node, *next; +	struct cpu_workqueue_stats *node;  	unsigned long flags;  	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, -							list) { +	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {  		if (node->pid == wq_thread->pid) {  			atomic_inc(&node->inserted);  			goto found; @@ -69,12 +66,11 @@ probe_workqueue_execution(struct task_struct *wq_thread,  			  struct work_struct *work)  {  	int cpu = cpumask_first(&wq_thread->cpus_allowed); -	struct cpu_workqueue_stats *node, *next; +	struct cpu_workqueue_stats *node;  	unsigned long flags;  	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, -							list) { +	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {  		if (node->pid == wq_thread->pid) {  			node->executed++;  			goto found; @@ -105,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)  	cws->pid = wq_thread->pid;  	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	if (list_empty(&workqueue_cpu_stat(cpu)->list)) -		cws->first_entry = true;  	list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);  	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);  } @@ -152,7 +146,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)  	return ret;  } -static void *workqueue_stat_start(void) +static void *workqueue_stat_start(struct tracer_stat *trace)  {  	int cpu;  	void *ret = NULL; @@ -191,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx)  static int workqueue_stat_show(struct seq_file *s, void *p)  {  	struct cpu_workqueue_stats *cws = p; -	unsigned long flags; -	int cpu = cws->cpu;  	struct pid *pid;  	struct task_struct *tsk; -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	if (&cws->list == workqueue_cpu_stat(cpu)->list.next) -		seq_printf(s, "\n"); -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -  	pid = find_get_pid(cws->pid);  	if (pid) {  		tsk = get_pid_task(pid, PIDTYPE_PID); diff --git a/kernel/wait.c b/kernel/wait.c index 42a2dbc181c..ea7c3b4275c 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,  	if 
(!list_empty(&wait->task_list))  		list_del_init(&wait->task_list);  	else if (waitqueue_active(q)) -		__wake_up_common(q, mode, 1, 0, key); +		__wake_up_locked_key(q, mode, key);  	spin_unlock_irqrestore(&q->lock, flags);  }  EXPORT_SYMBOL(abort_exclusive_wait); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f71fb2a0895..0668795d881 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,7 +33,8 @@  #include <linux/kallsyms.h>  #include <linux/debug_locks.h>  #include <linux/lockdep.h> -#include <trace/workqueue.h> +#define CREATE_TRACE_POINTS +#include <trace/events/workqueue.h>  /*   * The per-CPU workqueue (if single thread, we always use the first @@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)  	return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);  } -DEFINE_TRACE(workqueue_insertion); -  static void insert_work(struct cpu_workqueue_struct *cwq,  			struct work_struct *work, struct list_head *head)  { @@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,  }  EXPORT_SYMBOL_GPL(queue_delayed_work_on); -DEFINE_TRACE(workqueue_execution); -  static void run_workqueue(struct cpu_workqueue_struct *cwq)  {  	spin_lock_irq(&cwq->lock); @@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)  	return cwq;  } -DEFINE_TRACE(workqueue_creation); -  static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)  {  	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; @@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,  }  EXPORT_SYMBOL_GPL(__create_workqueue_key); -DEFINE_TRACE(workqueue_destruction); -  static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)  {  	/*  |