Diffstat (limited to 'kernel')
68 files changed, 6662 insertions, 2480 deletions
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer new file mode 100644 index 00000000000..a3bb4cb5253 --- /dev/null +++ b/kernel/Kconfig.freezer @@ -0,0 +1,2 @@ +config FREEZER +	def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Makefile b/kernel/Makefile index 4e1d7df7c3e..305f11dbef2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg  CFLAGS_REMOVE_sched.o = -mno-spe -pg  endif +obj-$(CONFIG_FREEZER) += freezer.o  obj-$(CONFIG_PROFILING) += profile.o  obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o  obj-$(CONFIG_STACKTRACE) += stacktrace.o @@ -55,6 +56,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o  obj-$(CONFIG_COMPAT) += compat.o  obj-$(CONFIG_CGROUPS) += cgroup.o  obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o  obj-$(CONFIG_CPUSETS) += cpuset.o  obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o  obj-$(CONFIG_UTS_NS) += utsname.o @@ -83,6 +85,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o  obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o  obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_TRACEPOINTS) += tracepoint.o  obj-$(CONFIG_LATENCYTOP) += latencytop.o  obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o  obj-$(CONFIG_FTRACE) += trace/ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c6e1c17e6d..046c1609606 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg)  	struct cg_cgroup_link *link;  	struct cg_cgroup_link *saved_link; -	write_lock(&css_set_lock);  	hlist_del(&cg->hlist);  	css_set_count--; @@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg)  		list_del(&link->cgrp_link_list);  		kfree(link);  	} - -	write_unlock(&css_set_lock);  } -static void __release_css_set(struct kref *k, int taskexit) +static void __put_css_set(struct css_set *cg, int taskexit)  {  	int i; -	struct css_set *cg = container_of(k, struct css_set, ref); - +	/* +	 * Ensure that the refcount doesn't hit zero while any readers +	 * can see it. 
Similar to atomic_dec_and_lock(), but for an +	 * rwlock +	 */ +	if (atomic_add_unless(&cg->refcount, -1, 1)) +		return; +	write_lock(&css_set_lock); +	if (!atomic_dec_and_test(&cg->refcount)) { +		write_unlock(&css_set_lock); +		return; +	}  	unlink_css_set(cg); +	write_unlock(&css_set_lock);  	rcu_read_lock();  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { @@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit)  	kfree(cg);  } -static void release_css_set(struct kref *k) -{ -	__release_css_set(k, 0); -} - -static void release_css_set_taskexit(struct kref *k) -{ -	__release_css_set(k, 1); -} -  /*   * refcounted get/put for css_set objects   */  static inline void get_css_set(struct css_set *cg)  { -	kref_get(&cg->ref); +	atomic_inc(&cg->refcount);  }  static inline void put_css_set(struct css_set *cg)  { -	kref_put(&cg->ref, release_css_set); +	__put_css_set(cg, 0);  }  static inline void put_css_set_taskexit(struct css_set *cg)  { -	kref_put(&cg->ref, release_css_set_taskexit); +	__put_css_set(cg, 1);  }  /* @@ -427,7 +425,7 @@ static struct css_set *find_css_set(  		return NULL;  	} -	kref_init(&res->ref); +	atomic_set(&res->refcount, 1);  	INIT_LIST_HEAD(&res->cg_links);  	INIT_LIST_HEAD(&res->tasks);  	INIT_HLIST_NODE(&res->hlist); @@ -870,6 +868,14 @@ static struct super_operations cgroup_ops = {  	.remount_fs = cgroup_remount,  }; +static void init_cgroup_housekeeping(struct cgroup *cgrp) +{ +	INIT_LIST_HEAD(&cgrp->sibling); +	INIT_LIST_HEAD(&cgrp->children); +	INIT_LIST_HEAD(&cgrp->css_sets); +	INIT_LIST_HEAD(&cgrp->release_list); +	init_rwsem(&cgrp->pids_mutex); +}  static void init_cgroup_root(struct cgroupfs_root *root)  {  	struct cgroup *cgrp = &root->top_cgroup; @@ -878,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)  	root->number_of_cgroups = 1;  	cgrp->root = root;  	cgrp->top_cgroup = cgrp; -	INIT_LIST_HEAD(&cgrp->sibling); -	INIT_LIST_HEAD(&cgrp->children); -	INIT_LIST_HEAD(&cgrp->css_sets); -	INIT_LIST_HEAD(&cgrp->release_list); +	init_cgroup_housekeeping(cgrp);  }  static int cgroup_test_super(struct super_block *sb, void *data) @@ -1728,7 +1731,7 @@ int cgroup_task_count(const struct cgroup *cgrp)  	read_lock(&css_set_lock);  	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { -		count += atomic_read(&link->cg->ref.refcount); +		count += atomic_read(&link->cg->refcount);  	}  	read_unlock(&css_set_lock);  	return count; @@ -1997,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)   * but we cannot guarantee that the information we produce is correct   * unless we produce it entirely atomically.   * - * Upon tasks file open(), a struct ctr_struct is allocated, that - * will have a pointer to an array (also allocated here).  The struct - * ctr_struct * is stored in file->private_data.  Its resources will - * be freed by release() when the file is closed.  The array is used - * to sprintf the PIDs and then used by read().   */ -struct ctr_struct { -	char *buf; -	int bufsz; -};  /*   * Load into 'pidarray' up to 'npids' of the tasks using cgroup @@ -2088,42 +2082,132 @@ static int cmppid(const void *a, const void *b)  	return *(pid_t *)a - *(pid_t *)b;  } +  /* - * Convert array 'a' of 'npids' pid_t's to a string of newline separated - * decimal pids in 'buf'.  Don't write more than 'sz' chars, but return - * count 'cnt' of how many chars would be written if buf were large enough. + * seq_file methods for the "tasks" file. 
The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->tasks_pids array.   */ -static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) + +static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)  { -	int cnt = 0; -	int i; +	/* +	 * Initially we receive a position value that corresponds to +	 * one more than the last pid shown (or 0 on the first call or +	 * after a seek to the start). Use a binary-search to find the +	 * next pid to display, if any +	 */ +	struct cgroup *cgrp = s->private; +	int index = 0, pid = *pos; +	int *iter; + +	down_read(&cgrp->pids_mutex); +	if (pid) { +		int end = cgrp->pids_length; +		int i; +		while (index < end) { +			int mid = (index + end) / 2; +			if (cgrp->tasks_pids[mid] == pid) { +				index = mid; +				break; +			} else if (cgrp->tasks_pids[mid] <= pid) +				index = mid + 1; +			else +				end = mid; +		} +	} +	/* If we're off the end of the array, we're done */ +	if (index >= cgrp->pids_length) +		return NULL; +	/* Update the abstract position to be the actual pid that we found */ +	iter = cgrp->tasks_pids + index; +	*pos = *iter; +	return iter; +} + +static void cgroup_tasks_stop(struct seq_file *s, void *v) +{ +	struct cgroup *cgrp = s->private; +	up_read(&cgrp->pids_mutex); +} + +static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +{ +	struct cgroup *cgrp = s->private; +	int *p = v; +	int *end = cgrp->tasks_pids + cgrp->pids_length; + +	/* +	 * Advance to the next pid in the array. If this goes off the +	 * end, we're done +	 */ +	p++; +	if (p >= end) { +		return NULL; +	} else { +		*pos = *p; +		return p; +	} +} + +static int cgroup_tasks_show(struct seq_file *s, void *v) +{ +	return seq_printf(s, "%d\n", *(int *)v); +} + +static struct seq_operations cgroup_tasks_seq_operations = { +	.start = cgroup_tasks_start, +	.stop = cgroup_tasks_stop, +	.next = cgroup_tasks_next, +	.show = cgroup_tasks_show, +}; + +static void release_cgroup_pid_array(struct cgroup *cgrp) +{ +	down_write(&cgrp->pids_mutex); +	BUG_ON(!cgrp->pids_use_count); +	if (!--cgrp->pids_use_count) { +		kfree(cgrp->tasks_pids); +		cgrp->tasks_pids = NULL; +		cgrp->pids_length = 0; +	} +	up_write(&cgrp->pids_mutex); +} + +static int cgroup_tasks_release(struct inode *inode, struct file *file) +{ +	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + +	if (!(file->f_mode & FMODE_READ)) +		return 0; -	for (i = 0; i < npids; i++) -		cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); -	return cnt; +	release_cgroup_pid_array(cgrp); +	return seq_release(inode, file);  } +static struct file_operations cgroup_tasks_operations = { +	.read = seq_read, +	.llseek = seq_lseek, +	.write = cgroup_file_write, +	.release = cgroup_tasks_release, +}; +  /* - * Handle an open on 'tasks' file.  Prepare a buffer listing the + * Handle an open on 'tasks' file.  Prepare an array containing the   * process id's of tasks currently attached to the cgroup being opened. - * - * Does not require any specific cgroup mutexes, and does not take any.   
*/ +  static int cgroup_tasks_open(struct inode *unused, struct file *file)  {  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); -	struct ctr_struct *ctr;  	pid_t *pidarray;  	int npids; -	char c; +	int retval; +	/* Nothing to do for write-only files */  	if (!(file->f_mode & FMODE_READ))  		return 0; -	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); -	if (!ctr) -		goto err0; -  	/*  	 * If cgroup gets more users after we read count, we won't have  	 * enough space - tough.  This race is indistinguishable to the @@ -2131,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)  	 * show up until sometime later on.  	 */  	npids = cgroup_task_count(cgrp); -	if (npids) { -		pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); -		if (!pidarray) -			goto err1; - -		npids = pid_array_load(pidarray, npids, cgrp); -		sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - -		/* Call pid_array_to_buf() twice, first just to get bufsz */ -		ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; -		ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); -		if (!ctr->buf) -			goto err2; -		ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); - -		kfree(pidarray); -	} else { -		ctr->buf = NULL; -		ctr->bufsz = 0; -	} -	file->private_data = ctr; -	return 0; - -err2: -	kfree(pidarray); -err1: -	kfree(ctr); -err0: -	return -ENOMEM; -} - -static ssize_t cgroup_tasks_read(struct cgroup *cgrp, -				    struct cftype *cft, -				    struct file *file, char __user *buf, -				    size_t nbytes, loff_t *ppos) -{ -	struct ctr_struct *ctr = file->private_data; +	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); +	if (!pidarray) +		return -ENOMEM; +	npids = pid_array_load(pidarray, npids, cgrp); +	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); -	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); -} +	/* +	 * Store the array in the cgroup, freeing the old +	 * array if necessary +	 */ +	down_write(&cgrp->pids_mutex); +	kfree(cgrp->tasks_pids); +	cgrp->tasks_pids = pidarray; +	cgrp->pids_length = npids; +	cgrp->pids_use_count++; +	up_write(&cgrp->pids_mutex); -static int cgroup_tasks_release(struct inode *unused_inode, -					struct file *file) -{ -	struct ctr_struct *ctr; +	file->f_op = &cgroup_tasks_operations; -	if (file->f_mode & FMODE_READ) { -		ctr = file->private_data; -		kfree(ctr->buf); -		kfree(ctr); +	retval = seq_open(file, &cgroup_tasks_seq_operations); +	if (retval) { +		release_cgroup_pid_array(cgrp); +		return retval;  	} +	((struct seq_file *)file->private_data)->private = cgrp;  	return 0;  } @@ -2210,7 +2268,6 @@ static struct cftype files[] = {  	{  		.name = "tasks",  		.open = cgroup_tasks_open, -		.read = cgroup_tasks_read,  		.write_u64 = cgroup_tasks_write,  		.release = cgroup_tasks_release,  		.private = FILE_TASKLIST, @@ -2300,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	mutex_lock(&cgroup_mutex); -	INIT_LIST_HEAD(&cgrp->sibling); -	INIT_LIST_HEAD(&cgrp->children); -	INIT_LIST_HEAD(&cgrp->css_sets); -	INIT_LIST_HEAD(&cgrp->release_list); +	init_cgroup_housekeeping(cgrp);  	cgrp->parent = parent;  	cgrp->root = parent->root; @@ -2495,8 +2549,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)  int __init cgroup_init_early(void)  {  	int i; -	kref_init(&init_css_set.ref); -	kref_get(&init_css_set.ref); +	atomic_set(&init_css_set.refcount, 1);  	INIT_LIST_HEAD(&init_css_set.cg_links);  	INIT_LIST_HEAD(&init_css_set.tasks);  	
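The __put_css_set() change above is, as its own comment says, an atomic_dec_and_lock() analogue for an rwlock: the reference is dropped without css_set_lock unless the count may reach zero, and the write lock is taken only around the final decrement so readers never observe a zero refcount. Below is a minimal userspace sketch of that pattern, assuming C11 atomics and a pthread rwlock as stand-ins for the kernel primitives; the names (struct obj, obj_put, obj_lock) are illustrative and not from the patch.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;
	/* payload is unlinked from shared lists only under obj_lock */
};

static pthread_rwlock_t obj_lock = PTHREAD_RWLOCK_INITIALIZER;

static void obj_put(struct obj *o)
{
	int old = atomic_load(&o->refcount);

	/*
	 * Fast path, the userspace analogue of atomic_add_unless(&ref, -1, 1):
	 * decrement lock-free as long as this is provably not the last put.
	 */
	while (old > 1) {
		if (atomic_compare_exchange_weak(&o->refcount, &old, old - 1))
			return;
	}

	/* Slow path: the object may be going away, serialize against readers. */
	pthread_rwlock_wrlock(&obj_lock);
	if (atomic_fetch_sub(&o->refcount, 1) != 1) {
		pthread_rwlock_unlock(&obj_lock);
		return;
	}
	/* unlink o from shared structures here, still holding the write lock */
	pthread_rwlock_unlock(&obj_lock);
	free(o);
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->refcount, 2);
	obj_put(o);	/* fast path: 2 -> 1, no lock taken */
	obj_put(o);	/* slow path: last reference, object is freed */
	return 0;
}

The point of the two-step put, both here and in __put_css_set(), is that a plain decrement-and-test before taking the lock could let the count hit zero while a holder of the read lock can still see, and try to re-get, the object.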
INIT_HLIST_NODE(&init_css_set.hlist); diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c index c3dc3aba4c0..daca6209202 100644 --- a/kernel/cgroup_debug.c +++ b/kernel/cgroup_debug.c @@ -57,7 +57,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,  	u64 count;  	rcu_read_lock(); -	count = atomic_read(¤t->cgroups->ref.refcount); +	count = atomic_read(¤t->cgroups->refcount);  	rcu_read_unlock();  	return count;  } @@ -90,7 +90,7 @@ static struct cftype files[] =  {  	{  		.name = "releasable",  		.read_u64 = releasable_read, -	} +	},  };  static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c new file mode 100644 index 00000000000..e9505695449 --- /dev/null +++ b/kernel/cgroup_freezer.c @@ -0,0 +1,379 @@ +/* + * cgroup_freezer.c -  control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater <clg@fr.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/module.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/freezer.h> +#include <linux/seq_file.h> + +enum freezer_state { +	CGROUP_THAWED = 0, +	CGROUP_FREEZING, +	CGROUP_FROZEN, +}; + +struct freezer { +	struct cgroup_subsys_state css; +	enum freezer_state state; +	spinlock_t lock; /* protects _writes_ to state */ +}; + +static inline struct freezer *cgroup_freezer( +		struct cgroup *cgroup) +{ +	return container_of( +		cgroup_subsys_state(cgroup, freezer_subsys_id), +		struct freezer, css); +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ +	return container_of(task_subsys_state(task, freezer_subsys_id), +			    struct freezer, css); +} + +int cgroup_frozen(struct task_struct *task) +{ +	struct freezer *freezer; +	enum freezer_state state; + +	task_lock(task); +	freezer = task_freezer(task); +	state = freezer->state; +	task_unlock(task); + +	return state == CGROUP_FROZEN; +} + +/* + * cgroups_write_string() limits the size of freezer state strings to + * CGROUP_LOCAL_BUFFER_SIZE + */ +static const char *freezer_state_strs[] = { +	"THAWED", +	"FREEZING", +	"FROZEN", +}; + +/* + * State diagram + * Transitions are caused by userspace writes to the freezer.state file. + * The values in parenthesis are state labels. The rest are edge labels. 
+ * + * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) + *    ^ ^                    |                     | + *    | \_______THAWED_______/                     | + *    \__________________________THAWED____________/ + */ + +struct cgroup_subsys freezer_subsys; + +/* Locks taken and their ordering + * ------------------------------ + * css_set_lock + * cgroup_mutex (AKA cgroup_lock) + * task->alloc_lock (AKA task_lock) + * freezer->lock + * task->sighand->siglock + * + * cgroup code forces css_set_lock to be taken before task->alloc_lock + * + * freezer_create(), freezer_destroy(): + * cgroup_mutex [ by cgroup core ] + * + * can_attach(): + * cgroup_mutex + * + * cgroup_frozen(): + * task->alloc_lock (to get task's cgroup) + * + * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): + * task->alloc_lock (to get task's cgroup) + * freezer->lock + *  sighand->siglock (if the cgroup is freezing) + * + * freezer_read(): + * cgroup_mutex + *  freezer->lock + *   read_lock css_set_lock (cgroup iterator start) + * + * freezer_write() (freeze): + * cgroup_mutex + *  freezer->lock + *   read_lock css_set_lock (cgroup iterator start) + *    sighand->siglock + * + * freezer_write() (unfreeze): + * cgroup_mutex + *  freezer->lock + *   read_lock css_set_lock (cgroup iterator start) + *    task->alloc_lock (to prevent races with freeze_task()) + *     sighand->siglock + */ +static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, +						  struct cgroup *cgroup) +{ +	struct freezer *freezer; + +	freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); +	if (!freezer) +		return ERR_PTR(-ENOMEM); + +	spin_lock_init(&freezer->lock); +	freezer->state = CGROUP_THAWED; +	return &freezer->css; +} + +static void freezer_destroy(struct cgroup_subsys *ss, +			    struct cgroup *cgroup) +{ +	kfree(cgroup_freezer(cgroup)); +} + +/* Task is frozen or will freeze immediately when next it gets woken */ +static bool is_task_frozen_enough(struct task_struct *task) +{ +	return frozen(task) || +		(task_is_stopped_or_traced(task) && freezing(task)); +} + +/* + * The call to cgroup_lock() in the freezer.state write method prevents + * a write to that file racing against an attach, and hence the + * can_attach() result will remain valid until the attach completes. + */ +static int freezer_can_attach(struct cgroup_subsys *ss, +			      struct cgroup *new_cgroup, +			      struct task_struct *task) +{ +	struct freezer *freezer; +	int retval; + +	/* Anything frozen can't move or be moved to/from */ + +	if (is_task_frozen_enough(task)) +		return -EBUSY; + +	freezer = cgroup_freezer(new_cgroup); +	if (freezer->state == CGROUP_FROZEN) +		return -EBUSY; + +	retval = 0; +	task_lock(task); +	freezer = task_freezer(task); +	if (freezer->state == CGROUP_FROZEN) +		retval = -EBUSY; +	task_unlock(task); +	return retval; +} + +static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +{ +	struct freezer *freezer; + +	task_lock(task); +	freezer = task_freezer(task); +	task_unlock(task); + +	BUG_ON(freezer->state == CGROUP_FROZEN); +	spin_lock_irq(&freezer->lock); +	/* Locking avoids race with FREEZING -> THAWED transitions. 
*/ +	if (freezer->state == CGROUP_FREEZING) +		freeze_task(task, true); +	spin_unlock_irq(&freezer->lock); +} + +/* + * caller must hold freezer->lock + */ +static void update_freezer_state(struct cgroup *cgroup, +				 struct freezer *freezer) +{ +	struct cgroup_iter it; +	struct task_struct *task; +	unsigned int nfrozen = 0, ntotal = 0; + +	cgroup_iter_start(cgroup, &it); +	while ((task = cgroup_iter_next(cgroup, &it))) { +		ntotal++; +		if (is_task_frozen_enough(task)) +			nfrozen++; +	} + +	/* +	 * Transition to FROZEN when no new tasks can be added ensures +	 * that we never exist in the FROZEN state while there are unfrozen +	 * tasks. +	 */ +	if (nfrozen == ntotal) +		freezer->state = CGROUP_FROZEN; +	else if (nfrozen > 0) +		freezer->state = CGROUP_FREEZING; +	else +		freezer->state = CGROUP_THAWED; +	cgroup_iter_end(cgroup, &it); +} + +static int freezer_read(struct cgroup *cgroup, struct cftype *cft, +			struct seq_file *m) +{ +	struct freezer *freezer; +	enum freezer_state state; + +	if (!cgroup_lock_live_group(cgroup)) +		return -ENODEV; + +	freezer = cgroup_freezer(cgroup); +	spin_lock_irq(&freezer->lock); +	state = freezer->state; +	if (state == CGROUP_FREEZING) { +		/* We change from FREEZING to FROZEN lazily if the cgroup was +		 * only partially frozen when we exitted write. */ +		update_freezer_state(cgroup, freezer); +		state = freezer->state; +	} +	spin_unlock_irq(&freezer->lock); +	cgroup_unlock(); + +	seq_puts(m, freezer_state_strs[state]); +	seq_putc(m, '\n'); +	return 0; +} + +static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ +	struct cgroup_iter it; +	struct task_struct *task; +	unsigned int num_cant_freeze_now = 0; + +	freezer->state = CGROUP_FREEZING; +	cgroup_iter_start(cgroup, &it); +	while ((task = cgroup_iter_next(cgroup, &it))) { +		if (!freeze_task(task, true)) +			continue; +		if (is_task_frozen_enough(task)) +			continue; +		if (!freezing(task) && !freezer_should_skip(task)) +			num_cant_freeze_now++; +	} +	cgroup_iter_end(cgroup, &it); + +	return num_cant_freeze_now ? 
-EBUSY : 0; +} + +static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ +	struct cgroup_iter it; +	struct task_struct *task; + +	cgroup_iter_start(cgroup, &it); +	while ((task = cgroup_iter_next(cgroup, &it))) { +		int do_wake; + +		task_lock(task); +		do_wake = __thaw_process(task); +		task_unlock(task); +		if (do_wake) +			wake_up_process(task); +	} +	cgroup_iter_end(cgroup, &it); +	freezer->state = CGROUP_THAWED; + +	return 0; +} + +static int freezer_change_state(struct cgroup *cgroup, +				enum freezer_state goal_state) +{ +	struct freezer *freezer; +	int retval = 0; + +	freezer = cgroup_freezer(cgroup); +	spin_lock_irq(&freezer->lock); +	update_freezer_state(cgroup, freezer); +	if (goal_state == freezer->state) +		goto out; +	switch (freezer->state) { +	case CGROUP_THAWED: +		retval = try_to_freeze_cgroup(cgroup, freezer); +		break; +	case CGROUP_FREEZING: +		if (goal_state == CGROUP_FROZEN) { +			/* Userspace is retrying after +			 * "/bin/echo FROZEN > freezer.state" returned -EBUSY */ +			retval = try_to_freeze_cgroup(cgroup, freezer); +			break; +		} +		/* state == FREEZING and goal_state == THAWED, so unfreeze */ +	case CGROUP_FROZEN: +		retval = unfreeze_cgroup(cgroup, freezer); +		break; +	default: +		break; +	} +out: +	spin_unlock_irq(&freezer->lock); + +	return retval; +} + +static int freezer_write(struct cgroup *cgroup, +			 struct cftype *cft, +			 const char *buffer) +{ +	int retval; +	enum freezer_state goal_state; + +	if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) +		goal_state = CGROUP_THAWED; +	else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) +		goal_state = CGROUP_FROZEN; +	else +		return -EIO; + +	if (!cgroup_lock_live_group(cgroup)) +		return -ENODEV; +	retval = freezer_change_state(cgroup, goal_state); +	cgroup_unlock(); +	return retval; +} + +static struct cftype files[] = { +	{ +		.name = "state", +		.read_seq_string = freezer_read, +		.write_string = freezer_write, +	}, +}; + +static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) +{ +	return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); +} + +struct cgroup_subsys freezer_subsys = { +	.name		= "freezer", +	.create		= freezer_create, +	.destroy	= freezer_destroy, +	.populate	= freezer_populate, +	.subsys_id	= freezer_subsys_id, +	.can_attach	= freezer_can_attach, +	.attach		= NULL, +	.fork		= freezer_fork, +	.exit		= NULL, +}; diff --git a/kernel/compat.c b/kernel/compat.c index 143990e48cb..8eafe3eb50d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -23,6 +23,7 @@  #include <linux/timex.h>  #include <linux/migrate.h>  #include <linux/posix-timers.h> +#include <linux/times.h>  #include <asm/uaccess.h> @@ -208,49 +209,23 @@ asmlinkage long compat_sys_setitimer(int which,  	return 0;  } +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) +{ +	return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); +} +  asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)  { -	/* -	 *	In the SMP world we might just be unlucky and have one of -	 *	the times increment as we use it. Since the value is an -	 *	atomically safe type this is just fine. Conceptually its -	 *	as if the syscall took an instant longer to occur. 
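The freezer.state handling just above (freezer_write() -> freezer_change_state()) accepts only the strings THAWED and FROZEN, treats FREEZING purely as a reported intermediate state, and retries the freeze when userspace writes FROZEN again after an -EBUSY. A much-simplified, standalone C sketch of that state machine follows; try_freeze_all()/thaw_all() merely stand in for try_to_freeze_cgroup()/unfreeze_cgroup(), and the return values are placeholders.

#include <stdio.h>
#include <string.h>

enum freezer_state { CGROUP_THAWED, CGROUP_FREEZING, CGROUP_FROZEN };

static const char *state_strs[] = { "THAWED", "FREEZING", "FROZEN" };

/* Stand-in for try_to_freeze_cgroup(): pretend the first pass leaves one
 * task unfrozen, as the real code can, and that a retry completes it. */
static int try_freeze_all(enum freezer_state *state)
{
	static int attempts;

	if (attempts++ == 0) {
		*state = CGROUP_FREEZING;
		return -1;		/* the kernel returns -EBUSY here */
	}
	*state = CGROUP_FROZEN;
	return 0;
}

/* Stand-in for unfreeze_cgroup(). */
static int thaw_all(enum freezer_state *state)
{
	*state = CGROUP_THAWED;
	return 0;
}

/* Simplified mirror of freezer_change_state(): only THAWED and FROZEN are
 * valid goals; FREEZING is reported but never requested. */
static int change_state(enum freezer_state *state, const char *buf)
{
	enum freezer_state goal;

	if (strcmp(buf, "THAWED") == 0)
		goal = CGROUP_THAWED;
	else if (strcmp(buf, "FROZEN") == 0)
		goal = CGROUP_FROZEN;
	else
		return -2;		/* the kernel returns -EIO here */

	if (goal == *state)
		return 0;
	if (goal == CGROUP_FROZEN)
		return try_freeze_all(state);	/* also the FREEZING retry case */
	return thaw_all(state);			/* FREEZING/FROZEN -> THAWED */
}

int main(void)
{
	enum freezer_state s = CGROUP_THAWED;

	change_state(&s, "FROZEN");	/* may stop at FREEZING, like -EBUSY */
	printf("state: %s\n", state_strs[s]);
	change_state(&s, "FROZEN");	/* userspace retry */
	printf("state: %s\n", state_strs[s]);
	change_state(&s, "THAWED");
	printf("state: %s\n", state_strs[s]);
	return 0;
}

The real code additionally recomputes the state from the member tasks (update_freezer_state()) and holds cgroup_lock and freezer->lock with interrupts disabled around all of this.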
-	 */  	if (tbuf) { +		struct tms tms;  		struct compat_tms tmp; -		struct task_struct *tsk = current; -		struct task_struct *t; -		cputime_t utime, stime, cutime, cstime; - -		read_lock(&tasklist_lock); -		utime = tsk->signal->utime; -		stime = tsk->signal->stime; -		t = tsk; -		do { -			utime = cputime_add(utime, t->utime); -			stime = cputime_add(stime, t->stime); -			t = next_thread(t); -		} while (t != tsk); - -		/* -		 * While we have tasklist_lock read-locked, no dying thread -		 * can be updating current->signal->[us]time.  Instead, -		 * we got their counts included in the live thread loop. -		 * However, another thread can come in right now and -		 * do a wait call that updates current->signal->c[us]time. -		 * To make sure we always see that pair updated atomically, -		 * we take the siglock around fetching them. -		 */ -		spin_lock_irq(&tsk->sighand->siglock); -		cutime = tsk->signal->cutime; -		cstime = tsk->signal->cstime; -		spin_unlock_irq(&tsk->sighand->siglock); -		read_unlock(&tasklist_lock); -		tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); -		tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); -		tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); -		tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); +		do_sys_times(&tms); +		/* Convert our struct tms to the compat version. */ +		tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); +		tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); +		tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); +		tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);  		if (copy_to_user(tbuf, &tmp, sizeof(tmp)))  			return -EFAULT;  	} diff --git a/kernel/configs.c b/kernel/configs.c index 4c345210ed8..abaee684ecb 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -54,9 +54,6 @@  #ifdef CONFIG_IKCONFIG_PROC -/**************************************************/ -/* globals and useful constants                   */ -  static ssize_t  ikconfig_read_current(struct file *file, char __user *buf,  		      size_t len, loff_t * offset) @@ -71,9 +68,6 @@ static const struct file_operations ikconfig_file_ops = {  	.read = ikconfig_read_current,  }; -/***************************************************/ -/* ikconfig_init: start up everything we need to */ -  static int __init ikconfig_init(void)  {  	struct proc_dir_entry *entry; @@ -89,9 +83,6 @@ static int __init ikconfig_init(void)  	return 0;  } -/***************************************************/ -/* ikconfig_cleanup: clean up our mess           */ -  static void __exit ikconfig_cleanup(void)  {  	remove_proc_entry("config.gz", NULL); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index eab7bd6628e..3e00526f52e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1172,7 +1172,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,  {  	struct cpuset trialcs;  	int err; -	int cpus_nonempty, balance_flag_changed; +	int balance_flag_changed;  	trialcs = *cs;  	if (turning_on) @@ -1184,7 +1184,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,  	if (err < 0)  		return err; -	cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);  	balance_flag_changed = (is_sched_load_balance(cs) !=  		 			is_sched_load_balance(&trialcs)); @@ -1192,7 +1191,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,  	cs->flags = trialcs.flags;  	mutex_unlock(&callback_mutex); -	if (cpus_nonempty && balance_flag_changed) +	if (!cpus_empty(trialcs.cpus_allowed) && 
balance_flag_changed)  		async_rebuild_sched_domains();  	return 0; @@ -2437,19 +2436,15 @@ const struct file_operations proc_cpuset_operations = {  void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)  {  	seq_printf(m, "Cpus_allowed:\t"); -	m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, -					task->cpus_allowed); +	seq_cpumask(m, &task->cpus_allowed);  	seq_printf(m, "\n");  	seq_printf(m, "Cpus_allowed_list:\t"); -	m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, -					task->cpus_allowed); +	seq_cpumask_list(m, &task->cpus_allowed);  	seq_printf(m, "\n");  	seq_printf(m, "Mems_allowed:\t"); -	m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, -					task->mems_allowed); +	seq_nodemask(m, &task->mems_allowed);  	seq_printf(m, "\n");  	seq_printf(m, "Mems_allowed_list:\t"); -	m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, -					task->mems_allowed); +	seq_nodemask_list(m, &task->mems_allowed);  	seq_printf(m, "\n");  } diff --git a/kernel/exit.c b/kernel/exit.c index 0ef4673e351..80137a5d946 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,6 +47,7 @@  #include <linux/blkdev.h>  #include <linux/task_io_accounting_ops.h>  #include <linux/tracehook.h> +#include <trace/sched.h>  #include <asm/uaccess.h>  #include <asm/unistd.h> @@ -112,8 +113,6 @@ static void __exit_signal(struct task_struct *tsk)  		 * We won't ever get here for the group leader, since it  		 * will have been the last reference on the signal_struct.  		 */ -		sig->utime = cputime_add(sig->utime, task_utime(tsk)); -		sig->stime = cputime_add(sig->stime, task_stime(tsk));  		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));  		sig->min_flt += tsk->min_flt;  		sig->maj_flt += tsk->maj_flt; @@ -122,7 +121,6 @@ static void __exit_signal(struct task_struct *tsk)  		sig->inblock += task_io_get_inblock(tsk);  		sig->oublock += task_io_get_oublock(tsk);  		task_io_accounting_add(&sig->ioac, &tsk->ioac); -		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;  		sig = NULL; /* Marker for below. */  	} @@ -149,7 +147,10 @@ static void __exit_signal(struct task_struct *tsk)  static void delayed_put_task_struct(struct rcu_head *rhp)  { -	put_task_struct(container_of(rhp, struct task_struct, rcu)); +	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + +	trace_sched_process_free(tsk); +	put_task_struct(tsk);  } @@ -1073,6 +1074,8 @@ NORET_TYPE void do_exit(long code)  	if (group_dead)  		acct_process(); +	trace_sched_process_exit(tsk); +  	exit_sem(tsk);  	exit_files(tsk);  	exit_fs(tsk); @@ -1301,6 +1304,7 @@ static int wait_task_zombie(struct task_struct *p, int options,  	if (likely(!traced)) {  		struct signal_struct *psig;  		struct signal_struct *sig; +		struct task_cputime cputime;  		/*  		 * The resource counters for the group leader are in its @@ -1316,20 +1320,23 @@ static int wait_task_zombie(struct task_struct *p, int options,  		 * need to protect the access to p->parent->signal fields,  		 * as other threads in the parent group can be right  		 * here reaping other children at the same time. +		 * +		 * We use thread_group_cputime() to get times for the thread +		 * group, which consolidates times for all threads in the +		 * group including the group leader.  		 
*/  		spin_lock_irq(&p->parent->sighand->siglock);  		psig = p->parent->signal;  		sig = p->signal; +		thread_group_cputime(p, &cputime);  		psig->cutime =  			cputime_add(psig->cutime, -			cputime_add(p->utime, -			cputime_add(sig->utime, -				    sig->cutime))); +			cputime_add(cputime.utime, +				    sig->cutime));  		psig->cstime =  			cputime_add(psig->cstime, -			cputime_add(p->stime, -			cputime_add(sig->stime, -				    sig->cstime))); +			cputime_add(cputime.stime, +				    sig->cstime));  		psig->cgtime =  			cputime_add(psig->cgtime,  			cputime_add(p->gtime, @@ -1674,6 +1681,8 @@ static long do_wait(enum pid_type type, struct pid *pid, int options,  	struct task_struct *tsk;  	int retval; +	trace_sched_process_wait(pid); +  	add_wait_queue(¤t->signal->wait_chldexit,&wait);  repeat:  	/* diff --git a/kernel/fork.c b/kernel/fork.c index 37b3e150ae3..f6083561dfe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -58,6 +58,7 @@  #include <linux/tty.h>  #include <linux/proc_fs.h>  #include <linux/blkdev.h> +#include <trace/sched.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h> @@ -759,15 +760,44 @@ void __cleanup_sighand(struct sighand_struct *sighand)  		kmem_cache_free(sighand_cachep, sighand);  } + +/* + * Initialize POSIX timer handling for a thread group. + */ +static void posix_cpu_timers_init_group(struct signal_struct *sig) +{ +	/* Thread group counters. */ +	thread_group_cputime_init(sig); + +	/* Expiration times and increments. */ +	sig->it_virt_expires = cputime_zero; +	sig->it_virt_incr = cputime_zero; +	sig->it_prof_expires = cputime_zero; +	sig->it_prof_incr = cputime_zero; + +	/* Cached expiration times. */ +	sig->cputime_expires.prof_exp = cputime_zero; +	sig->cputime_expires.virt_exp = cputime_zero; +	sig->cputime_expires.sched_exp = 0; + +	/* The timer lists. 
*/ +	INIT_LIST_HEAD(&sig->cpu_timers[0]); +	INIT_LIST_HEAD(&sig->cpu_timers[1]); +	INIT_LIST_HEAD(&sig->cpu_timers[2]); +} +  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  {  	struct signal_struct *sig;  	int ret;  	if (clone_flags & CLONE_THREAD) { -		atomic_inc(¤t->signal->count); -		atomic_inc(¤t->signal->live); -		return 0; +		ret = thread_group_cputime_clone_thread(current); +		if (likely(!ret)) { +			atomic_inc(¤t->signal->count); +			atomic_inc(¤t->signal->live); +		} +		return ret;  	}  	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);  	tsk->signal = sig; @@ -795,40 +825,25 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  	sig->it_real_incr.tv64 = 0;  	sig->real_timer.function = it_real_fn; -	sig->it_virt_expires = cputime_zero; -	sig->it_virt_incr = cputime_zero; -	sig->it_prof_expires = cputime_zero; -	sig->it_prof_incr = cputime_zero; -  	sig->leader = 0;	/* session leadership doesn't inherit */  	sig->tty_old_pgrp = NULL;  	sig->tty = NULL; -	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; +	sig->cutime = sig->cstime = cputime_zero;  	sig->gtime = cputime_zero;  	sig->cgtime = cputime_zero;  	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;  	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;  	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;  	task_io_accounting_init(&sig->ioac); -	sig->sum_sched_runtime = 0; -	INIT_LIST_HEAD(&sig->cpu_timers[0]); -	INIT_LIST_HEAD(&sig->cpu_timers[1]); -	INIT_LIST_HEAD(&sig->cpu_timers[2]);  	taskstats_tgid_init(sig);  	task_lock(current->group_leader);  	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);  	task_unlock(current->group_leader); -	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { -		/* -		 * New sole thread in the process gets an expiry time -		 * of the whole CPU time limit. -		 */ -		tsk->it_prof_expires = -			secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); -	} +	posix_cpu_timers_init_group(sig); +  	acct_init_pacct(&sig->pacct);  	tty_audit_fork(sig); @@ -838,6 +853,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  void __cleanup_signal(struct signal_struct *sig)  { +	thread_group_cputime_free(sig);  	exit_thread_group_keys(sig);  	tty_kref_put(sig->tty);  	kmem_cache_free(signal_cachep, sig); @@ -888,6 +904,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)  #endif /* CONFIG_MM_OWNER */  /* + * Initialize POSIX timer handling for a single task. + */ +static void posix_cpu_timers_init(struct task_struct *tsk) +{ +	tsk->cputime_expires.prof_exp = cputime_zero; +	tsk->cputime_expires.virt_exp = cputime_zero; +	tsk->cputime_expires.sched_exp = 0; +	INIT_LIST_HEAD(&tsk->cpu_timers[0]); +	INIT_LIST_HEAD(&tsk->cpu_timers[1]); +	INIT_LIST_HEAD(&tsk->cpu_timers[2]); +} + +/*   * This creates a new process as a copy of the old one,   * but does not actually start it yet.   
* @@ -999,12 +1028,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	task_io_accounting_init(&p->ioac);  	acct_clear_integrals(p); -	p->it_virt_expires = cputime_zero; -	p->it_prof_expires = cputime_zero; -	p->it_sched_expires = 0; -	INIT_LIST_HEAD(&p->cpu_timers[0]); -	INIT_LIST_HEAD(&p->cpu_timers[1]); -	INIT_LIST_HEAD(&p->cpu_timers[2]); +	posix_cpu_timers_init(p);  	p->lock_depth = -1;		/* -1 = no lock */  	do_posix_clock_monotonic_gettime(&p->start_time); @@ -1205,21 +1229,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	if (clone_flags & CLONE_THREAD) {  		p->group_leader = current->group_leader;  		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); - -		if (!cputime_eq(current->signal->it_virt_expires, -				cputime_zero) || -		    !cputime_eq(current->signal->it_prof_expires, -				cputime_zero) || -		    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || -		    !list_empty(¤t->signal->cpu_timers[0]) || -		    !list_empty(¤t->signal->cpu_timers[1]) || -		    !list_empty(¤t->signal->cpu_timers[2])) { -			/* -			 * Have child wake up on its first tick to check -			 * for process CPU timers. -			 */ -			p->it_prof_expires = jiffies_to_cputime(1); -		}  	}  	if (likely(p->pid)) { @@ -1366,6 +1375,8 @@ long do_fork(unsigned long clone_flags,  	if (!IS_ERR(p)) {  		struct completion vfork; +		trace_sched_process_fork(current, p); +  		nr = task_pid_vnr(p);  		if (clone_flags & CLONE_PARENT_SETTID) diff --git a/kernel/freezer.c b/kernel/freezer.c new file mode 100644 index 00000000000..ba6248b323e --- /dev/null +++ b/kernel/freezer.c @@ -0,0 +1,154 @@ +/* + * kernel/freezer.c - Function to freeze a process + * + * Originally from kernel/power/process.c + */ + +#include <linux/interrupt.h> +#include <linux/suspend.h> +#include <linux/module.h> +#include <linux/syscalls.h> +#include <linux/freezer.h> + +/* + * freezing is complete, mark current process as frozen + */ +static inline void frozen_process(void) +{ +	if (!unlikely(current->flags & PF_NOFREEZE)) { +		current->flags |= PF_FROZEN; +		wmb(); +	} +	clear_freeze_flag(current); +} + +/* Refrigerator is place where frozen processes are stored :-). */ +void refrigerator(void) +{ +	/* Hmm, should we be allowed to suspend when there are realtime +	   processes around? 
*/ +	long save; + +	task_lock(current); +	if (freezing(current)) { +		frozen_process(); +		task_unlock(current); +	} else { +		task_unlock(current); +		return; +	} +	save = current->state; +	pr_debug("%s entered refrigerator\n", current->comm); + +	spin_lock_irq(¤t->sighand->siglock); +	recalc_sigpending(); /* We sent fake signal, clean it up */ +	spin_unlock_irq(¤t->sighand->siglock); + +	for (;;) { +		set_current_state(TASK_UNINTERRUPTIBLE); +		if (!frozen(current)) +			break; +		schedule(); +	} +	pr_debug("%s left refrigerator\n", current->comm); +	__set_current_state(save); +} +EXPORT_SYMBOL(refrigerator); + +static void fake_signal_wake_up(struct task_struct *p) +{ +	unsigned long flags; + +	spin_lock_irqsave(&p->sighand->siglock, flags); +	signal_wake_up(p, 0); +	spin_unlock_irqrestore(&p->sighand->siglock, flags); +} + +/** + *	freeze_task - send a freeze request to given task + *	@p: task to send the request to + *	@sig_only: if set, the request will only be sent if the task has the + *		PF_FREEZER_NOSIG flag unset + *	Return value: 'false', if @sig_only is set and the task has + *		PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise + * + *	The freeze request is sent by setting the tasks's TIF_FREEZE flag and + *	either sending a fake signal to it or waking it up, depending on whether + *	or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task + *	has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its + *	TIF_FREEZE flag will not be set. + */ +bool freeze_task(struct task_struct *p, bool sig_only) +{ +	/* +	 * We first check if the task is freezing and next if it has already +	 * been frozen to avoid the race with frozen_process() which first marks +	 * the task as frozen and next clears its TIF_FREEZE. +	 */ +	if (!freezing(p)) { +		rmb(); +		if (frozen(p)) +			return false; + +		if (!sig_only || should_send_signal(p)) +			set_freeze_flag(p); +		else +			return false; +	} + +	if (should_send_signal(p)) { +		if (!signal_pending(p)) +			fake_signal_wake_up(p); +	} else if (sig_only) { +		return false; +	} else { +		wake_up_state(p, TASK_INTERRUPTIBLE); +	} + +	return true; +} + +void cancel_freezing(struct task_struct *p) +{ +	unsigned long flags; + +	if (freezing(p)) { +		pr_debug("  clean up: %s\n", p->comm); +		clear_freeze_flag(p); +		spin_lock_irqsave(&p->sighand->siglock, flags); +		recalc_sigpending_and_wake(p); +		spin_unlock_irqrestore(&p->sighand->siglock, flags); +	} +} + +/* + * Wake up a frozen process + * + * task_lock() is needed to prevent the race with refrigerator() which may + * occur if the freezing of tasks fails.  Namely, without the lock, if the + * freezing of tasks failed, thaw_tasks() might have run before a task in + * refrigerator() could call frozen_process(), in which case the task would be + * frozen and no one would thaw it. 
+ */ +int __thaw_process(struct task_struct *p) +{ +	if (frozen(p)) { +		p->flags &= ~PF_FROZEN; +		return 1; +	} +	clear_freeze_flag(p); +	return 0; +} + +int thaw_process(struct task_struct *p) +{ +	task_lock(p); +	if (__thaw_process(p) == 1) { +		task_unlock(p); +		wake_up_process(p); +		return 1; +	} +	task_unlock(p); +	return 0; +} +EXPORT_SYMBOL(thaw_process); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 4fc41414fc0..2b465dfde42 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1461,9 +1461,7 @@ void hrtimer_run_queues(void)  		if (!base->first)  			continue; -		if (base->get_softirq_time) -			base->softirq_time = base->get_softirq_time(); -		else if (gettime) { +		if (gettime) {  			hrtimer_get_softirq_time(cpu_base);  			gettime = 0;  		} @@ -1752,9 +1750,11 @@ static void migrate_hrtimers(int cpu)  	new_base = &get_cpu_var(hrtimer_bases);  	tick_cancel_sched_timer(cpu); - -	local_irq_disable(); -	spin_lock(&new_base->lock); +	/* +	 * The caller is globally serialized and nobody else +	 * takes two locks at once, deadlock is not possible. +	 */ +	spin_lock_irq(&new_base->lock);  	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);  	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1767,8 +1767,7 @@ static void migrate_hrtimers(int cpu)  		raise = 1;  	spin_unlock(&old_base->lock); -	spin_unlock(&new_base->lock); -	local_irq_enable(); +	spin_unlock_irq(&new_base->lock);  	put_cpu_var(hrtimer_bases);  	if (raise) diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 533068cfb60..cc0f7321b8c 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -30,17 +30,16 @@ static DEFINE_MUTEX(probing_active);  unsigned long probe_irq_on(void)  {  	struct irq_desc *desc; -	unsigned long mask; -	unsigned int i; +	unsigned long mask = 0; +	unsigned int status; +	int i;  	mutex_lock(&probing_active);  	/*  	 * something may have generated an irq long ago and we want to  	 * flush such a longstanding irq before considering it as spurious.  	 
*/ -	for (i = NR_IRQS-1; i > 0; i--) { -		desc = irq_desc + i; - +	for_each_irq_desc_reverse(i, desc) {  		spin_lock_irq(&desc->lock);  		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {  			/* @@ -68,9 +67,7 @@ unsigned long probe_irq_on(void)  	 * (we must startup again here because if a longstanding irq  	 * happened in the previous stage, it may have masked itself)  	 */ -	for (i = NR_IRQS-1; i > 0; i--) { -		desc = irq_desc + i; - +	for_each_irq_desc_reverse(i, desc) {  		spin_lock_irq(&desc->lock);  		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {  			desc->status |= IRQ_AUTODETECT | IRQ_WAITING; @@ -88,11 +85,7 @@ unsigned long probe_irq_on(void)  	/*  	 * Now filter out any obviously spurious interrupts  	 */ -	mask = 0; -	for (i = 0; i < NR_IRQS; i++) { -		unsigned int status; - -		desc = irq_desc + i; +	for_each_irq_desc(i, desc) {  		spin_lock_irq(&desc->lock);  		status = desc->status; @@ -126,14 +119,11 @@ EXPORT_SYMBOL(probe_irq_on);   */  unsigned int probe_irq_mask(unsigned long val)  { -	unsigned int mask; +	unsigned int status, mask = 0; +	struct irq_desc *desc;  	int i; -	mask = 0; -	for (i = 0; i < NR_IRQS; i++) { -		struct irq_desc *desc = irq_desc + i; -		unsigned int status; - +	for_each_irq_desc(i, desc) {  		spin_lock_irq(&desc->lock);  		status = desc->status; @@ -171,20 +161,19 @@ EXPORT_SYMBOL(probe_irq_mask);   */  int probe_irq_off(unsigned long val)  { -	int i, irq_found = 0, nr_irqs = 0; - -	for (i = 0; i < NR_IRQS; i++) { -		struct irq_desc *desc = irq_desc + i; -		unsigned int status; +	int i, irq_found = 0, nr_of_irqs = 0; +	struct irq_desc *desc; +	unsigned int status; +	for_each_irq_desc(i, desc) {  		spin_lock_irq(&desc->lock);  		status = desc->status;  		if (status & IRQ_AUTODETECT) {  			if (!(status & IRQ_WAITING)) { -				if (!nr_irqs) +				if (!nr_of_irqs)  					irq_found = i; -				nr_irqs++; +				nr_of_irqs++;  			}  			desc->status = status & ~IRQ_AUTODETECT;  			desc->chip->shutdown(i); @@ -193,7 +182,7 @@ int probe_irq_off(unsigned long val)  	}  	mutex_unlock(&probing_active); -	if (nr_irqs > 1) +	if (nr_of_irqs > 1)  		irq_found = -irq_found;  	return irq_found; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3cd441ebf5d..4895fde4eb9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -24,16 +24,15 @@   */  void dynamic_irq_init(unsigned int irq)  { -	struct irq_desc *desc; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; -	if (irq >= NR_IRQS) { +	if (!desc) {  		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);  		return;  	}  	/* Ensure we don't have left over values from a previous use of this irq */ -	desc = irq_desc + irq;  	spin_lock_irqsave(&desc->lock, flags);  	desc->status = IRQ_DISABLED;  	desc->chip = &no_irq_chip; @@ -57,15 +56,14 @@ void dynamic_irq_init(unsigned int irq)   */  void dynamic_irq_cleanup(unsigned int irq)  { -	struct irq_desc *desc; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; -	if (irq >= NR_IRQS) { +	if (!desc) {  		WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);  		return;  	} -	desc = irq_desc + irq;  	spin_lock_irqsave(&desc->lock, flags);  	if (desc->action) {  		spin_unlock_irqrestore(&desc->lock, flags); @@ -89,10 +87,10 @@ void dynamic_irq_cleanup(unsigned int irq)   */  int set_irq_chip(unsigned int irq, struct irq_chip *chip)  { -	struct irq_desc *desc; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; -	if (irq >= NR_IRQS) { +	if (!desc) {  		WARN(1, KERN_ERR "Trying to install chip for 
IRQ%d\n", irq);  		return -EINVAL;  	} @@ -100,7 +98,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)  	if (!chip)  		chip = &no_irq_chip; -	desc = irq_desc + irq;  	spin_lock_irqsave(&desc->lock, flags);  	irq_chip_set_defaults(chip);  	desc->chip = chip; @@ -111,27 +108,27 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)  EXPORT_SYMBOL(set_irq_chip);  /** - *	set_irq_type - set the irq type for an irq + *	set_irq_type - set the irq trigger type for an irq   *	@irq:	irq number - *	@type:	interrupt type - see include/linux/interrupt.h + *	@type:	IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h   */  int set_irq_type(unsigned int irq, unsigned int type)  { -	struct irq_desc *desc; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags;  	int ret = -ENXIO; -	if (irq >= NR_IRQS) { +	if (!desc) {  		printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);  		return -ENODEV;  	} -	desc = irq_desc + irq; -	if (desc->chip->set_type) { -		spin_lock_irqsave(&desc->lock, flags); -		ret = desc->chip->set_type(irq, type); -		spin_unlock_irqrestore(&desc->lock, flags); -	} +	if (type == IRQ_TYPE_NONE) +		return 0; + +	spin_lock_irqsave(&desc->lock, flags); +	ret = __irq_set_trigger(desc, irq, flags); +	spin_unlock_irqrestore(&desc->lock, flags);  	return ret;  }  EXPORT_SYMBOL(set_irq_type); @@ -145,16 +142,15 @@ EXPORT_SYMBOL(set_irq_type);   */  int set_irq_data(unsigned int irq, void *data)  { -	struct irq_desc *desc; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; -	if (irq >= NR_IRQS) { +	if (!desc) {  		printk(KERN_ERR  		       "Trying to install controller data for IRQ%d\n", irq);  		return -EINVAL;  	} -	desc = irq_desc + irq;  	spin_lock_irqsave(&desc->lock, flags);  	desc->handler_data = data;  	spin_unlock_irqrestore(&desc->lock, flags); @@ -171,15 +167,15 @@ EXPORT_SYMBOL(set_irq_data);   */  int set_irq_msi(unsigned int irq, struct msi_desc *entry)  { -	struct irq_desc *desc; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; -	if (irq >= NR_IRQS) { +	if (!desc) {  		printk(KERN_ERR  		       "Trying to install msi data for IRQ%d\n", irq);  		return -EINVAL;  	} -	desc = irq_desc + irq; +  	spin_lock_irqsave(&desc->lock, flags);  	desc->msi_desc = entry;  	if (entry) @@ -197,10 +193,16 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)   */  int set_irq_chip_data(unsigned int irq, void *data)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; -	if (irq >= NR_IRQS || !desc->chip) { +	if (!desc) { +		printk(KERN_ERR +		       "Trying to install chip data for IRQ%d\n", irq); +		return -EINVAL; +	} + +	if (!desc->chip) {  		printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);  		return -EINVAL;  	} @@ -218,7 +220,7 @@ EXPORT_SYMBOL(set_irq_chip_data);   */  static void default_enable(unsigned int irq)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	desc->chip->unmask(irq);  	desc->status &= ~IRQ_MASKED; @@ -236,8 +238,9 @@ static void default_disable(unsigned int irq)   */  static unsigned int default_startup(unsigned int irq)  { -	irq_desc[irq].chip->enable(irq); +	struct irq_desc *desc = irq_to_desc(irq); +	desc->chip->enable(irq);  	return 0;  } @@ -246,7 +249,7 @@ static unsigned int default_startup(unsigned int irq)   */  static void default_shutdown(unsigned int irq)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	desc->chip->mask(irq);  	
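A recurring change in the irq/chip.c hunks above is replacing the open-coded "irq >= NR_IRQS" range check plus "irq_desc + irq" pointer arithmetic with irq_to_desc(), which returns NULL for numbers that have no descriptor. For the flat, statically sized array this file still uses, the lookup amounts to roughly the sketch below; the stand-in definitions are illustrative rather than the kernel's own, and sparse-irq configurations implement it differently.

#include <stddef.h>

#define NR_IRQS 256			/* arch-dependent in the real kernel */

struct irq_desc {
	unsigned int status;
	/* ... handler, chip, affinity, ... */
};

static struct irq_desc irq_desc[NR_IRQS];
static int nr_irqs = NR_IRQS;		/* mirrors the new global in handle.c */

/* NULL for out-of-range numbers, so callers test the pointer instead of
 * comparing against NR_IRQS at every call site. */
static inline struct irq_desc *irq_to_desc(unsigned int irq)
{
	return irq < (unsigned int)nr_irqs ? irq_desc + irq : NULL;
}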
desc->status |= IRQ_MASKED; @@ -305,14 +308,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)  {  	struct irqaction *action;  	irqreturn_t action_ret; -	const unsigned int cpu = smp_processor_id();  	spin_lock(&desc->lock);  	if (unlikely(desc->status & IRQ_INPROGRESS))  		goto out_unlock;  	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -	kstat_cpu(cpu).irqs[irq]++; +	kstat_incr_irqs_this_cpu(irq, desc);  	action = desc->action;  	if (unlikely(!action || (desc->status & IRQ_DISABLED))) @@ -344,7 +346,6 @@ out_unlock:  void  handle_level_irq(unsigned int irq, struct irq_desc *desc)  { -	unsigned int cpu = smp_processor_id();  	struct irqaction *action;  	irqreturn_t action_ret; @@ -354,7 +355,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)  	if (unlikely(desc->status & IRQ_INPROGRESS))  		goto out_unlock;  	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -	kstat_cpu(cpu).irqs[irq]++; +	kstat_incr_irqs_this_cpu(irq, desc);  	/*  	 * If its disabled or no action available @@ -392,7 +393,6 @@ out_unlock:  void  handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)  { -	unsigned int cpu = smp_processor_id();  	struct irqaction *action;  	irqreturn_t action_ret; @@ -402,7 +402,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)  		goto out;  	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -	kstat_cpu(cpu).irqs[irq]++; +	kstat_incr_irqs_this_cpu(irq, desc);  	/*  	 * If its disabled or no action available @@ -451,8 +451,6 @@ out:  void  handle_edge_irq(unsigned int irq, struct irq_desc *desc)  { -	const unsigned int cpu = smp_processor_id(); -  	spin_lock(&desc->lock);  	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); @@ -468,8 +466,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)  		mask_ack_irq(desc, irq);  		goto out_unlock;  	} - -	kstat_cpu(cpu).irqs[irq]++; +	kstat_incr_irqs_this_cpu(irq, desc);  	/* Start handling the irq */  	desc->chip->ack(irq); @@ -524,7 +521,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)  {  	irqreturn_t action_ret; -	kstat_this_cpu.irqs[irq]++; +	kstat_incr_irqs_this_cpu(irq, desc);  	if (desc->chip->ack)  		desc->chip->ack(irq); @@ -541,17 +538,15 @@ void  __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,  		  const char *name)  { -	struct irq_desc *desc; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; -	if (irq >= NR_IRQS) { +	if (!desc) {  		printk(KERN_ERR  		       "Trying to install type control for IRQ%d\n", irq);  		return;  	} -	desc = irq_desc + irq; -  	if (!handle)  		handle = handle_bad_irq;  	else if (desc->chip == &no_irq_chip) { @@ -583,7 +578,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,  		desc->status &= ~IRQ_DISABLED;  		desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;  		desc->depth = 0; -		desc->chip->unmask(irq); +		desc->chip->startup(irq);  	}  	spin_unlock_irqrestore(&desc->lock, flags);  } @@ -606,17 +601,14 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,  void __init set_irq_noprobe(unsigned int irq)  { -	struct irq_desc *desc; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; -	if (irq >= NR_IRQS) { +	if (!desc) {  		printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); -  		return;  	} -	desc = irq_desc + irq; -  	spin_lock_irqsave(&desc->lock, flags);  	desc->status |= IRQ_NOPROBE;  	spin_unlock_irqrestore(&desc->lock, flags); @@ -624,17 +616,14 @@ void __init set_irq_noprobe(unsigned int irq)  void __init set_irq_probe(unsigned int irq) 
 { -	struct irq_desc *desc; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; -	if (irq >= NR_IRQS) { +	if (!desc) {  		printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); -  		return;  	} -	desc = irq_desc + irq; -  	spin_lock_irqsave(&desc->lock, flags);  	desc->status &= ~IRQ_NOPROBE;  	spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 5fa6198e913..c815b42d0f5 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -25,11 +25,10 @@   *   * Handles spurious and unhandled IRQ's. It also prints a debugmessage.   */ -void -handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(unsigned int irq, struct irq_desc *desc)  {  	print_irq_desc(irq, desc); -	kstat_this_cpu.irqs[irq]++; +	kstat_incr_irqs_this_cpu(irq, desc);  	ack_bad_irq(irq);  } @@ -47,6 +46,9 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)   *   * Controller mappings for all interrupt sources:   */ +int nr_irqs = NR_IRQS; +EXPORT_SYMBOL_GPL(nr_irqs); +  struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {  	[0 ... NR_IRQS-1] = {  		.status = IRQ_DISABLED, @@ -66,7 +68,9 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {   */  static void ack_bad(unsigned int irq)  { -	print_irq_desc(irq, irq_desc + irq); +	struct irq_desc *desc = irq_to_desc(irq); + +	print_irq_desc(irq, desc);  	ack_bad_irq(irq);  } @@ -131,8 +135,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)  	irqreturn_t ret, retval = IRQ_NONE;  	unsigned int status = 0; -	handle_dynamic_tick(action); -  	if (!(action->flags & IRQF_DISABLED))  		local_irq_enable_in_hardirq(); @@ -165,11 +167,12 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)   */  unsigned int __do_IRQ(unsigned int irq)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	struct irqaction *action;  	unsigned int status; -	kstat_this_cpu.irqs[irq]++; +	kstat_incr_irqs_this_cpu(irq, desc); +  	if (CHECK_IRQ_PER_CPU(desc->status)) {  		irqreturn_t action_ret; @@ -256,8 +259,8 @@ out:  }  #endif -#ifdef CONFIG_TRACE_IRQFLAGS +#ifdef CONFIG_TRACE_IRQFLAGS  /*   * lockdep: we want to handle all irq_desc locks as a single lock-class:   */ @@ -265,10 +268,10 @@ static struct lock_class_key irq_desc_lock_class;  void early_init_irq_lock_class(void)  { +	struct irq_desc *desc;  	int i; -	for (i = 0; i < NR_IRQS; i++) -		lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); +	for_each_irq_desc(i, desc) +		lockdep_set_class(&desc->lock, &irq_desc_lock_class);  } -  #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 08a849a2244..c9767e64198 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -10,12 +10,15 @@ extern void irq_chip_set_defaults(struct irq_chip *chip);  /* Set default handler: */  extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); +extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, +		unsigned long flags); +  #ifdef CONFIG_PROC_FS -extern void register_irq_proc(unsigned int irq); +extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);  extern void register_handler_proc(unsigned int irq, struct irqaction *action);  extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);  #else -static inline void register_irq_proc(unsigned int irq) { } +static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }  static inline void 
register_handler_proc(unsigned int irq,  					 struct irqaction *action) { }  static inline void unregister_handler_proc(unsigned int irq, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 60c49e32439..c498a1b8c62 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -31,10 +31,10 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL;   */  void synchronize_irq(unsigned int irq)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned int status; -	if (irq >= NR_IRQS) +	if (!desc)  		return;  	do { @@ -64,7 +64,7 @@ EXPORT_SYMBOL(synchronize_irq);   */  int irq_can_set_affinity(unsigned int irq)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||  	    !desc->chip->set_affinity) @@ -81,18 +81,17 @@ int irq_can_set_affinity(unsigned int irq)   */  int irq_set_affinity(unsigned int irq, cpumask_t cpumask)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	if (!desc->chip->set_affinity)  		return -EINVAL; -	set_balance_irq_affinity(irq, cpumask); -  #ifdef CONFIG_GENERIC_PENDING_IRQ -	if (desc->status & IRQ_MOVE_PCNTXT) { +	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {  		unsigned long flags;  		spin_lock_irqsave(&desc->lock, flags); +		desc->affinity = cpumask;  		desc->chip->set_affinity(irq, cpumask);  		spin_unlock_irqrestore(&desc->lock, flags);  	} else @@ -111,16 +110,17 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)  int irq_select_affinity(unsigned int irq)  {  	cpumask_t mask; +	struct irq_desc *desc;  	if (!irq_can_set_affinity(irq))  		return 0;  	cpus_and(mask, cpu_online_map, irq_default_affinity); -	irq_desc[irq].affinity = mask; -	irq_desc[irq].chip->set_affinity(irq, mask); +	desc = irq_to_desc(irq); +	desc->affinity = mask; +	desc->chip->set_affinity(irq, mask); -	set_balance_irq_affinity(irq, mask);  	return 0;  }  #endif @@ -140,10 +140,10 @@ int irq_select_affinity(unsigned int irq)   */  void disable_irq_nosync(unsigned int irq)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; -	if (irq >= NR_IRQS) +	if (!desc)  		return;  	spin_lock_irqsave(&desc->lock, flags); @@ -169,9 +169,9 @@ EXPORT_SYMBOL(disable_irq_nosync);   */  void disable_irq(unsigned int irq)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq); -	if (irq >= NR_IRQS) +	if (!desc)  		return;  	disable_irq_nosync(irq); @@ -211,10 +211,10 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)   */  void enable_irq(unsigned int irq)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; -	if (irq >= NR_IRQS) +	if (!desc)  		return;  	spin_lock_irqsave(&desc->lock, flags); @@ -223,9 +223,9 @@ void enable_irq(unsigned int irq)  }  EXPORT_SYMBOL(enable_irq); -int set_irq_wake_real(unsigned int irq, unsigned int on) +static int set_irq_wake_real(unsigned int irq, unsigned int on)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	int ret = -ENXIO;  	if (desc->chip->set_wake) @@ -248,7 +248,7 @@ int set_irq_wake_real(unsigned int irq, unsigned int on)   */  int set_irq_wake(unsigned int irq, unsigned int on)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags;  	int ret = 0; @@ -288,12 +288,16 @@ EXPORT_SYMBOL(set_irq_wake);   */  
int can_request_irq(unsigned int irq, unsigned long irqflags)  { +	struct irq_desc *desc = irq_to_desc(irq);  	struct irqaction *action; -	if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) +	if (!desc) +		return 0; + +	if (desc->status & IRQ_NOREQUEST)  		return 0; -	action = irq_desc[irq].action; +	action = desc->action;  	if (action)  		if (irqflags & action->flags & IRQF_SHARED)  			action = NULL; @@ -312,10 +316,11 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)  		desc->handle_irq = NULL;  } -static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, +int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,  		unsigned long flags)  {  	int ret; +	struct irq_chip *chip = desc->chip;  	if (!chip || !chip->set_type) {  		/* @@ -333,6 +338,11 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,  		pr_err("setting trigger mode %d for irq %u failed (%pF)\n",  				(int)(flags & IRQF_TRIGGER_MASK),  				irq, chip->set_type); +	else { +		/* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ +		desc->status &= ~IRQ_TYPE_SENSE_MASK; +		desc->status |= flags & IRQ_TYPE_SENSE_MASK; +	}  	return ret;  } @@ -341,16 +351,16 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,   * Internal function to register an irqaction - typically used to   * allocate special interrupts that are part of the architecture.   */ -int setup_irq(unsigned int irq, struct irqaction *new) +static int +__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)  { -	struct irq_desc *desc = irq_desc + irq;  	struct irqaction *old, **p;  	const char *old_name = NULL;  	unsigned long flags;  	int shared = 0;  	int ret; -	if (irq >= NR_IRQS) +	if (!desc)  		return -EINVAL;  	if (desc->chip == &no_irq_chip) @@ -411,7 +421,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)  		/* Setup the type (level, edge polarity) if configured: */  		if (new->flags & IRQF_TRIGGER_MASK) { -			ret = __irq_set_trigger(desc->chip, irq, new->flags); +			ret = __irq_set_trigger(desc, irq, new->flags);  			if (ret) {  				spin_unlock_irqrestore(&desc->lock, flags); @@ -430,16 +440,21 @@ int setup_irq(unsigned int irq, struct irqaction *new)  		if (!(desc->status & IRQ_NOAUTOEN)) {  			desc->depth = 0;  			desc->status &= ~IRQ_DISABLED; -			if (desc->chip->startup) -				desc->chip->startup(irq); -			else -				desc->chip->enable(irq); +			desc->chip->startup(irq);  		} else  			/* Undo nested disables: */  			desc->depth = 1;  		/* Set default affinity mask once everything is setup */  		irq_select_affinity(irq); + +	} else if ((new->flags & IRQF_TRIGGER_MASK) +			&& (new->flags & IRQF_TRIGGER_MASK) +				!= (desc->status & IRQ_TYPE_SENSE_MASK)) { +		/* hope the handler works with the actual trigger mode... */ +		pr_warning("IRQ %d uses trigger mode %d; requested %d\n", +				irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), +				(int)(new->flags & IRQF_TRIGGER_MASK));  	}  	*p = new; @@ -464,7 +479,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)  	spin_unlock_irqrestore(&desc->lock, flags);  	new->irq = irq; -	register_irq_proc(irq); +	register_irq_proc(irq, desc);  	new->dir = NULL;  	register_handler_proc(irq, new); @@ -484,6 +499,20 @@ mismatch:  }  /** + *	setup_irq - setup an interrupt + *	@irq: Interrupt line to setup + *	@act: irqaction for the interrupt + * + * Used to statically setup interrupts in the early boot process. 
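Annotation (not part of the patch): __irq_set_trigger() now takes the descriptor and, on a successful chip->set_type(), latches the programmed sense into desc->status (the comment above notes that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK); the new pr_warning() in __setup_irq() compares a later sharer's requested trigger against that cached value. A hedged driver-side sketch; the button_* names are hypothetical, request_irq() and the flags are real.

#include <linux/interrupt.h>

static irqreturn_t button_isr(int irq, void *dev_id);	/* hypothetical handler */

static int button_setup(unsigned int button_irq, void *button_dev)
{
	int ret;

	ret = request_irq(button_irq, button_isr,
			  IRQF_TRIGGER_FALLING | IRQF_DISABLED,
			  "button", button_dev);
	if (ret)
		return ret;
	/*
	 * On success, chip->set_type() was asked for a falling edge and that
	 * sense is now also cached in desc->status, which is what the new
	 * trigger-mismatch pr_warning() in __setup_irq() checks against.
	 */
	return 0;
}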
+ */ +int setup_irq(unsigned int irq, struct irqaction *act) +{ +	struct irq_desc *desc = irq_to_desc(irq); + +	return __setup_irq(irq, desc, act); +} + +/**   *	free_irq - free an interrupt   *	@irq: Interrupt line to free   *	@dev_id: Device identity to free @@ -499,15 +528,15 @@ mismatch:   */  void free_irq(unsigned int irq, void *dev_id)  { -	struct irq_desc *desc; +	struct irq_desc *desc = irq_to_desc(irq);  	struct irqaction **p;  	unsigned long flags;  	WARN_ON(in_interrupt()); -	if (irq >= NR_IRQS) + +	if (!desc)  		return; -	desc = irq_desc + irq;  	spin_lock_irqsave(&desc->lock, flags);  	p = &desc->action;  	for (;;) { @@ -596,12 +625,14 @@ EXPORT_SYMBOL(free_irq);   *	IRQF_SHARED		Interrupt is shared   *	IRQF_DISABLED	Disable local interrupts while processing   *	IRQF_SAMPLE_RANDOM	The interrupt can be used for entropy + *	IRQF_TRIGGER_*		Specify active edge(s) or level   *   */  int request_irq(unsigned int irq, irq_handler_t handler,  		unsigned long irqflags, const char *devname, void *dev_id)  {  	struct irqaction *action; +	struct irq_desc *desc;  	int retval;  #ifdef CONFIG_LOCKDEP @@ -618,9 +649,12 @@ int request_irq(unsigned int irq, irq_handler_t handler,  	 */  	if ((irqflags & IRQF_SHARED) && !dev_id)  		return -EINVAL; -	if (irq >= NR_IRQS) + +	desc = irq_to_desc(irq); +	if (!desc)  		return -EINVAL; -	if (irq_desc[irq].status & IRQ_NOREQUEST) + +	if (desc->status & IRQ_NOREQUEST)  		return -EINVAL;  	if (!handler)  		return -EINVAL; @@ -636,26 +670,29 @@ int request_irq(unsigned int irq, irq_handler_t handler,  	action->next = NULL;  	action->dev_id = dev_id; +	retval = __setup_irq(irq, desc, action); +	if (retval) +		kfree(action); +  #ifdef CONFIG_DEBUG_SHIRQ  	if (irqflags & IRQF_SHARED) {  		/*  		 * It's a shared IRQ -- the driver ought to be prepared for it  		 * to happen immediately, so let's make sure.... -		 * We do this before actually registering it, to make sure that -		 * a 'real' IRQ doesn't run in parallel with our fake +		 * We disable the irq to make sure that a 'real' IRQ doesn't +		 * run in parallel with our fake.  		 
*/  		unsigned long flags; +		disable_irq(irq);  		local_irq_save(flags); +  		handler(irq, dev_id); +  		local_irq_restore(flags); +		enable_irq(irq);  	}  #endif - -	retval = setup_irq(irq, action); -	if (retval) -		kfree(action); -  	return retval;  }  EXPORT_SYMBOL(request_irq); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 77b7acc875c..90b920d3f52 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -3,18 +3,18 @@  void set_pending_irq(unsigned int irq, cpumask_t mask)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags;  	spin_lock_irqsave(&desc->lock, flags);  	desc->status |= IRQ_MOVE_PENDING; -	irq_desc[irq].pending_mask = mask; +	desc->pending_mask = mask;  	spin_unlock_irqrestore(&desc->lock, flags);  }  void move_masked_irq(int irq)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	cpumask_t tmp;  	if (likely(!(desc->status & IRQ_MOVE_PENDING))) @@ -30,7 +30,7 @@ void move_masked_irq(int irq)  	desc->status &= ~IRQ_MOVE_PENDING; -	if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) +	if (unlikely(cpus_empty(desc->pending_mask)))  		return;  	if (!desc->chip->set_affinity) @@ -38,7 +38,7 @@ void move_masked_irq(int irq)  	assert_spin_locked(&desc->lock); -	cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); +	cpus_and(tmp, desc->pending_mask, cpu_online_map);  	/*  	 * If there was a valid mask to work with, please @@ -55,12 +55,12 @@ void move_masked_irq(int irq)  	if (likely(!cpus_empty(tmp))) {  		desc->chip->set_affinity(irq,tmp);  	} -	cpus_clear(irq_desc[irq].pending_mask); +	cpus_clear(desc->pending_mask);  }  void move_native_irq(int irq)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	if (likely(!(desc->status & IRQ_MOVE_PENDING)))  		return; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a09dd29c2fd..fac014a81b2 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir;  static int irq_affinity_proc_show(struct seq_file *m, void *v)  { -	struct irq_desc *desc = irq_desc + (long)m->private; +	struct irq_desc *desc = irq_to_desc((long)m->private);  	cpumask_t *mask = &desc->affinity;  #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -43,7 +43,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,  	cpumask_t new_value;  	int err; -	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || +	if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||  	    irq_balancing_disabled(irq))  		return -EIO; @@ -132,20 +132,20 @@ static const struct file_operations default_affinity_proc_fops = {  static int irq_spurious_read(char *page, char **start, off_t off,  				  int count, int *eof, void *data)  { -	struct irq_desc *d = &irq_desc[(long) data]; +	struct irq_desc *desc = irq_to_desc((long) data);  	return sprintf(page, "count %u\n"  			     "unhandled %u\n"  			     "last_unhandled %u ms\n", -			d->irq_count, -			d->irqs_unhandled, -			jiffies_to_msecs(d->last_unhandled)); +			desc->irq_count, +			desc->irqs_unhandled, +			jiffies_to_msecs(desc->last_unhandled));  }  #define MAX_NAMELEN 128  static int name_unique(unsigned int irq, struct irqaction *new_action)  { -	struct irq_desc *desc = irq_desc + irq; +	struct irq_desc *desc = irq_to_desc(irq);  	struct irqaction *action;  	unsigned long flags;  	int ret = 1; @@ -165,8 +165,9 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)  void 
register_handler_proc(unsigned int irq, struct irqaction *action)  {  	char name [MAX_NAMELEN]; +	struct irq_desc *desc = irq_to_desc(irq); -	if (!irq_desc[irq].dir || action->dir || !action->name || +	if (!desc->dir || action->dir || !action->name ||  					!name_unique(irq, action))  		return; @@ -174,36 +175,34 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)  	snprintf(name, MAX_NAMELEN, "%s", action->name);  	/* create /proc/irq/1234/handler/ */ -	action->dir = proc_mkdir(name, irq_desc[irq].dir); +	action->dir = proc_mkdir(name, desc->dir);  }  #undef MAX_NAMELEN  #define MAX_NAMELEN 10 -void register_irq_proc(unsigned int irq) +void register_irq_proc(unsigned int irq, struct irq_desc *desc)  {  	char name [MAX_NAMELEN];  	struct proc_dir_entry *entry; -	if (!root_irq_dir || -		(irq_desc[irq].chip == &no_irq_chip) || -			irq_desc[irq].dir) +	if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)  		return;  	memset(name, 0, MAX_NAMELEN);  	sprintf(name, "%d", irq);  	/* create /proc/irq/1234 */ -	irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); +	desc->dir = proc_mkdir(name, root_irq_dir);  #ifdef CONFIG_SMP  	/* create /proc/irq/<irq>/smp_affinity */ -	proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, +	proc_create_data("smp_affinity", 0600, desc->dir,  			 &irq_affinity_proc_fops, (void *)(long)irq);  #endif -	entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); +	entry = create_proc_entry("spurious", 0444, desc->dir);  	if (entry) {  		entry->data = (void *)(long)irq;  		entry->read_proc = irq_spurious_read; @@ -214,8 +213,11 @@ void register_irq_proc(unsigned int irq)  void unregister_handler_proc(unsigned int irq, struct irqaction *action)  { -	if (action->dir) -		remove_proc_entry(action->dir->name, irq_desc[irq].dir); +	if (action->dir) { +		struct irq_desc *desc = irq_to_desc(irq); + +		remove_proc_entry(action->dir->name, desc->dir); +	}  }  void register_default_affinity_proc(void) @@ -228,7 +230,8 @@ void register_default_affinity_proc(void)  void init_irq_proc(void)  { -	int i; +	unsigned int irq; +	struct irq_desc *desc;  	/* create /proc/irq */  	root_irq_dir = proc_mkdir("irq", NULL); @@ -240,7 +243,7 @@ void init_irq_proc(void)  	/*  	 * Create entries for all existing IRQs.  	 
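Annotation (not part of the patch): the init_irq_proc() loop that follows, like misrouted_irq() and poll_spurious_irqs() later in this diff, switches from iterating 0..NR_IRQS to the for_each_irq_desc() helper, which walks the interrupt descriptors themselves. A hedged sketch of the idiom; my_dump_irqs() is hypothetical.

#include <linux/irq.h>
#include <linux/kernel.h>

static void my_dump_irqs(void)
{
	struct irq_desc *desc;
	unsigned int irq;

	/* walks the descriptors directly instead of assuming a fixed NR_IRQS */
	for_each_irq_desc(irq, desc)
		printk(KERN_DEBUG "irq %u: status %08x\n", irq, desc->status);
}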
*/ -	for (i = 0; i < NR_IRQS; i++) -		register_irq_proc(i); +	for_each_irq_desc(irq, desc) +		register_irq_proc(irq, desc);  } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index a8046791ba2..89c7117acf2 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -33,10 +33,10 @@ static void resend_irqs(unsigned long arg)  	struct irq_desc *desc;  	int irq; -	while (!bitmap_empty(irqs_resend, NR_IRQS)) { -		irq = find_first_bit(irqs_resend, NR_IRQS); +	while (!bitmap_empty(irqs_resend, nr_irqs)) { +		irq = find_first_bit(irqs_resend, nr_irqs);  		clear_bit(irq, irqs_resend); -		desc = irq_desc + irq; +		desc = irq_to_desc(irq);  		local_irq_disable();  		desc->handle_irq(irq, desc);  		local_irq_enable(); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c66d3f10e85..dd364c11e56 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -12,83 +12,122 @@  #include <linux/kallsyms.h>  #include <linux/interrupt.h>  #include <linux/moduleparam.h> +#include <linux/timer.h>  static int irqfixup __read_mostly; +#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) +static void poll_spurious_irqs(unsigned long dummy); +static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); +  /*   * Recovery handler for misrouted interrupts.   */ -static int misrouted_irq(int irq) +static int try_one_irq(int irq, struct irq_desc *desc)  { -	int i; -	int ok = 0; -	int work = 0;	/* Did we do work for a real IRQ */ - -	for (i = 1; i < NR_IRQS; i++) { -		struct irq_desc *desc = irq_desc + i; -		struct irqaction *action; - -		if (i == irq)	/* Already tried */ -			continue; +	struct irqaction *action; +	int ok = 0, work = 0; -		spin_lock(&desc->lock); -		/* Already running on another processor */ -		if (desc->status & IRQ_INPROGRESS) { -			/* -			 * Already running: If it is shared get the other -			 * CPU to go looking for our mystery interrupt too -			 */ -			if (desc->action && (desc->action->flags & IRQF_SHARED)) -				desc->status |= IRQ_PENDING; -			spin_unlock(&desc->lock); -			continue; -		} -		/* Honour the normal IRQ locking */ -		desc->status |= IRQ_INPROGRESS; -		action = desc->action; +	spin_lock(&desc->lock); +	/* Already running on another processor */ +	if (desc->status & IRQ_INPROGRESS) { +		/* +		 * Already running: If it is shared get the other +		 * CPU to go looking for our mystery interrupt too +		 */ +		if (desc->action && (desc->action->flags & IRQF_SHARED)) +			desc->status |= IRQ_PENDING;  		spin_unlock(&desc->lock); +		return ok; +	} +	/* Honour the normal IRQ locking */ +	desc->status |= IRQ_INPROGRESS; +	action = desc->action; +	spin_unlock(&desc->lock); -		while (action) { -			/* Only shared IRQ handlers are safe to call */ -			if (action->flags & IRQF_SHARED) { -				if (action->handler(i, action->dev_id) == -						IRQ_HANDLED) -					ok = 1; -			} -			action = action->next; +	while (action) { +		/* Only shared IRQ handlers are safe to call */ +		if (action->flags & IRQF_SHARED) { +			if (action->handler(irq, action->dev_id) == +				IRQ_HANDLED) +				ok = 1;  		} -		local_irq_disable(); -		/* Now clean up the flags */ -		spin_lock(&desc->lock); -		action = desc->action; +		action = action->next; +	} +	local_irq_disable(); +	/* Now clean up the flags */ +	spin_lock(&desc->lock); +	action = desc->action; +	/* +	 * While we were looking for a fixup someone queued a real +	 * IRQ clashing with our walk: +	 */ +	while ((desc->status & IRQ_PENDING) && action) {  		/* -		 * While we were looking for a fixup someone queued a real -		 * IRQ clashing with our 
walk: -		 */ -		while ((desc->status & IRQ_PENDING) && action) { -			/* -			 * Perform real IRQ processing for the IRQ we deferred -			 */ -			work = 1; -			spin_unlock(&desc->lock); -			handle_IRQ_event(i, action); -			spin_lock(&desc->lock); -			desc->status &= ~IRQ_PENDING; -		} -		desc->status &= ~IRQ_INPROGRESS; -		/* -		 * If we did actual work for the real IRQ line we must let the -		 * IRQ controller clean up too +		 * Perform real IRQ processing for the IRQ we deferred  		 */ -		if (work && desc->chip && desc->chip->end) -			desc->chip->end(i); +		work = 1;  		spin_unlock(&desc->lock); +		handle_IRQ_event(irq, action); +		spin_lock(&desc->lock); +		desc->status &= ~IRQ_PENDING; +	} +	desc->status &= ~IRQ_INPROGRESS; +	/* +	 * If we did actual work for the real IRQ line we must let the +	 * IRQ controller clean up too +	 */ +	if (work && desc->chip && desc->chip->end) +		desc->chip->end(irq); +	spin_unlock(&desc->lock); + +	return ok; +} + +static int misrouted_irq(int irq) +{ +	struct irq_desc *desc; +	int i, ok = 0; + +	for_each_irq_desc(i, desc) { +		if (!i) +			 continue; + +		if (i == irq)	/* Already tried */ +			continue; + +		if (try_one_irq(i, desc)) +			ok = 1;  	}  	/* So the caller can adjust the irq error counts */  	return ok;  } +static void poll_spurious_irqs(unsigned long dummy) +{ +	struct irq_desc *desc; +	int i; + +	for_each_irq_desc(i, desc) { +		unsigned int status; + +		if (!i) +			 continue; + +		/* Racy but it doesn't matter */ +		status = desc->status; +		barrier(); +		if (!(status & IRQ_SPURIOUS_DISABLED)) +			continue; + +		try_one_irq(i, desc); +	} + +	mod_timer(&poll_spurious_irq_timer, +		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL); +} +  /*   * If 99,900 of the previous 100,000 interrupts have not been handled   * then assume that the IRQ is stuck in some manner. 
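Annotation (not part of the patch): poll_spurious_irqs() above is the standard self-rearming kernel timer pattern; note_interrupt() (just below) arms it when it disables a line as spurious, and the handler then re-arms itself every POLL_SPURIOUS_IRQ_INTERVAL. A stripped-down sketch of that pattern with hypothetical names:

#include <linux/timer.h>
#include <linux/jiffies.h>

static void my_poll(unsigned long data);
static DEFINE_TIMER(my_poll_timer, my_poll, 0, 0);

static void my_poll(unsigned long data)
{
	/* ... one polling pass over the objects of interest ... */
	mod_timer(&my_poll_timer, jiffies + HZ / 10);	/* re-arm ~100ms out */
}

static void my_start_polling(void)
{
	/* first arm, e.g. when a line gets disabled as spurious */
	mod_timer(&my_poll_timer, jiffies + HZ / 10);
}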
Drop a diagnostic @@ -137,7 +176,9 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)  	}  } -static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) +static inline int +try_misrouted_irq(unsigned int irq, struct irq_desc *desc, +		  irqreturn_t action_ret)  {  	struct irqaction *action; @@ -212,6 +253,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,  		desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;  		desc->depth++;  		desc->chip->disable(irq); + +		mod_timer(&poll_spurious_irq_timer, +			  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);  	}  	desc->irqs_unhandled = 0;  } @@ -241,7 +285,7 @@ static int __init irqfixup_setup(char *str)  __setup("irqfixup", irqfixup_setup);  module_param(irqfixup, int, 0644); -MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); +MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");  static int __init irqpoll_setup(char *str)  { diff --git a/kernel/itimer.c b/kernel/itimer.c index ab982747d9b..db7c358b9a0 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value)  		spin_unlock_irq(&tsk->sighand->siglock);  		break;  	case ITIMER_VIRTUAL: -		read_lock(&tasklist_lock);  		spin_lock_irq(&tsk->sighand->siglock);  		cval = tsk->signal->it_virt_expires;  		cinterval = tsk->signal->it_virt_incr;  		if (!cputime_eq(cval, cputime_zero)) { -			struct task_struct *t = tsk; -			cputime_t utime = tsk->signal->utime; -			do { -				utime = cputime_add(utime, t->utime); -				t = next_thread(t); -			} while (t != tsk); +			struct task_cputime cputime; +			cputime_t utime; + +			thread_group_cputime(tsk, &cputime); +			utime = cputime.utime;  			if (cputime_le(cval, utime)) { /* about to fire */  				cval = jiffies_to_cputime(1);  			} else { @@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value)  			}  		}  		spin_unlock_irq(&tsk->sighand->siglock); -		read_unlock(&tasklist_lock);  		cputime_to_timeval(cval, &value->it_value);  		cputime_to_timeval(cinterval, &value->it_interval);  		break;  	case ITIMER_PROF: -		read_lock(&tasklist_lock);  		spin_lock_irq(&tsk->sighand->siglock);  		cval = tsk->signal->it_prof_expires;  		cinterval = tsk->signal->it_prof_incr;  		if (!cputime_eq(cval, cputime_zero)) { -			struct task_struct *t = tsk; -			cputime_t ptime = cputime_add(tsk->signal->utime, -						      tsk->signal->stime); -			do { -				ptime = cputime_add(ptime, -						    cputime_add(t->utime, -								t->stime)); -				t = next_thread(t); -			} while (t != tsk); +			struct task_cputime times; +			cputime_t ptime; + +			thread_group_cputime(tsk, ×); +			ptime = cputime_add(times.utime, times.stime);  			if (cputime_le(cval, ptime)) { /* about to fire */  				cval = jiffies_to_cputime(1);  			} else { @@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value)  			}  		}  		spin_unlock_irq(&tsk->sighand->siglock); -		read_unlock(&tasklist_lock);  		cputime_to_timeval(cval, &value->it_value);  		cputime_to_timeval(cinterval, &value->it_interval);  		break; @@ -185,7 +176,6 @@ again:  	case ITIMER_VIRTUAL:  		nval = timeval_to_cputime(&value->it_value);  		ninterval = timeval_to_cputime(&value->it_interval); -		read_lock(&tasklist_lock);  		spin_lock_irq(&tsk->sighand->siglock);  		cval = tsk->signal->it_virt_expires;  		cinterval = tsk->signal->it_virt_incr; @@ -200,7 +190,6 @@ again:  		tsk->signal->it_virt_expires = nval;  		
tsk->signal->it_virt_incr = ninterval;  		spin_unlock_irq(&tsk->sighand->siglock); -		read_unlock(&tasklist_lock);  		if (ovalue) {  			cputime_to_timeval(cval, &ovalue->it_value);  			cputime_to_timeval(cinterval, &ovalue->it_interval); @@ -209,7 +198,6 @@ again:  	case ITIMER_PROF:  		nval = timeval_to_cputime(&value->it_value);  		ninterval = timeval_to_cputime(&value->it_interval); -		read_lock(&tasklist_lock);  		spin_lock_irq(&tsk->sighand->siglock);  		cval = tsk->signal->it_prof_expires;  		cinterval = tsk->signal->it_prof_incr; @@ -224,7 +212,6 @@ again:  		tsk->signal->it_prof_expires = nval;  		tsk->signal->it_prof_incr = ninterval;  		spin_unlock_irq(&tsk->sighand->siglock); -		read_unlock(&tasklist_lock);  		if (ovalue) {  			cputime_to_timeval(cval, &ovalue->it_value);  			cputime_to_timeval(cinterval, &ovalue->it_interval); diff --git a/kernel/kexec.c b/kernel/kexec.c index aef265325cd..ac0fde7b54d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -30,6 +30,7 @@  #include <linux/pm.h>  #include <linux/cpu.h>  #include <linux/console.h> +#include <linux/vmalloc.h>  #include <asm/page.h>  #include <asm/uaccess.h> @@ -1371,6 +1372,7 @@ static int __init crash_save_vmcoreinfo_init(void)  	VMCOREINFO_SYMBOL(node_online_map);  	VMCOREINFO_SYMBOL(swapper_pg_dir);  	VMCOREINFO_SYMBOL(_stext); +	VMCOREINFO_SYMBOL(vmlist);  #ifndef CONFIG_NEED_MULTIPLE_NODES  	VMCOREINFO_SYMBOL(mem_map); @@ -1406,6 +1408,7 @@ static int __init crash_save_vmcoreinfo_init(void)  	VMCOREINFO_OFFSET(free_area, free_list);  	VMCOREINFO_OFFSET(list_head, next);  	VMCOREINFO_OFFSET(list_head, prev); +	VMCOREINFO_OFFSET(vm_struct, addr);  	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);  	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);  	VMCOREINFO_NUMBER(NR_FREE_PAGES); diff --git a/kernel/kthread.c b/kernel/kthread.c index 96cff2f8710..8e7a7ce3ed0 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,6 +13,7 @@  #include <linux/file.h>  #include <linux/module.h>  #include <linux/mutex.h> +#include <trace/sched.h>  #define KTHREAD_NICE_LEVEL (-5) @@ -171,12 +172,11 @@ EXPORT_SYMBOL(kthread_create);   */  void kthread_bind(struct task_struct *k, unsigned int cpu)  { -	if (k->state != TASK_UNINTERRUPTIBLE) { +	/* Must have done schedule() in kthread() before we set_task_cpu */ +	if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {  		WARN_ON(1);  		return;  	} -	/* Must have done schedule() in kthread() before we set_task_cpu */ -	wait_task_inactive(k, 0);  	set_task_cpu(k, cpu);  	k->cpus_allowed = cpumask_of_cpu(cpu);  	k->rt.nr_cpus_allowed = 1; @@ -206,6 +206,8 @@ int kthread_stop(struct task_struct *k)  	/* It could exit after stop_info.k set, but before wake_up_process. */  	get_task_struct(k); +	trace_sched_kthread_stop(k); +  	/* Must init completion *before* thread sees kthread_stop_info.k */  	init_completion(&kthread_stop_info.done);  	smp_wmb(); @@ -221,6 +223,8 @@ int kthread_stop(struct task_struct *k)  	ret = kthread_stop_info.err;  	mutex_unlock(&kthread_stop_lock); +	trace_sched_kthread_stop_ret(ret); +  	return ret;  }  EXPORT_SYMBOL(kthread_stop); diff --git a/kernel/marker.c b/kernel/marker.c index 7d1faecd7a5..e9c6b2bc940 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -62,7 +62,7 @@ struct marker_entry {  	int refcount;	/* Number of times armed. 0 if disarmed. 
*/  	struct rcu_head rcu;  	void *oldptr; -	unsigned char rcu_pending:1; +	int rcu_pending;  	unsigned char ptype:1;  	char name[0];	/* Contains name'\0'format'\0' */  }; @@ -103,11 +103,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)  	char ptype;  	/* -	 * preempt_disable does two things : disabling preemption to make sure -	 * the teardown of the callbacks can be done correctly when they are in -	 * modules and they insure RCU read coherency. +	 * rcu_read_lock_sched does two things : disabling preemption to make +	 * sure the teardown of the callbacks can be done correctly when they +	 * are in modules and they insure RCU read coherency.  	 */ -	preempt_disable(); +	rcu_read_lock_sched();  	ptype = mdata->ptype;  	if (likely(!ptype)) {  		marker_probe_func *func; @@ -145,7 +145,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)  			va_end(args);  		}  	} -	preempt_enable(); +	rcu_read_unlock_sched();  }  EXPORT_SYMBOL_GPL(marker_probe_cb); @@ -162,7 +162,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)  	va_list args;	/* not initialized */  	char ptype; -	preempt_disable(); +	rcu_read_lock_sched();  	ptype = mdata->ptype;  	if (likely(!ptype)) {  		marker_probe_func *func; @@ -195,7 +195,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)  			multi[i].func(multi[i].probe_private, call_private,  				mdata->format, &args);  	} -	preempt_enable(); +	rcu_read_unlock_sched();  }  EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); @@ -560,7 +560,7 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,   * Disable a marker and its probe callback.   * Note: only waiting an RCU period after setting elem->call to the empty   * function insures that the original callback is not used anymore. This insured - * by preempt_disable around the call site. + * by rcu_read_lock_sched around the call site.   */  static void disable_marker(struct marker *elem)  { @@ -653,11 +653,17 @@ int marker_probe_register(const char *name, const char *format,  	entry = get_marker(name);  	if (!entry) {  		entry = add_marker(name, format); -		if (IS_ERR(entry)) { +		if (IS_ERR(entry))  			ret = PTR_ERR(entry); -			goto end; -		} +	} else if (format) { +		if (!entry->format) +			ret = marker_set_format(&entry, format); +		else if (strcmp(entry->format, format)) +			ret = -EPERM;  	} +	if (ret) +		goto end; +  	/*  	 * If we detect that a call_rcu is pending for this marker,  	 * make sure it's executed now. 
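Annotation (not part of the patch): the new branch in marker_probe_register() above makes the format string part of the marker's identity: the first registration that supplies a format records it, later registrations may pass NULL or the identical string, and a conflicting format is refused with -EPERM. A hedged usage sketch; the probe_a/b/c callbacks are hypothetical, the registration call is the one shown in the diff.

#include <linux/marker.h>

marker_probe_func probe_a, probe_b, probe_c;	/* hypothetical callbacks, defined elsewhere */

static int register_my_probes(void)
{
	int err;

	/* first registration records "value %d" for this marker */
	err = marker_probe_register("subsys_event", "value %d", probe_a, NULL);
	if (err)
		return err;
	/* same name, same format: accepted, both probes will be called */
	err = marker_probe_register("subsys_event", "value %d", probe_b, NULL);
	if (err)
		return err;
	/* same name, conflicting format: now rejected with -EPERM */
	err = marker_probe_register("subsys_event", "value %lu", probe_c, NULL);
	return err;
}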
@@ -674,6 +680,8 @@ int marker_probe_register(const char *name, const char *format,  	mutex_lock(&markers_mutex);  	entry = get_marker(name);  	WARN_ON(!entry); +	if (entry->rcu_pending) +		rcu_barrier_sched();  	entry->oldptr = old;  	entry->rcu_pending = 1;  	/* write rcu_pending before calling the RCU callback */ @@ -717,6 +725,8 @@ int marker_probe_unregister(const char *name,  	entry = get_marker(name);  	if (!entry)  		goto end; +	if (entry->rcu_pending) +		rcu_barrier_sched();  	entry->oldptr = old;  	entry->rcu_pending = 1;  	/* write rcu_pending before calling the RCU callback */ @@ -795,6 +805,8 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,  	mutex_lock(&markers_mutex);  	entry = get_marker_from_private_data(probe, probe_private);  	WARN_ON(!entry); +	if (entry->rcu_pending) +		rcu_barrier_sched();  	entry->oldptr = old;  	entry->rcu_pending = 1;  	/* write rcu_pending before calling the RCU callback */ diff --git a/kernel/module.c b/kernel/module.c index b7205f67cfa..0d8d21ee792 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -46,6 +46,8 @@  #include <asm/cacheflush.h>  #include <linux/license.h>  #include <asm/sections.h> +#include <linux/tracepoint.h> +#include <linux/ftrace.h>  #if 0  #define DEBUGP printk @@ -1430,6 +1432,9 @@ static void free_module(struct module *mod)  	/* Module unload stuff */  	module_unload_free(mod); +	/* release any pointers to mcount in this module */ +	ftrace_release(mod->module_core, mod->core_size); +  	/* This may be NULL, but that's OK */  	module_free(mod, mod->module_init);  	kfree(mod->args); @@ -1834,6 +1839,7 @@ static noinline struct module *load_module(void __user *umod,  	Elf_Ehdr *hdr;  	Elf_Shdr *sechdrs;  	char *secstrings, *args, *modmagic, *strtab = NULL; +	char *staging;  	unsigned int i;  	unsigned int symindex = 0;  	unsigned int strindex = 0; @@ -1860,9 +1866,13 @@ static noinline struct module *load_module(void __user *umod,  	unsigned int markersindex;  	unsigned int markersstringsindex;  	unsigned int verboseindex; +	unsigned int tracepointsindex; +	unsigned int tracepointsstringsindex; +	unsigned int mcountindex;  	struct module *mod;  	long err = 0;  	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ +	void *mseg;  	struct exception_table_entry *extable;  	mm_segment_t old_fs; @@ -1989,6 +1999,14 @@ static noinline struct module *load_module(void __user *umod,  		goto free_hdr;  	} +	staging = get_modinfo(sechdrs, infoindex, "staging"); +	if (staging) { +		add_taint_module(mod, TAINT_CRAP); +		printk(KERN_WARNING "%s: module is from the staging directory," +		       " the quality is unknown, you have been warned.\n", +		       mod->name); +	} +  	/* Now copy in args */  	args = strndup_user(uargs, ~0UL >> 1);  	if (IS_ERR(args)) { @@ -2147,6 +2165,12 @@ static noinline struct module *load_module(void __user *umod,   	markersstringsindex = find_sec(hdr, sechdrs, secstrings,  					"__markers_strings");  	verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); +	tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints"); +	tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings, +					"__tracepoints_strings"); + +	mcountindex = find_sec(hdr, sechdrs, secstrings, +			       "__mcount_loc");  	/* Now do relocations. 
*/  	for (i = 1; i < hdr->e_shnum; i++) { @@ -2174,6 +2198,12 @@ static noinline struct module *load_module(void __user *umod,  	mod->num_markers =  		sechdrs[markersindex].sh_size / sizeof(*mod->markers);  #endif +#ifdef CONFIG_TRACEPOINTS +	mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr; +	mod->num_tracepoints = +		sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints); +#endif +          /* Find duplicate symbols */  	err = verify_export_symbols(mod); @@ -2192,12 +2222,22 @@ static noinline struct module *load_module(void __user *umod,  	add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); +	if (!mod->taints) {  #ifdef CONFIG_MARKERS -	if (!mod->taints)  		marker_update_probe_range(mod->markers,  			mod->markers + mod->num_markers);  #endif  	dynamic_printk_setup(sechdrs, verboseindex); +#ifdef CONFIG_TRACEPOINTS +		tracepoint_update_probe_range(mod->tracepoints, +			mod->tracepoints + mod->num_tracepoints); +#endif +	} + +	/* sechdrs[0].sh_size is always zero */ +	mseg = (void *)sechdrs[mcountindex].sh_addr; +	ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); +  	err = module_finalize(hdr, sechdrs, mod);  	if (err < 0)  		goto cleanup; @@ -2267,6 +2307,7 @@ static noinline struct module *load_module(void __user *umod,   cleanup:  	kobject_del(&mod->mkobj.kobj);  	kobject_put(&mod->mkobj.kobj); +	ftrace_release(mod->module_core, mod->core_size);   free_unload:  	module_unload_free(mod);  	module_free(mod, mod->module_init); @@ -2587,6 +2628,8 @@ static char *module_flags(struct module *mod, char *buf)  			buf[bx++] = 'P';  		if (mod->taints & (1 << TAINT_FORCED_MODULE))  			buf[bx++] = 'F'; +		if (mod->taints & (1 << TAINT_CRAP)) +			buf[bx++] = 'C';  		/*  		 * TAINT_FORCED_RMMOD: could be added.  		 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't @@ -2748,3 +2791,50 @@ void module_update_markers(void)  	mutex_unlock(&module_mutex);  }  #endif + +#ifdef CONFIG_TRACEPOINTS +void module_update_tracepoints(void) +{ +	struct module *mod; + +	mutex_lock(&module_mutex); +	list_for_each_entry(mod, &modules, list) +		if (!mod->taints) +			tracepoint_update_probe_range(mod->tracepoints, +				mod->tracepoints + mod->num_tracepoints); +	mutex_unlock(&module_mutex); +} + +/* + * Returns 0 if current not found. + * Returns 1 if current found. 
+ */ +int module_get_iter_tracepoints(struct tracepoint_iter *iter) +{ +	struct module *iter_mod; +	int found = 0; + +	mutex_lock(&module_mutex); +	list_for_each_entry(iter_mod, &modules, list) { +		if (!iter_mod->taints) { +			/* +			 * Sorted module list +			 */ +			if (iter_mod < iter->module) +				continue; +			else if (iter_mod > iter->module) +				iter->tracepoint = NULL; +			found = tracepoint_get_iter_range(&iter->tracepoint, +				iter_mod->tracepoints, +				iter_mod->tracepoints +					+ iter_mod->num_tracepoints); +			if (found) { +				iter->module = iter_mod; +				break; +			} +		} +	} +	mutex_unlock(&module_mutex); +	return found; +} +#endif diff --git a/kernel/notifier.c b/kernel/notifier.c index 823be11584e..4282c0a40a5 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -550,7 +550,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);  static ATOMIC_NOTIFIER_HEAD(die_chain); -int notify_die(enum die_val val, const char *str, +int notrace notify_die(enum die_val val, const char *str,  	       struct pt_regs *regs, long err, int trap, int sig)  {  	struct die_args args = { diff --git a/kernel/panic.c b/kernel/panic.c index f290e8e866f..bda561ef3cd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -161,6 +161,7 @@ static const struct tnt tnts[] = {  	{ TAINT_DIE, 'D', ' ' },  	{ TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },  	{ TAINT_WARN, 'W', ' ' }, +	{ TAINT_CRAP, 'C', ' ' },  };  /** @@ -175,6 +176,7 @@ static const struct tnt tnts[] = {   *  'U' - Userspace-defined naughtiness.   *  'A' - ACPI table overridden.   *  'W' - Taint on warning. + *  'C' - modules from drivers/staging are loaded.   *   *	The string is overwritten by the next call to print_taint().   */ diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c42a03aef36..153dcb2639c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -7,6 +7,93 @@  #include <linux/errno.h>  #include <linux/math64.h>  #include <asm/uaccess.h> +#include <linux/kernel_stat.h> + +/* + * Allocate the thread_group_cputime structure appropriately and fill in the + * current values of the fields.  Called from copy_signal() via + * thread_group_cputime_clone_thread() when adding a second or subsequent + * thread to a thread group.  Assumes interrupts are enabled when called. + */ +int thread_group_cputime_alloc(struct task_struct *tsk) +{ +	struct signal_struct *sig = tsk->signal; +	struct task_cputime *cputime; + +	/* +	 * If we have multiple threads and we don't already have a +	 * per-CPU task_cputime struct (checked in the caller), allocate +	 * one and fill it in with the times accumulated so far.  We may +	 * race with another thread so recheck after we pick up the sighand +	 * lock. +	 */ +	cputime = alloc_percpu(struct task_cputime); +	if (cputime == NULL) +		return -ENOMEM; +	spin_lock_irq(&tsk->sighand->siglock); +	if (sig->cputime.totals) { +		spin_unlock_irq(&tsk->sighand->siglock); +		free_percpu(cputime); +		return 0; +	} +	sig->cputime.totals = cputime; +	cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id()); +	cputime->utime = tsk->utime; +	cputime->stime = tsk->stime; +	cputime->sum_exec_runtime = tsk->se.sum_exec_runtime; +	spin_unlock_irq(&tsk->sighand->siglock); +	return 0; +} + +/** + * thread_group_cputime - Sum the thread group time fields across all CPUs. + * + * @tsk:	The task we use to identify the thread group. + * @times:	task_cputime structure in which we return the summed fields. 
+ * + * Walk the list of CPUs to sum the per-CPU time fields in the thread group + * time structure. + */ +void thread_group_cputime( +	struct task_struct *tsk, +	struct task_cputime *times) +{ +	struct signal_struct *sig; +	int i; +	struct task_cputime *tot; + +	sig = tsk->signal; +	if (unlikely(!sig) || !sig->cputime.totals) { +		times->utime = tsk->utime; +		times->stime = tsk->stime; +		times->sum_exec_runtime = tsk->se.sum_exec_runtime; +		return; +	} +	times->stime = times->utime = cputime_zero; +	times->sum_exec_runtime = 0; +	for_each_possible_cpu(i) { +		tot = per_cpu_ptr(tsk->signal->cputime.totals, i); +		times->utime = cputime_add(times->utime, tot->utime); +		times->stime = cputime_add(times->stime, tot->stime); +		times->sum_exec_runtime += tot->sum_exec_runtime; +	} +} + +/* + * Called after updating RLIMIT_CPU to set timer expiration if necessary. + */ +void update_rlimit_cpu(unsigned long rlim_new) +{ +	cputime_t cputime; + +	cputime = secs_to_cputime(rlim_new); +	if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || +	    cputime_lt(current->signal->it_prof_expires, cputime)) { +		spin_lock_irq(¤t->sighand->siglock); +		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); +		spin_unlock_irq(¤t->sighand->siglock); +	} +}  static int check_clock(const clockid_t which_clock)  { @@ -158,10 +245,6 @@ static inline cputime_t virt_ticks(struct task_struct *p)  {  	return p->utime;  } -static inline unsigned long long sched_ns(struct task_struct *p) -{ -	return task_sched_runtime(p); -}  int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)  { @@ -211,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,  		cpu->cpu = virt_ticks(p);  		break;  	case CPUCLOCK_SCHED: -		cpu->sched = sched_ns(p); +		cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);  		break;  	}  	return 0; @@ -220,59 +303,30 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,  /*   * Sample a process (thread group) clock for the given group_leader task.   * Must be called with tasklist_lock held for reading. - * Must be called with tasklist_lock held for reading, and p->sighand->siglock.   */ -static int cpu_clock_sample_group_locked(unsigned int clock_idx, -					 struct task_struct *p, -					 union cpu_time_count *cpu) +static int cpu_clock_sample_group(const clockid_t which_clock, +				  struct task_struct *p, +				  union cpu_time_count *cpu)  { -	struct task_struct *t = p; - 	switch (clock_idx) { +	struct task_cputime cputime; + +	thread_group_cputime(p, &cputime); +	switch (which_clock) {  	default:  		return -EINVAL;  	case CPUCLOCK_PROF: -		cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); -		do { -			cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t)); -			t = next_thread(t); -		} while (t != p); +		cpu->cpu = cputime_add(cputime.utime, cputime.stime);  		break;  	case CPUCLOCK_VIRT: -		cpu->cpu = p->signal->utime; -		do { -			cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t)); -			t = next_thread(t); -		} while (t != p); +		cpu->cpu = cputime.utime;  		break;  	case CPUCLOCK_SCHED: -		cpu->sched = p->signal->sum_sched_runtime; -		/* Add in each other live thread.  */ -		while ((t = next_thread(t)) != p) { -			cpu->sched += t->se.sum_exec_runtime; -		} -		cpu->sched += sched_ns(p); +		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);  		break;  	}  	return 0;  } -/* - * Sample a process (thread group) clock for the given group_leader task. 
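Annotation (not part of the patch): thread_group_cputime() defined above replaces the old pattern of walking every thread with next_thread() under tasklist_lock (do_getitimer(), the group clock samples and check_process_timers() all used it); a caller now takes one snapshot summed from the per-CPU totals. A minimal usage sketch; the printing helper is hypothetical.

#include <linux/sched.h>
#include <linux/kernel.h>

static void show_group_times(struct task_struct *tsk)
{
	struct task_cputime totals;

	thread_group_cputime(tsk, &totals);	/* sums the per-CPU accumulators */
	printk(KERN_DEBUG "%s: utime %lu stime %lu sched %llu ns\n",
	       tsk->comm,
	       (unsigned long)cputime_to_jiffies(totals.utime),
	       (unsigned long)cputime_to_jiffies(totals.stime),
	       (unsigned long long)totals.sum_exec_runtime);
}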
- * Must be called with tasklist_lock held for reading. - */ -static int cpu_clock_sample_group(const clockid_t which_clock, -				  struct task_struct *p, -				  union cpu_time_count *cpu) -{ -	int ret; -	unsigned long flags; -	spin_lock_irqsave(&p->sighand->siglock, flags); -	ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p, -					    cpu); -	spin_unlock_irqrestore(&p->sighand->siglock, flags); -	return ret; -} -  int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)  { @@ -471,80 +525,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk)  }  void posix_cpu_timers_exit_group(struct task_struct *tsk)  { -	cleanup_timers(tsk->signal->cpu_timers, -		       cputime_add(tsk->utime, tsk->signal->utime), -		       cputime_add(tsk->stime, tsk->signal->stime), -		     tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); -} +	struct task_cputime cputime; - -/* - * Set the expiry times of all the threads in the process so one of them - * will go off before the process cumulative expiry total is reached. - */ -static void process_timer_rebalance(struct task_struct *p, -				    unsigned int clock_idx, -				    union cpu_time_count expires, -				    union cpu_time_count val) -{ -	cputime_t ticks, left; -	unsigned long long ns, nsleft; - 	struct task_struct *t = p; -	unsigned int nthreads = atomic_read(&p->signal->live); - -	if (!nthreads) -		return; - -	switch (clock_idx) { -	default: -		BUG(); -		break; -	case CPUCLOCK_PROF: -		left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), -				       nthreads); -		do { -			if (likely(!(t->flags & PF_EXITING))) { -				ticks = cputime_add(prof_ticks(t), left); -				if (cputime_eq(t->it_prof_expires, -					       cputime_zero) || -				    cputime_gt(t->it_prof_expires, ticks)) { -					t->it_prof_expires = ticks; -				} -			} -			t = next_thread(t); -		} while (t != p); -		break; -	case CPUCLOCK_VIRT: -		left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), -				       nthreads); -		do { -			if (likely(!(t->flags & PF_EXITING))) { -				ticks = cputime_add(virt_ticks(t), left); -				if (cputime_eq(t->it_virt_expires, -					       cputime_zero) || -				    cputime_gt(t->it_virt_expires, ticks)) { -					t->it_virt_expires = ticks; -				} -			} -			t = next_thread(t); -		} while (t != p); -		break; -	case CPUCLOCK_SCHED: -		nsleft = expires.sched - val.sched; -		do_div(nsleft, nthreads); -		nsleft = max_t(unsigned long long, nsleft, 1); -		do { -			if (likely(!(t->flags & PF_EXITING))) { -				ns = t->se.sum_exec_runtime + nsleft; -				if (t->it_sched_expires == 0 || -				    t->it_sched_expires > ns) { -					t->it_sched_expires = ns; -				} -			} -			t = next_thread(t); -		} while (t != p); -		break; -	} +	thread_group_cputime(tsk, &cputime); +	cleanup_timers(tsk->signal->cpu_timers, +		       cputime.utime, cputime.stime, cputime.sum_exec_runtime);  }  static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) @@ -608,29 +593,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)  			default:  				BUG();  			case CPUCLOCK_PROF: -				if (cputime_eq(p->it_prof_expires, +				if (cputime_eq(p->cputime_expires.prof_exp,  					       cputime_zero) || -				    cputime_gt(p->it_prof_expires, +				    cputime_gt(p->cputime_expires.prof_exp,  					       nt->expires.cpu)) -					p->it_prof_expires = nt->expires.cpu; +					p->cputime_expires.prof_exp = +						nt->expires.cpu;  				break;  			case CPUCLOCK_VIRT: -				if (cputime_eq(p->it_virt_expires, +				if 
(cputime_eq(p->cputime_expires.virt_exp,  					       cputime_zero) || -				    cputime_gt(p->it_virt_expires, +				    cputime_gt(p->cputime_expires.virt_exp,  					       nt->expires.cpu)) -					p->it_virt_expires = nt->expires.cpu; +					p->cputime_expires.virt_exp = +						nt->expires.cpu;  				break;  			case CPUCLOCK_SCHED: -				if (p->it_sched_expires == 0 || -				    p->it_sched_expires > nt->expires.sched) -					p->it_sched_expires = nt->expires.sched; +				if (p->cputime_expires.sched_exp == 0 || +				    p->cputime_expires.sched_exp > +							nt->expires.sched) +					p->cputime_expires.sched_exp = +						nt->expires.sched;  				break;  			}  		} else {  			/* -			 * For a process timer, we must balance -			 * all the live threads' expirations. +			 * For a process timer, set the cached expiration time.  			 */  			switch (CPUCLOCK_WHICH(timer->it_clock)) {  			default: @@ -641,7 +629,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)  				    cputime_lt(p->signal->it_virt_expires,  					       timer->it.cpu.expires.cpu))  					break; -				goto rebalance; +				p->signal->cputime_expires.virt_exp = +					timer->it.cpu.expires.cpu; +				break;  			case CPUCLOCK_PROF:  				if (!cputime_eq(p->signal->it_prof_expires,  						cputime_zero) && @@ -652,13 +642,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)  				if (i != RLIM_INFINITY &&  				    i <= cputime_to_secs(timer->it.cpu.expires.cpu))  					break; -				goto rebalance; +				p->signal->cputime_expires.prof_exp = +					timer->it.cpu.expires.cpu; +				break;  			case CPUCLOCK_SCHED: -			rebalance: -				process_timer_rebalance( -					timer->it.cpu.task, -					CPUCLOCK_WHICH(timer->it_clock), -					timer->it.cpu.expires, now); +				p->signal->cputime_expires.sched_exp = +					timer->it.cpu.expires.sched;  				break;  			}  		} @@ -969,13 +958,13 @@ static void check_thread_timers(struct task_struct *tsk,  	struct signal_struct *const sig = tsk->signal;  	maxfire = 20; -	tsk->it_prof_expires = cputime_zero; +	tsk->cputime_expires.prof_exp = cputime_zero;  	while (!list_empty(timers)) {  		struct cpu_timer_list *t = list_first_entry(timers,  						      struct cpu_timer_list,  						      entry);  		if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { -			tsk->it_prof_expires = t->expires.cpu; +			tsk->cputime_expires.prof_exp = t->expires.cpu;  			break;  		}  		t->firing = 1; @@ -984,13 +973,13 @@ static void check_thread_timers(struct task_struct *tsk,  	++timers;  	maxfire = 20; -	tsk->it_virt_expires = cputime_zero; +	tsk->cputime_expires.virt_exp = cputime_zero;  	while (!list_empty(timers)) {  		struct cpu_timer_list *t = list_first_entry(timers,  						      struct cpu_timer_list,  						      entry);  		if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { -			tsk->it_virt_expires = t->expires.cpu; +			tsk->cputime_expires.virt_exp = t->expires.cpu;  			break;  		}  		t->firing = 1; @@ -999,13 +988,13 @@ static void check_thread_timers(struct task_struct *tsk,  	++timers;  	maxfire = 20; -	tsk->it_sched_expires = 0; +	tsk->cputime_expires.sched_exp = 0;  	while (!list_empty(timers)) {  		struct cpu_timer_list *t = list_first_entry(timers,  						      struct cpu_timer_list,  						      entry);  		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { -			tsk->it_sched_expires = t->expires.sched; +			tsk->cputime_expires.sched_exp = t->expires.sched;  			break;  		}  		t->firing = 1; @@ -1055,10 +1044,10 @@ static void 
check_process_timers(struct task_struct *tsk,  {  	int maxfire;  	struct signal_struct *const sig = tsk->signal; -	cputime_t utime, stime, ptime, virt_expires, prof_expires; +	cputime_t utime, ptime, virt_expires, prof_expires;  	unsigned long long sum_sched_runtime, sched_expires; -	struct task_struct *t;  	struct list_head *timers = sig->cpu_timers; +	struct task_cputime cputime;  	/*  	 * Don't sample the current process CPU clocks if there are no timers. @@ -1074,18 +1063,10 @@ static void check_process_timers(struct task_struct *tsk,  	/*  	 * Collect the current process totals.  	 */ -	utime = sig->utime; -	stime = sig->stime; -	sum_sched_runtime = sig->sum_sched_runtime; -	t = tsk; -	do { -		utime = cputime_add(utime, t->utime); -		stime = cputime_add(stime, t->stime); -		sum_sched_runtime += t->se.sum_exec_runtime; -		t = next_thread(t); -	} while (t != tsk); -	ptime = cputime_add(utime, stime); - +	thread_group_cputime(tsk, &cputime); +	utime = cputime.utime; +	ptime = cputime_add(utime, cputime.stime); +	sum_sched_runtime = cputime.sum_exec_runtime;  	maxfire = 20;  	prof_expires = cputime_zero;  	while (!list_empty(timers)) { @@ -1193,60 +1174,18 @@ static void check_process_timers(struct task_struct *tsk,  		}  	} -	if (!cputime_eq(prof_expires, cputime_zero) || -	    !cputime_eq(virt_expires, cputime_zero) || -	    sched_expires != 0) { -		/* -		 * Rebalance the threads' expiry times for the remaining -		 * process CPU timers. -		 */ - -		cputime_t prof_left, virt_left, ticks; -		unsigned long long sched_left, sched; -		const unsigned int nthreads = atomic_read(&sig->live); - -		if (!nthreads) -			return; - -		prof_left = cputime_sub(prof_expires, utime); -		prof_left = cputime_sub(prof_left, stime); -		prof_left = cputime_div_non_zero(prof_left, nthreads); -		virt_left = cputime_sub(virt_expires, utime); -		virt_left = cputime_div_non_zero(virt_left, nthreads); -		if (sched_expires) { -			sched_left = sched_expires - sum_sched_runtime; -			do_div(sched_left, nthreads); -			sched_left = max_t(unsigned long long, sched_left, 1); -		} else { -			sched_left = 0; -		} -		t = tsk; -		do { -			if (unlikely(t->flags & PF_EXITING)) -				continue; - -			ticks = cputime_add(cputime_add(t->utime, t->stime), -					    prof_left); -			if (!cputime_eq(prof_expires, cputime_zero) && -			    (cputime_eq(t->it_prof_expires, cputime_zero) || -			     cputime_gt(t->it_prof_expires, ticks))) { -				t->it_prof_expires = ticks; -			} - -			ticks = cputime_add(t->utime, virt_left); -			if (!cputime_eq(virt_expires, cputime_zero) && -			    (cputime_eq(t->it_virt_expires, cputime_zero) || -			     cputime_gt(t->it_virt_expires, ticks))) { -				t->it_virt_expires = ticks; -			} - -			sched = t->se.sum_exec_runtime + sched_left; -			if (sched_expires && (t->it_sched_expires == 0 || -					      t->it_sched_expires > sched)) { -				t->it_sched_expires = sched; -			} -		} while ((t = next_thread(t)) != tsk); -	} +	if (!cputime_eq(prof_expires, cputime_zero) && +	    (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || +	     cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) +		sig->cputime_expires.prof_exp = prof_expires; +	if (!cputime_eq(virt_expires, cputime_zero) && +	    (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) || +	     cputime_gt(sig->cputime_expires.virt_exp, virt_expires))) +		sig->cputime_expires.virt_exp = virt_expires; +	if (sched_expires != 0 && +	    (sig->cputime_expires.sched_exp == 0 || +	     sig->cputime_expires.sched_exp > sched_expires)) +		
sig->cputime_expires.sched_exp = sched_expires;  }  /* @@ -1314,6 +1253,86 @@ out:  	++timer->it_requeue_pending;  } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime:	The struct to compare. + * + * Checks @cputime to see if all fields are zero.  Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ +	if (cputime_eq(cputime->utime, cputime_zero) && +	    cputime_eq(cputime->stime, cputime_zero) && +	    cputime->sum_exec_runtime == 0) +		return 1; +	return 0; +} + +/** + * task_cputime_expired - Compare two task_cputime entities. + * + * @sample:	The task_cputime structure to be checked for expiration. + * @expires:	Expiration times, against which @sample will be checked. + * + * Checks @sample against @expires to see if any field of @sample has expired. + * Returns true if any field of the former is greater than the corresponding + * field of the latter if the latter field is set.  Otherwise returns false. + */ +static inline int task_cputime_expired(const struct task_cputime *sample, +					const struct task_cputime *expires) +{ +	if (!cputime_eq(expires->utime, cputime_zero) && +	    cputime_ge(sample->utime, expires->utime)) +		return 1; +	if (!cputime_eq(expires->stime, cputime_zero) && +	    cputime_ge(cputime_add(sample->utime, sample->stime), +		       expires->stime)) +		return 1; +	if (expires->sum_exec_runtime != 0 && +	    sample->sum_exec_runtime >= expires->sum_exec_runtime) +		return 1; +	return 0; +} + +/** + * fastpath_timer_check - POSIX CPU timers fast path. + * + * @tsk:	The task (thread) being checked. + * + * Check the task and thread group timers.  If both are zero (there are no + * timers set) return false.  Otherwise snapshot the task and thread group + * timers and compare them with the corresponding expiration times.  Return + * true if a timer has expired, else return false. + */ +static inline int fastpath_timer_check(struct task_struct *tsk) +{ +	struct signal_struct *sig = tsk->signal; + +	if (unlikely(!sig)) +		return 0; + +	if (!task_cputime_zero(&tsk->cputime_expires)) { +		struct task_cputime task_sample = { +			.utime = tsk->utime, +			.stime = tsk->stime, +			.sum_exec_runtime = tsk->se.sum_exec_runtime +		}; + +		if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) +			return 1; +	} +	if (!task_cputime_zero(&sig->cputime_expires)) { +		struct task_cputime group_sample; + +		thread_group_cputime(tsk, &group_sample); +		if (task_cputime_expired(&group_sample, &sig->cputime_expires)) +			return 1; +	} +	return 0; +} +  /*   * This is called from the timer interrupt handler.  The irq handler has   * already updated our counts.  We need to check if any timers fire now. @@ -1326,42 +1345,31 @@ void run_posix_cpu_timers(struct task_struct *tsk)  	BUG_ON(!irqs_disabled()); -#define UNEXPIRED(clock) \ -		(cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ -		 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) - -	if (UNEXPIRED(prof) && UNEXPIRED(virt) && -	    (tsk->it_sched_expires == 0 || -	     tsk->se.sum_exec_runtime < tsk->it_sched_expires)) +	/* +	 * The fast path checks that there are no expired thread or thread +	 * group timers.  If that's so, just return. +	 */ +	if (!fastpath_timer_check(tsk))  		return; -#undef	UNEXPIRED - +	spin_lock(&tsk->sighand->siglock);  	/* -	 * Double-check with locks held. 
+	 * Here we take off tsk->signal->cpu_timers[N] and +	 * tsk->cpu_timers[N] all the timers that are firing, and +	 * put them on the firing list.  	 */ -	read_lock(&tasklist_lock); -	if (likely(tsk->signal != NULL)) { -		spin_lock(&tsk->sighand->siglock); +	check_thread_timers(tsk, &firing); +	check_process_timers(tsk, &firing); -		/* -		 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] -		 * all the timers that are firing, and put them on the firing list. -		 */ -		check_thread_timers(tsk, &firing); -		check_process_timers(tsk, &firing); - -		/* -		 * We must release these locks before taking any timer's lock. -		 * There is a potential race with timer deletion here, as the -		 * siglock now protects our private firing list.  We have set -		 * the firing flag in each timer, so that a deletion attempt -		 * that gets the timer lock before we do will give it up and -		 * spin until we've taken care of that timer below. -		 */ -		spin_unlock(&tsk->sighand->siglock); -	} -	read_unlock(&tasklist_lock); +	/* +	 * We must release these locks before taking any timer's lock. +	 * There is a potential race with timer deletion here, as the +	 * siglock now protects our private firing list.  We have set +	 * the firing flag in each timer, so that a deletion attempt +	 * that gets the timer lock before we do will give it up and +	 * spin until we've taken care of that timer below. +	 */ +	spin_unlock(&tsk->sighand->siglock);  	/*  	 * Now that all the timers on our list have the firing flag, @@ -1389,10 +1397,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)  /*   * Set one of the process-wide special case CPU timers. - * The tasklist_lock and tsk->sighand->siglock must be held by the caller. - * The oldval argument is null for the RLIMIT_CPU timer, where *newval is - * absolute; non-null for ITIMER_*, where *newval is relative and we update - * it to be absolute, *oldval is absolute and we update it to be relative. + * The tsk->sighand->siglock must be held by the caller. + * The *newval argument is relative and we update it to be absolute, *oldval + * is absolute and we update it to be relative.   */  void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,  			   cputime_t *newval, cputime_t *oldval) @@ -1401,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,  	struct list_head *head;  	BUG_ON(clock_idx == CPUCLOCK_SCHED); -	cpu_clock_sample_group_locked(clock_idx, tsk, &now); +	cpu_clock_sample_group(clock_idx, tsk, &now);  	if (oldval) {  		if (!cputime_eq(*oldval, cputime_zero)) { @@ -1435,13 +1442,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,  	    cputime_ge(list_first_entry(head,  				  struct cpu_timer_list, entry)->expires.cpu,  		       *newval)) { -		/* -		 * Rejigger each thread's expiry time so that one will -		 * notice before we hit the process-cumulative expiry time. 
-		 */ -		union cpu_time_count expires = { .sched = 0 }; -		expires.cpu = *newval; -		process_timer_rebalance(tsk, clock_idx, expires, now); +		switch (clock_idx) { +		case CPUCLOCK_PROF: +			tsk->signal->cputime_expires.prof_exp = *newval; +			break; +		case CPUCLOCK_VIRT: +			tsk->signal->cputime_expires.virt_exp = *newval; +			break; +		}  	}  } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ee204586149..5e79c662294 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -223,6 +223,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)  }  /* + * Get monotonic time for posix timers + */ +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +{ +	getrawmonotonic(tp); +	return 0; +} + +/*   * Initialize everything, well, just everything in Posix clocks/timers ;)   */  static __init int init_posix_timers(void) @@ -235,9 +244,15 @@ static __init int init_posix_timers(void)  		.clock_get = posix_ktime_get_ts,  		.clock_set = do_posix_clock_nosettime,  	}; +	struct k_clock clock_monotonic_raw = { +		.clock_getres = hrtimer_get_res, +		.clock_get = posix_get_monotonic_raw, +		.clock_set = do_posix_clock_nosettime, +	};  	register_posix_clock(CLOCK_REALTIME, &clock_realtime);  	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); +	register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);  	posix_timers_cache = kmem_cache_create("posix_timers_cache",  					sizeof (struct k_itimer), 0, SLAB_PANIC, @@ -298,6 +313,7 @@ void do_schedule_next_timer(struct siginfo *info)  int posix_timer_event(struct k_itimer *timr, int si_private)  { +	int shared, ret;  	/*  	 * FIXME: if ->sigq is queued we can race with  	 * dequeue_signal()->do_schedule_next_timer(). @@ -311,25 +327,10 @@ int posix_timer_event(struct k_itimer *timr, int si_private)  	 */  	timr->sigq->info.si_sys_private = si_private; -	timr->sigq->info.si_signo = timr->it_sigev_signo; -	timr->sigq->info.si_code = SI_TIMER; -	timr->sigq->info.si_tid = timr->it_id; -	timr->sigq->info.si_value = timr->it_sigev_value; - -	if (timr->it_sigev_notify & SIGEV_THREAD_ID) { -		struct task_struct *leader; -		int ret = send_sigqueue(timr->sigq, timr->it_process, 0); - -		if (likely(ret >= 0)) -			return ret; - -		timr->it_sigev_notify = SIGEV_SIGNAL; -		leader = timr->it_process->group_leader; -		put_task_struct(timr->it_process); -		timr->it_process = leader; -	} - -	return send_sigqueue(timr->sigq, timr->it_process, 1); +	shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); +	ret = send_sigqueue(timr->sigq, timr->it_process, shared); +	/* If we failed to send the signal the timer stops. 
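Annotation (not part of the patch): registering clock_monotonic_raw above makes the new clock id visible through the existing posix-timer syscalls, so userspace built with matching headers can read the raw, NTP-unadjusted monotonic time; clock_settime() on this id is refused via do_posix_clock_nosettime. A hedged userspace sketch (link with -lrt on older glibc):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* monotonic time without the NTP frequency/slew adjustments */
	if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts)) {
		perror("clock_gettime(CLOCK_MONOTONIC_RAW)");
		return 1;
	}
	printf("%ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}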
*/ +	return ret > 0;  }  EXPORT_SYMBOL_GPL(posix_timer_event); @@ -468,11 +469,9 @@ sys_timer_create(const clockid_t which_clock,  		 struct sigevent __user *timer_event_spec,  		 timer_t __user * created_timer_id)  { -	int error = 0; -	struct k_itimer *new_timer = NULL; -	int new_timer_id; -	struct task_struct *process = NULL; -	unsigned long flags; +	struct k_itimer *new_timer; +	int error, new_timer_id; +	struct task_struct *process;  	sigevent_t event;  	int it_id_set = IT_ID_NOT_SET; @@ -490,12 +489,11 @@ sys_timer_create(const clockid_t which_clock,  		goto out;  	}  	spin_lock_irq(&idr_lock); -	error = idr_get_new(&posix_timers_id, (void *) new_timer, -			    &new_timer_id); +	error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);  	spin_unlock_irq(&idr_lock); -	if (error == -EAGAIN) -		goto retry; -	else if (error) { +	if (error) { +		if (error == -EAGAIN) +			goto retry;  		/*  		 * Weird looking, but we return EAGAIN if the IDR is  		 * full (proper POSIX return value for this) @@ -526,67 +524,43 @@ sys_timer_create(const clockid_t which_clock,  			error = -EFAULT;  			goto out;  		} -		new_timer->it_sigev_notify = event.sigev_notify; -		new_timer->it_sigev_signo = event.sigev_signo; -		new_timer->it_sigev_value = event.sigev_value; - -		read_lock(&tasklist_lock); -		if ((process = good_sigevent(&event))) { -			/* -			 * We may be setting up this process for another -			 * thread.  It may be exiting.  To catch this -			 * case the we check the PF_EXITING flag.  If -			 * the flag is not set, the siglock will catch -			 * him before it is too late (in exit_itimers). -			 * -			 * The exec case is a bit more invloved but easy -			 * to code.  If the process is in our thread -			 * group (and it must be or we would not allow -			 * it here) and is doing an exec, it will cause -			 * us to be killed.  In this case it will wait -			 * for us to die which means we can finish this -			 * linkage with our last gasp. I.e. 
no code :) -			 */ -			spin_lock_irqsave(&process->sighand->siglock, flags); -			if (!(process->flags & PF_EXITING)) { -				new_timer->it_process = process; -				list_add(&new_timer->list, -					 &process->signal->posix_timers); -				if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) -					get_task_struct(process); -				spin_unlock_irqrestore(&process->sighand->siglock, flags); -			} else { -				spin_unlock_irqrestore(&process->sighand->siglock, flags); -				process = NULL; -			} -		} -		read_unlock(&tasklist_lock); +		rcu_read_lock(); +		process = good_sigevent(&event); +		if (process) +			get_task_struct(process); +		rcu_read_unlock();  		if (!process) {  			error = -EINVAL;  			goto out;  		}  	} else { -		new_timer->it_sigev_notify = SIGEV_SIGNAL; -		new_timer->it_sigev_signo = SIGALRM; -		new_timer->it_sigev_value.sival_int = new_timer->it_id; +		event.sigev_notify = SIGEV_SIGNAL; +		event.sigev_signo = SIGALRM; +		event.sigev_value.sival_int = new_timer->it_id;  		process = current->group_leader; -		spin_lock_irqsave(&process->sighand->siglock, flags); -		new_timer->it_process = process; -		list_add(&new_timer->list, &process->signal->posix_timers); -		spin_unlock_irqrestore(&process->sighand->siglock, flags); +		get_task_struct(process);  	} +	new_timer->it_sigev_notify     = event.sigev_notify; +	new_timer->sigq->info.si_signo = event.sigev_signo; +	new_timer->sigq->info.si_value = event.sigev_value; +	new_timer->sigq->info.si_tid   = new_timer->it_id; +	new_timer->sigq->info.si_code  = SI_TIMER; + +	spin_lock_irq(¤t->sighand->siglock); +	new_timer->it_process = process; +	list_add(&new_timer->list, ¤t->signal->posix_timers); +	spin_unlock_irq(¤t->sighand->siglock); + +	return 0;   	/*  	 * In the case of the timer belonging to another task, after  	 * the task is unlocked, the timer is owned by the other task  	 * and may cease to exist at any time.  Don't use or modify  	 * new_timer after the unlock call.  	 */ -  out: -	if (error) -		release_posix_timer(new_timer, it_id_set); - +	release_posix_timer(new_timer, it_id_set);  	return error;  } @@ -597,7 +571,7 @@ out:   * the find to the timer lock.  To avoid a dead lock, the timer id MUST   * be release with out holding the timer lock.   */ -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)  {  	struct k_itimer *timr;  	/* @@ -605,23 +579,20 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)  	 * flags part over to the timer lock.  Must not let interrupts in  	 * while we are moving the lock.  	 */ -  	spin_lock_irqsave(&idr_lock, *flags); -	timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); +	timr = idr_find(&posix_timers_id, (int)timer_id);  	if (timr) {  		spin_lock(&timr->it_lock); - -		if ((timr->it_id != timer_id) || !(timr->it_process) || -				!same_thread_group(timr->it_process, current)) { -			spin_unlock(&timr->it_lock); -			spin_unlock_irqrestore(&idr_lock, *flags); -			timr = NULL; -		} else +		if (timr->it_process && +		    same_thread_group(timr->it_process, current)) {  			spin_unlock(&idr_lock); -	} else -		spin_unlock_irqrestore(&idr_lock, *flags); +			return timr; +		} +		spin_unlock(&timr->it_lock); +	} +	spin_unlock_irqrestore(&idr_lock, *flags); -	return timr; +	return NULL;  }  /* @@ -860,8 +831,7 @@ retry_delete:  	 * This keeps any tasks waiting on the spin lock from thinking  	 * they got something (see the lock code above).  	 
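The rewritten sys_timer_create() above now looks up the target task under RCU and always pins it with get_task_struct(). A small userspace sketch that exercises the common SIGEV_SIGNAL path (link with -lrt on older glibc); it is illustrative, not taken from this series.

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t fired;

static void on_tick(int sig)
{
	(void)sig;
	fired = 1;
}

int main(void)
{
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGRTMIN,
	};
	struct itimerspec its = {
		.it_value    = { .tv_sec = 1 },
		.it_interval = { .tv_sec = 1 },
	};
	timer_t id;

	signal(SIGRTMIN, on_tick);
	if (timer_create(CLOCK_MONOTONIC, &sev, &id) ||
	    timer_settime(id, 0, &its, NULL))
		return 1;
	while (!fired)
		pause();
	printf("timer fired\n");
	return timer_delete(id);
}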
*/ -	if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) -		put_task_struct(timer->it_process); +	put_task_struct(timer->it_process);  	timer->it_process = NULL;  	unlock_timer(timer, flags); @@ -888,8 +858,7 @@ retry_delete:  	 * This keeps any tasks waiting on the spin lock from thinking  	 * they got something (see the lock code above).  	 */ -	if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) -		put_task_struct(timer->it_process); +	put_task_struct(timer->it_process);  	timer->it_process = NULL;  	unlock_timer(timer, flags); diff --git a/kernel/power/process.c b/kernel/power/process.c index 278946aecaf..ca634019497 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -28,121 +28,6 @@ static inline int freezeable(struct task_struct * p)  	return 1;  } -/* - * freezing is complete, mark current process as frozen - */ -static inline void frozen_process(void) -{ -	if (!unlikely(current->flags & PF_NOFREEZE)) { -		current->flags |= PF_FROZEN; -		wmb(); -	} -	clear_freeze_flag(current); -} - -/* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) -{ -	/* Hmm, should we be allowed to suspend when there are realtime -	   processes around? */ -	long save; - -	task_lock(current); -	if (freezing(current)) { -		frozen_process(); -		task_unlock(current); -	} else { -		task_unlock(current); -		return; -	} -	save = current->state; -	pr_debug("%s entered refrigerator\n", current->comm); - -	spin_lock_irq(¤t->sighand->siglock); -	recalc_sigpending(); /* We sent fake signal, clean it up */ -	spin_unlock_irq(¤t->sighand->siglock); - -	for (;;) { -		set_current_state(TASK_UNINTERRUPTIBLE); -		if (!frozen(current)) -			break; -		schedule(); -	} -	pr_debug("%s left refrigerator\n", current->comm); -	__set_current_state(save); -} - -static void fake_signal_wake_up(struct task_struct *p) -{ -	unsigned long flags; - -	spin_lock_irqsave(&p->sighand->siglock, flags); -	signal_wake_up(p, 0); -	spin_unlock_irqrestore(&p->sighand->siglock, flags); -} - -static inline bool should_send_signal(struct task_struct *p) -{ -	return !(p->flags & PF_FREEZER_NOSIG); -} - -/** - *	freeze_task - send a freeze request to given task - *	@p: task to send the request to - *	@sig_only: if set, the request will only be sent if the task has the - *		PF_FREEZER_NOSIG flag unset - *	Return value: 'false', if @sig_only is set and the task has - *		PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise - * - *	The freeze request is sent by setting the tasks's TIF_FREEZE flag and - *	either sending a fake signal to it or waking it up, depending on whether - *	or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task - *	has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its - *	TIF_FREEZE flag will not be set. - */ -static bool freeze_task(struct task_struct *p, bool sig_only) -{ -	/* -	 * We first check if the task is freezing and next if it has already -	 * been frozen to avoid the race with frozen_process() which first marks -	 * the task as frozen and next clears its TIF_FREEZE. 
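The refrigerator and freeze_task() machinery is removed from this PM-specific file here. For context, a typical freezable kernel thread loop looks roughly like the sketch below; the worker function is made up, only set_freezable()/try_to_freeze() are existing freezer API.

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int my_worker(void *unused)
{
	set_freezable();	/* clear PF_NOFREEZE so the freezer may stop us */

	while (!kthread_should_stop()) {
		try_to_freeze();	/* parks us in the refrigerator if freezing() */
		/* ... do one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}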
-	 */ -	if (!freezing(p)) { -		rmb(); -		if (frozen(p)) -			return false; - -		if (!sig_only || should_send_signal(p)) -			set_freeze_flag(p); -		else -			return false; -	} - -	if (should_send_signal(p)) { -		if (!signal_pending(p)) -			fake_signal_wake_up(p); -	} else if (sig_only) { -		return false; -	} else { -		wake_up_state(p, TASK_INTERRUPTIBLE); -	} - -	return true; -} - -static void cancel_freezing(struct task_struct *p) -{ -	unsigned long flags; - -	if (freezing(p)) { -		pr_debug("  clean up: %s\n", p->comm); -		clear_freeze_flag(p); -		spin_lock_irqsave(&p->sighand->siglock, flags); -		recalc_sigpending_and_wake(p); -		spin_unlock_irqrestore(&p->sighand->siglock, flags); -	} -} -  static int try_to_freeze_tasks(bool sig_only)  {  	struct task_struct *g, *p; @@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only)  		if (nosig_only && should_send_signal(p))  			continue; +		if (cgroup_frozen(p)) +			continue; +  		thaw_process(p);  	} while_each_thread(g, p);  	read_unlock(&tasklist_lock); @@ -264,4 +152,3 @@ void thaw_processes(void)  	printk("done.\n");  } -EXPORT_SYMBOL(refrigerator); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 356699a96d5..1e68e4c39e2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -45,7 +45,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)   * TASK_TRACED, resume it now.   * Requires that irqs be disabled.   */ -void ptrace_untrace(struct task_struct *child) +static void ptrace_untrace(struct task_struct *child)  {  	spin_lock(&child->sighand->siglock);  	if (task_is_traced(child)) { diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index ca4bbbe04aa..59236e8b9da 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -54,9 +54,9 @@  #include <linux/cpu.h>  #include <linux/random.h>  #include <linux/delay.h> -#include <linux/byteorder/swabb.h>  #include <linux/cpumask.h>  #include <linux/rcupreempt_trace.h> +#include <asm/byteorder.h>  /*   * PREEMPT_RCU data structures. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 90b5b123f7a..85cb90588a5 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -42,10 +42,10 @@  #include <linux/freezer.h>  #include <linux/cpu.h>  #include <linux/delay.h> -#include <linux/byteorder/swabb.h>  #include <linux/stat.h>  #include <linux/srcu.h>  #include <linux/slab.h> +#include <asm/byteorder.h>  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " diff --git a/kernel/sched.c b/kernel/sched.c index eb3c7295361..bfa87918380 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@  #include <linux/debugfs.h>  #include <linux/ctype.h>  #include <linux/ftrace.h> +#include <trace/sched.h>  #include <asm/tlb.h>  #include <asm/irq_regs.h> @@ -1935,6 +1936,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)  		 * just go back and repeat.  		 
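sched.c now pulls in <trace/sched.h> and fires static tracepoints instead of trace_mark(). A hedged sketch of how a tracer could attach to one of them: register_trace_sched_switch() and its unregister counterpart are generated by the tracepoint declaration, and the probe body here is purely illustrative.

#include <linux/module.h>
#include <linux/rcupdate.h>
#include <trace/sched.h>

struct rq;	/* scheduler-private; probes only pass the pointer through */

static void probe_sched_switch(struct rq *rq, struct task_struct *prev,
			       struct task_struct *next)
{
	pr_debug("switch %d -> %d\n", prev->pid, next->pid);
}

static int __init my_probe_init(void)
{
	return register_trace_sched_switch(probe_sched_switch);
}

static void __exit my_probe_exit(void)
{
	unregister_trace_sched_switch(probe_sched_switch);
	synchronize_sched();	/* make sure no probe is still running */
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");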
*/  		rq = task_rq_lock(p, &flags); +		trace_sched_wait_task(rq, p);  		running = task_running(rq, p);  		on_rq = p->se.on_rq;  		ncsw = 0; @@ -2296,9 +2298,7 @@ out_activate:  	success = 1;  out_running: -	trace_mark(kernel_sched_wakeup, -		"pid %d state %ld ## rq %p task %p rq->curr %p", -		p->pid, p->state, rq, p, rq->curr); +	trace_sched_wakeup(rq, p);  	check_preempt_curr(rq, p, sync);  	p->state = TASK_RUNNING; @@ -2431,9 +2431,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)  		p->sched_class->task_new(rq, p);  		inc_nr_running(rq);  	} -	trace_mark(kernel_sched_wakeup_new, -		"pid %d state %ld ## rq %p task %p rq->curr %p", -		p->pid, p->state, rq, p, rq->curr); +	trace_sched_wakeup_new(rq, p);  	check_preempt_curr(rq, p, 0);  #ifdef CONFIG_SMP  	if (p->sched_class->task_wake_up) @@ -2606,11 +2604,7 @@ context_switch(struct rq *rq, struct task_struct *prev,  	struct mm_struct *mm, *oldmm;  	prepare_task_switch(rq, prev, next); -	trace_mark(kernel_sched_schedule, -		"prev_pid %d next_pid %d prev_state %ld " -		"## rq %p prev %p next %p", -		prev->pid, next->pid, prev->state, -		rq, prev, next); +	trace_sched_switch(rq, prev, next);  	mm = next->mm;  	oldmm = prev->active_mm;  	/* @@ -2850,6 +2844,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)  	    || unlikely(!cpu_active(dest_cpu)))  		goto out; +	trace_sched_migrate_task(rq, p, dest_cpu);  	/* force the process onto the specified CPU */  	if (migrate_task(p, dest_cpu, &req)) {  		/* Need to wait for migration thread (might exit: take ref). */ @@ -4051,23 +4046,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);  EXPORT_PER_CPU_SYMBOL(kstat);  /* - * Return p->sum_exec_runtime plus any more ns on the sched_clock - * that have not yet been banked in case the task is currently running. + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running.   */ -unsigned long long task_sched_runtime(struct task_struct *p) +unsigned long long task_delta_exec(struct task_struct *p)  {  	unsigned long flags; -	u64 ns, delta_exec;  	struct rq *rq; +	u64 ns = 0;  	rq = task_rq_lock(p, &flags); -	ns = p->se.sum_exec_runtime; +  	if (task_current(rq, p)) { +		u64 delta_exec; +  		update_rq_clock(rq);  		delta_exec = rq->clock - p->se.exec_start;  		if ((s64)delta_exec > 0) -			ns += delta_exec; +			ns = delta_exec;  	} +  	task_rq_unlock(rq, &flags);  	return ns; @@ -4084,6 +4082,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)  	cputime64_t tmp;  	p->utime = cputime_add(p->utime, cputime); +	account_group_user_time(p, cputime);  	/* Add user time to cpustat. */  	tmp = cputime_to_cputime64(cputime); @@ -4108,6 +4107,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)  	tmp = cputime_to_cputime64(cputime);  	p->utime = cputime_add(p->utime, cputime); +	account_group_user_time(p, cputime);  	p->gtime = cputime_add(p->gtime, cputime);  	cpustat->user = cputime64_add(cpustat->user, tmp); @@ -4143,6 +4143,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,  	}  	p->stime = cputime_add(p->stime, cputime); +	account_group_system_time(p, cputime);  	/* Add system time to cpustat. 
*/  	tmp = cputime_to_cputime64(cputime); @@ -4184,6 +4185,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)  	if (p == rq->idle) {  		p->stime = cputime_add(p->stime, steal); +		account_group_system_time(p, steal);  		if (atomic_read(&rq->nr_iowait) > 0)  			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);  		else diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 18fd17172eb..f604dae7131 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -449,6 +449,7 @@ static void update_curr(struct cfs_rq *cfs_rq)  		struct task_struct *curtask = task_of(curr);  		cpuacct_charge(curtask, delta_exec); +		account_group_exec_runtime(curtask, delta_exec);  	}  } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index cdf5740ab03..b446dc87494 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -526,6 +526,8 @@ static void update_curr_rt(struct rq *rq)  	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));  	curr->se.sum_exec_runtime += delta_exec; +	account_group_exec_runtime(curr, delta_exec); +  	curr->se.exec_start = rq->clock;  	cpuacct_charge(curr, delta_exec); @@ -1458,7 +1460,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)  		p->rt.timeout++;  		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);  		if (p->rt.timeout > next) -			p->it_sched_expires = p->se.sum_exec_runtime; +			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;  	}  } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8385d43987e..b8c156979cf 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -270,3 +270,89 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)  #define sched_info_switch(t, next)		do { } while (0)  #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick.  None of this depends + * on CONFIG_SCHEDSTATS. + */ + +/** + * account_group_user_time - Maintain utime for a thread group. + * + * @tsk:	Pointer to task structure. + * @cputime:	Time value by which to increment the utime field of the + *		thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void account_group_user_time(struct task_struct *tsk, +					   cputime_t cputime) +{ +	struct signal_struct *sig; + +	sig = tsk->signal; +	if (unlikely(!sig)) +		return; +	if (sig->cputime.totals) { +		struct task_cputime *times; + +		times = per_cpu_ptr(sig->cputime.totals, get_cpu()); +		times->utime = cputime_add(times->utime, cputime); +		put_cpu_no_resched(); +	} +} + +/** + * account_group_system_time - Maintain stime for a thread group. + * + * @tsk:	Pointer to task structure. + * @cputime:	Time value by which to increment the stime field of the + *		thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. 
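These helpers only write the per-CPU totals; readers such as thread_group_cputime(), used further down in signal.c and sys.c, have to sum them. A sketch of that reader side, assuming sig->cputime.totals is the same per-CPU array the writers update (the in-tree helper also falls back to per-thread times when totals is NULL, which is skipped here).

#include <linux/percpu.h>
#include <linux/sched.h>

static void thread_group_cputime_sketch(struct task_struct *tsk,
					struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	int cpu;

	times->utime = times->stime = cputime_zero;
	times->sum_exec_runtime = 0;

	if (!sig || !sig->cputime.totals)
		return;		/* accounting not active for this group */

	for_each_possible_cpu(cpu) {
		struct task_cputime *tot = per_cpu_ptr(sig->cputime.totals, cpu);

		times->utime = cputime_add(times->utime, tot->utime);
		times->stime = cputime_add(times->stime, tot->stime);
		times->sum_exec_runtime += tot->sum_exec_runtime;
	}
}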
+ */ +static inline void account_group_system_time(struct task_struct *tsk, +					     cputime_t cputime) +{ +	struct signal_struct *sig; + +	sig = tsk->signal; +	if (unlikely(!sig)) +		return; +	if (sig->cputime.totals) { +		struct task_cputime *times; + +		times = per_cpu_ptr(sig->cputime.totals, get_cpu()); +		times->stime = cputime_add(times->stime, cputime); +		put_cpu_no_resched(); +	} +} + +/** + * account_group_exec_runtime - Maintain exec runtime for a thread group. + * + * @tsk:	Pointer to task structure. + * @ns:		Time value by which to increment the sum_exec_runtime field + *		of the thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. + */ +static inline void account_group_exec_runtime(struct task_struct *tsk, +					      unsigned long long ns) +{ +	struct signal_struct *sig; + +	sig = tsk->signal; +	if (unlikely(!sig)) +		return; +	if (sig->cputime.totals) { +		struct task_cputime *times; + +		times = per_cpu_ptr(sig->cputime.totals, get_cpu()); +		times->sum_exec_runtime += ns; +		put_cpu_no_resched(); +	} +} diff --git a/kernel/signal.c b/kernel/signal.c index e661b01d340..105217da5c8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,6 +27,7 @@  #include <linux/freezer.h>  #include <linux/pid_namespace.h>  #include <linux/nsproxy.h> +#include <trace/sched.h>  #include <asm/param.h>  #include <asm/uaccess.h> @@ -803,6 +804,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,  	struct sigpending *pending;  	struct sigqueue *q; +	trace_sched_signal_send(sig, t); +  	assert_spin_locked(&t->sighand->siglock);  	if (!prepare_signal(sig, t))  		return 0; @@ -1338,6 +1341,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)  	struct siginfo info;  	unsigned long flags;  	struct sighand_struct *psig; +	struct task_cputime cputime;  	int ret = sig;  	BUG_ON(sig == -1); @@ -1368,10 +1372,9 @@ int do_notify_parent(struct task_struct *tsk, int sig)  	info.si_uid = tsk->uid; -	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, -						       tsk->signal->utime)); -	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, -						       tsk->signal->stime)); +	thread_group_cputime(tsk, &cputime); +	info.si_utime = cputime_to_jiffies(cputime.utime); +	info.si_stime = cputime_to_jiffies(cputime.stime);  	info.si_status = tsk->exit_code & 0x7f;  	if (tsk->exit_code & 0x80) diff --git a/kernel/softirq.c b/kernel/softirq.c index 37d67aa2d56..7110daeb9a9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,6 +6,8 @@   *	Distribute under GPLv2.   *   *	Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + *	Remote softirq infrastructure is by Jens Axboe.   
*/  #include <linux/module.h> @@ -265,16 +267,12 @@ asmlinkage void do_softirq(void)   */  void irq_enter(void)  { -#ifdef CONFIG_NO_HZ  	int cpu = smp_processor_id(); +  	if (idle_cpu(cpu) && !in_interrupt()) -		tick_nohz_stop_idle(cpu); -#endif +		tick_check_idle(cpu); +  	__irq_enter(); -#ifdef CONFIG_NO_HZ -	if (idle_cpu(cpu)) -		tick_nohz_update_jiffies(); -#endif  }  #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED @@ -474,17 +472,144 @@ void tasklet_kill(struct tasklet_struct *t)  EXPORT_SYMBOL(tasklet_kill); +DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +EXPORT_PER_CPU_SYMBOL(softirq_work_list); + +static void __local_trigger(struct call_single_data *cp, int softirq) +{ +	struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); + +	list_add_tail(&cp->list, head); + +	/* Trigger the softirq only if the list was previously empty.  */ +	if (head->next == &cp->list) +		raise_softirq_irqoff(softirq); +} + +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +static void remote_softirq_receive(void *data) +{ +	struct call_single_data *cp = data; +	unsigned long flags; +	int softirq; + +	softirq = cp->priv; + +	local_irq_save(flags); +	__local_trigger(cp, softirq); +	local_irq_restore(flags); +} + +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ +	if (cpu_online(cpu)) { +		cp->func = remote_softirq_receive; +		cp->info = cp; +		cp->flags = 0; +		cp->priv = softirq; + +		__smp_call_function_single(cpu, cp); +		return 0; +	} +	return 1; +} +#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ +	return 1; +} +#endif + +/** + * __send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @this_cpu: the currently executing cpu + * @softirq: the softirq for the work + * + * Attempt to schedule softirq work on a remote cpu.  If this cannot be + * done, the work is instead queued up on the local cpu. + * + * Interrupts must be disabled. + */ +void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +{ +	if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) +		__local_trigger(cp, softirq); +} +EXPORT_SYMBOL(__send_remote_softirq); + +/** + * send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @softirq: the softirq for the work + * + * Like __send_remote_softirq except that disabling interrupts and + * computing the current cpu is done for the caller. 
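A hypothetical consumer of this interface: struct my_req and my_complete() are made-up names, and MY_SOFTIRQ stands in for a real entry below NR_SOFTIRQS whose handler was installed with open_softirq(). The submitting side hands a request back to the CPU that issued it; the handler splices this CPU's softirq_work_list and walks it.

#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/smp.h>

#define MY_SOFTIRQ BLOCK_SOFTIRQ	/* stand-in; a real user adds its own entry */

struct my_req {
	struct call_single_data csd;	/* must stay valid until completion runs */
	/* ... driver-private data ... */
};

static void my_complete(struct my_req *req)
{
	kfree(req);		/* stand-in for the real completion work */
}

/* completion path, e.g. from an interrupt handler on a remote CPU */
static void my_raise_completion(struct my_req *req, int submit_cpu)
{
	send_remote_softirq(&req->csd, submit_cpu, MY_SOFTIRQ);
}

/* registered with open_softirq(MY_SOFTIRQ, my_softirq_action) at init time */
static void my_softirq_action(struct softirq_action *h)
{
	struct list_head *head = &__get_cpu_var(softirq_work_list[MY_SOFTIRQ]);
	struct list_head local;

	local_irq_disable();
	list_replace_init(head, &local);
	local_irq_enable();

	while (!list_empty(&local)) {
		struct my_req *req =
			list_entry(local.next, struct my_req, csd.list);

		list_del(&req->csd.list);
		my_complete(req);
	}
}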
+ */ +void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ +	unsigned long flags; +	int this_cpu; + +	local_irq_save(flags); +	this_cpu = smp_processor_id(); +	__send_remote_softirq(cp, cpu, this_cpu, softirq); +	local_irq_restore(flags); +} +EXPORT_SYMBOL(send_remote_softirq); + +static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, +					       unsigned long action, void *hcpu) +{ +	/* +	 * If a CPU goes away, splice its entries to the current CPU +	 * and trigger a run of the softirq +	 */ +	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { +		int cpu = (unsigned long) hcpu; +		int i; + +		local_irq_disable(); +		for (i = 0; i < NR_SOFTIRQS; i++) { +			struct list_head *head = &per_cpu(softirq_work_list[i], cpu); +			struct list_head *local_head; + +			if (list_empty(head)) +				continue; + +			local_head = &__get_cpu_var(softirq_work_list[i]); +			list_splice_init(head, local_head); +			raise_softirq_irqoff(i); +		} +		local_irq_enable(); +	} + +	return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { +	.notifier_call	= remote_softirq_cpu_notify, +}; +  void __init softirq_init(void)  {  	int cpu;  	for_each_possible_cpu(cpu) { +		int i; +  		per_cpu(tasklet_vec, cpu).tail =  			&per_cpu(tasklet_vec, cpu).head;  		per_cpu(tasklet_hi_vec, cpu).tail =  			&per_cpu(tasklet_hi_vec, cpu).head; +		for (i = 0; i < NR_SOFTIRQS; i++) +			INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));  	} +	register_hotcpu_notifier(&remote_softirq_cpu_notifier); +  	open_softirq(TASKLET_SOFTIRQ, tasklet_action);  	open_softirq(HI_SOFTIRQ, tasklet_hi_action);  } diff --git a/kernel/sys.c b/kernel/sys.c index fc71f99fb46..31deba8f7d1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -853,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid)  	return old_fsgid;  } +void do_sys_times(struct tms *tms) +{ +	struct task_cputime cputime; +	cputime_t cutime, cstime; + +	spin_lock_irq(¤t->sighand->siglock); +	thread_group_cputime(current, &cputime); +	cutime = current->signal->cutime; +	cstime = current->signal->cstime; +	spin_unlock_irq(¤t->sighand->siglock); +	tms->tms_utime = cputime_to_clock_t(cputime.utime); +	tms->tms_stime = cputime_to_clock_t(cputime.stime); +	tms->tms_cutime = cputime_to_clock_t(cutime); +	tms->tms_cstime = cputime_to_clock_t(cstime); +} +  asmlinkage long sys_times(struct tms __user * tbuf)  { -	/* -	 *	In the SMP world we might just be unlucky and have one of -	 *	the times increment as we use it. Since the value is an -	 *	atomically safe type this is just fine. Conceptually its -	 *	as if the syscall took an instant longer to occur. 
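With do_sys_times(), the utime/stime reported by times(2) covers the whole thread group via thread_group_cputime(). A quick userspace check:

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

int main(void)
{
	struct tms t;
	long hz = sysconf(_SC_CLK_TCK);

	if (times(&t) == (clock_t)-1)
		return 1;

	printf("utime %.2fs stime %.2fs (all threads in the group)\n",
	       (double)t.tms_utime / hz, (double)t.tms_stime / hz);
	return 0;
}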
-	 */  	if (tbuf) {  		struct tms tmp; -		struct task_struct *tsk = current; -		struct task_struct *t; -		cputime_t utime, stime, cutime, cstime; - -		spin_lock_irq(&tsk->sighand->siglock); -		utime = tsk->signal->utime; -		stime = tsk->signal->stime; -		t = tsk; -		do { -			utime = cputime_add(utime, t->utime); -			stime = cputime_add(stime, t->stime); -			t = next_thread(t); -		} while (t != tsk); - -		cutime = tsk->signal->cutime; -		cstime = tsk->signal->cstime; -		spin_unlock_irq(&tsk->sighand->siglock); -		tmp.tms_utime = cputime_to_clock_t(utime); -		tmp.tms_stime = cputime_to_clock_t(stime); -		tmp.tms_cutime = cputime_to_clock_t(cutime); -		tmp.tms_cstime = cputime_to_clock_t(cstime); +		do_sys_times(&tmp);  		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))  			return -EFAULT;  	} @@ -1449,7 +1439,6 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r  asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)  {  	struct rlimit new_rlim, *old_rlim; -	unsigned long it_prof_secs;  	int retval;  	if (resource >= RLIM_NLIMITS) @@ -1503,18 +1492,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)  	if (new_rlim.rlim_cur == RLIM_INFINITY)  		goto out; -	it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); -	if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { -		unsigned long rlim_cur = new_rlim.rlim_cur; -		cputime_t cputime; - -		cputime = secs_to_cputime(rlim_cur); -		read_lock(&tasklist_lock); -		spin_lock_irq(¤t->sighand->siglock); -		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); -		spin_unlock_irq(¤t->sighand->siglock); -		read_unlock(&tasklist_lock); -	} +	update_rlimit_cpu(new_rlim.rlim_cur);  out:  	return 0;  } @@ -1552,11 +1530,8 @@ out:   *   */ -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, -				     cputime_t *utimep, cputime_t *stimep) +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)  { -	*utimep = cputime_add(*utimep, t->utime); -	*stimep = cputime_add(*stimep, t->stime);  	r->ru_nvcsw += t->nvcsw;  	r->ru_nivcsw += t->nivcsw;  	r->ru_minflt += t->min_flt; @@ -1570,12 +1545,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)  	struct task_struct *t;  	unsigned long flags;  	cputime_t utime, stime; +	struct task_cputime cputime;  	memset((char *) r, 0, sizeof *r);  	utime = stime = cputime_zero;  	if (who == RUSAGE_THREAD) { -		accumulate_thread_rusage(p, r, &utime, &stime); +		accumulate_thread_rusage(p, r);  		goto out;  	} @@ -1598,8 +1574,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)  				break;  		case RUSAGE_SELF: -			utime = cputime_add(utime, p->signal->utime); -			stime = cputime_add(stime, p->signal->stime); +			thread_group_cputime(p, &cputime); +			utime = cputime_add(utime, cputime.utime); +			stime = cputime_add(stime, cputime.stime);  			r->ru_nvcsw += p->signal->nvcsw;  			r->ru_nivcsw += p->signal->nivcsw;  			r->ru_minflt += p->signal->min_flt; @@ -1608,7 +1585,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)  			r->ru_oublock += p->signal->oublock;  			t = p;  			do { -				accumulate_thread_rusage(t, r, &utime, &stime); +				accumulate_thread_rusage(t, r);  				t = next_thread(t);  			} while (t != p);  			break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 617d41e4d6a..b3cc73931d1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -833,6 +833,16 @@ static struct 
ctl_table kern_table[] = {  		.proc_handler   = &proc_dointvec,  	},  #endif +#ifdef CONFIG_UNEVICTABLE_LRU +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "scan_unevictable_pages", +		.data		= &scan_unevictable_pages, +		.maxlen		= sizeof(scan_unevictable_pages), +		.mode		= 0644, +		.proc_handler	= &scan_unevictable_handler, +	}, +#endif  /*   * NOTE: do not add new entries to this table unless you have read   * Documentation/sysctl/ctl_unnumbered.txt diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 093d4acf993..9ed2eec9752 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c)  	unsigned long flags;  	int ret; +	/* save mult_orig on registration */ +	c->mult_orig = c->mult; +  	spin_lock_irqsave(&clocksource_lock, flags);  	ret = clocksource_enqueue(c);  	if (!ret) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 4c256fdb887..1ca99557e92 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = {  	.read		= jiffies_read,  	.mask		= 0xffffffff, /*32bits*/  	.mult		= NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ +	.mult_orig	= NSEC_PER_JIFFY << JIFFIES_SHIFT,  	.shift		= JIFFIES_SHIFT,  }; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 9c114b726ab..8ff15e5d486 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,13 +10,13 @@  #include <linux/mm.h>  #include <linux/time.h> -#include <linux/timer.h>  #include <linux/timex.h>  #include <linux/jiffies.h>  #include <linux/hrtimer.h>  #include <linux/capability.h>  #include <linux/math64.h>  #include <linux/clocksource.h> +#include <linux/workqueue.h>  #include <asm/timex.h>  /* @@ -217,11 +217,11 @@ void second_overflow(void)  /* Disable the cmos update - used by virtualization and embedded */  int no_sync_cmos_clock  __read_mostly; -static void sync_cmos_clock(unsigned long dummy); +static void sync_cmos_clock(struct work_struct *work); -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); -static void sync_cmos_clock(unsigned long dummy) +static void sync_cmos_clock(struct work_struct *work)  {  	struct timespec now, next;  	int fail = 1; @@ -257,13 +257,13 @@ static void sync_cmos_clock(unsigned long dummy)  		next.tv_sec++;  		next.tv_nsec -= NSEC_PER_SEC;  	} -	mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); +	schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));  }  static void notify_cmos_timer(void)  {  	if (!no_sync_cmos_clock) -		mod_timer(&sync_cmos_timer, jiffies + 1); +		schedule_delayed_work(&sync_cmos_work, 0);  }  #else @@ -276,38 +276,50 @@ static inline void notify_cmos_timer(void) { }  int do_adjtimex(struct timex *txc)  {  	struct timespec ts; -	long save_adjust, sec;  	int result; -	/* In order to modify anything, you gotta be super-user! 
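The reworked permission check below still lets an unprivileged caller query the clock state, since modes == 0 sets no ADJ_* bits. A minimal read-only query from userspace:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* query only, no CAP_SYS_TIME needed */
	int state = adjtimex(&tx);

	if (state == -1)
		return 1;

	/* offset is in usecs (nsecs if STA_NANO), freq in 2^-16 ppm */
	printf("state %d offset %ld freq %ld status 0x%x\n",
	       state, tx.offset, tx.freq, tx.status);
	return 0;
}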
*/ -	if (txc->modes && !capable(CAP_SYS_TIME)) -		return -EPERM; - -	/* Now we validate the data before disabling interrupts */ - -	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { +	/* Validate the data before disabling interrupts */ +	if (txc->modes & ADJ_ADJTIME) {  		/* singleshot must not be used with any other mode bits */ -		if (txc->modes & ~ADJ_OFFSET_SS_READ) +		if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))  			return -EINVAL; -	} +		if (!(txc->modes & ADJ_OFFSET_READONLY) && +		    !capable(CAP_SYS_TIME)) +			return -EPERM; +	} else { +		/* In order to modify anything, you gotta be super-user! */ +		 if (txc->modes && !capable(CAP_SYS_TIME)) +			return -EPERM; -	/* if the quartz is off by more than 10% something is VERY wrong ! */ -	if (txc->modes & ADJ_TICK) -		if (txc->tick <  900000/USER_HZ || -		    txc->tick > 1100000/USER_HZ) -			return -EINVAL; +		/* if the quartz is off by more than 10% something is VERY wrong! */ +		if (txc->modes & ADJ_TICK && +		    (txc->tick <  900000/USER_HZ || +		     txc->tick > 1100000/USER_HZ)) +				return -EINVAL; + +		if (txc->modes & ADJ_STATUS && time_state != TIME_OK) +			hrtimer_cancel(&leap_timer); +	} -	if (time_state != TIME_OK && txc->modes & ADJ_STATUS) -		hrtimer_cancel(&leap_timer);  	getnstimeofday(&ts);  	write_seqlock_irq(&xtime_lock); -	/* Save for later - semantics of adjtime is to return old value */ -	save_adjust = time_adjust; -  	/* If there are input parameters, then process them */ +	if (txc->modes & ADJ_ADJTIME) { +		long save_adjust = time_adjust; + +		if (!(txc->modes & ADJ_OFFSET_READONLY)) { +			/* adjtime() is independent from ntp_adjtime() */ +			time_adjust = txc->offset; +			ntp_update_frequency(); +		} +		txc->offset = save_adjust; +		goto adj_done; +	}  	if (txc->modes) { +		long sec; +  		if (txc->modes & ADJ_STATUS) {  			if ((time_status & STA_PLL) &&  			    !(txc->status & STA_PLL)) { @@ -374,13 +386,8 @@ int do_adjtimex(struct timex *txc)  		if (txc->modes & ADJ_TAI && txc->constant > 0)  			time_tai = txc->constant; -		if (txc->modes & ADJ_OFFSET) { -			if (txc->modes == ADJ_OFFSET_SINGLESHOT) -				/* adjtime() is independent from ntp_adjtime() */ -				time_adjust = txc->offset; -			else -				ntp_update_offset(txc->offset); -		} +		if (txc->modes & ADJ_OFFSET) +			ntp_update_offset(txc->offset);  		if (txc->modes & ADJ_TICK)  			tick_usec = txc->tick; @@ -388,22 +395,18 @@ int do_adjtimex(struct timex *txc)  			ntp_update_frequency();  	} +	txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, +				  NTP_SCALE_SHIFT); +	if (!(time_status & STA_NANO)) +		txc->offset /= NSEC_PER_USEC; + +adj_done:  	result = time_state;	/* mostly `TIME_OK' */  	if (time_status & (STA_UNSYNC|STA_CLOCKERR))  		result = TIME_ERROR; -	if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || -	    (txc->modes == ADJ_OFFSET_SS_READ)) -		txc->offset = save_adjust; -	else { -		txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, -					  NTP_SCALE_SHIFT); -		if (!(time_status & STA_NANO)) -			txc->offset /= NSEC_PER_USEC; -	} -	txc->freq	   = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * -					 (s64)PPM_SCALE_INV, -					 NTP_SCALE_SHIFT); +	txc->freq	   = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * +					 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);  	txc->maxerror	   = time_maxerror;  	txc->esterror	   = time_esterror;  	txc->status	   = time_status; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index cb01cd8f919..f98a1b7b16e 100644 --- a/kernel/time/tick-broadcast.c +++ 
b/kernel/time/tick-broadcast.c @@ -384,6 +384,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)  }  /* + * Called from irq_enter() when idle was interrupted to reenable the + * per cpu device. + */ +void tick_check_oneshot_broadcast(int cpu) +{ +	if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { +		struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + +		clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); +	} +} + +/*   * Handle oneshot mode broadcasting   */  static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 469248782c2..b1c05bf75ee 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -36,6 +36,7 @@ extern void tick_broadcast_switch_to_oneshot(void);  extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);  extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);  extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast(int cpu);  # else /* BROADCAST */  static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)  { @@ -45,6 +46,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { }  static inline void tick_broadcast_switch_to_oneshot(void) { }  static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }  static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast(int cpu) { }  # endif /* !BROADCAST */  #else /* !ONESHOT */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a547be11cf9..5bbb1044f84 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -155,7 +155,7 @@ void tick_nohz_update_jiffies(void)  	touch_softlockup_watchdog();  } -void tick_nohz_stop_idle(int cpu) +static void tick_nohz_stop_idle(int cpu)  {  	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -377,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void)  	return ts->sleep_length;  } +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ +	hrtimer_cancel(&ts->sched_timer); +	hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); + +	while (1) { +		/* Forward the time to expire in the future */ +		hrtimer_forward(&ts->sched_timer, now, tick_period); + +		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { +			hrtimer_start_expires(&ts->sched_timer, +				      HRTIMER_MODE_ABS); +			/* Check, if the timer was already in the past */ +			if (hrtimer_active(&ts->sched_timer)) +				break; +		} else { +			if (!tick_program_event( +				hrtimer_get_expires(&ts->sched_timer), 0)) +				break; +		} +		/* Update jiffies and reread time */ +		tick_do_update_jiffies64(now); +		now = ktime_get(); +	} +} +  /**   * tick_nohz_restart_sched_tick - restart the idle tick from the idle task   * @@ -430,28 +456,9 @@ void tick_nohz_restart_sched_tick(void)  	 */  	ts->tick_stopped  = 0;  	ts->idle_exittime = now; -	hrtimer_cancel(&ts->sched_timer); -	hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); -	while (1) { -		/* Forward the time to expire in the future */ -		hrtimer_forward(&ts->sched_timer, now, tick_period); +	tick_nohz_restart(ts, now); -		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { -			hrtimer_start_expires(&ts->sched_timer, -				      HRTIMER_MODE_ABS); -			/* Check, if the timer was already in the past */ -			if (hrtimer_active(&ts->sched_timer)) -				break; -		} else { -			if (!tick_program_event( -				hrtimer_get_expires(&ts->sched_timer), 0)) -				
break; -		} -		/* Update jiffies and reread time */ -		tick_do_update_jiffies64(now); -		now = ktime_get(); -	}  	local_irq_enable();  } @@ -503,10 +510,6 @@ static void tick_nohz_handler(struct clock_event_device *dev)  	update_process_times(user_mode(regs));  	profile_tick(CPU_PROFILING); -	/* Do not restart, when we are in the idle loop */ -	if (ts->tick_stopped) -		return; -  	while (tick_nohz_reprogram(ts, now)) {  		now = ktime_get();  		tick_do_update_jiffies64(now); @@ -552,6 +555,37 @@ static void tick_nohz_switch_to_nohz(void)  	       smp_processor_id());  } +/* + * When NOHZ is enabled and the tick is stopped, we need to kick the + * tick timer from irq_enter() so that the jiffies update is kept + * alive during long running softirqs. That's ugly as hell, but + * correctness is key even if we need to fix the offending softirq in + * the first place. + * + * Note, this is different to tick_nohz_restart. We just kick the + * timer and do not touch the other magic bits which need to be done + * when idle is left. + */ +static void tick_nohz_kick_tick(int cpu) +{ +	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +	ktime_t delta, now; + +	if (!ts->tick_stopped) +		return; + +	/* +	 * Do not touch the tick device, when the next expiry is either +	 * already reached or less/equal than the tick period. +	 */ +	now = ktime_get(); +	delta =	ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); +	if (delta.tv64 <= tick_period.tv64) +		return; + +	tick_nohz_restart(ts, now); +} +  #else  static inline void tick_nohz_switch_to_nohz(void) { } @@ -559,6 +593,19 @@ static inline void tick_nohz_switch_to_nohz(void) { }  #endif /* NO_HZ */  /* + * Called from irq_enter to notify about the possible interruption of idle() + */ +void tick_check_idle(int cpu) +{ +	tick_check_oneshot_broadcast(cpu); +#ifdef CONFIG_NO_HZ +	tick_nohz_stop_idle(cpu); +	tick_nohz_update_jiffies(); +	tick_nohz_kick_tick(cpu); +#endif +} + +/*   * High resolution timer specific code   */  #ifdef CONFIG_HIGH_RES_TIMERS @@ -611,10 +658,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)  		profile_tick(CPU_PROFILING);  	} -	/* Do not restart, when we are in the idle loop */ -	if (ts->tick_stopped) -		return HRTIMER_NORESTART; -  	hrtimer_forward(timer, now, tick_period);  	return HRTIMER_RESTART; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e91c29f961c..e7acfb482a6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,27 +58,26 @@ struct clocksource *clock;  #ifdef CONFIG_GENERIC_TIME  /** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * clocksource_forward_now - update clock to the current time   * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly.   
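clocksource_forward_now() below, like getnstimeofday(), converts a cycle delta with cyc2ns(); that helper is not part of this hunk, but it presumably reduces to the usual mult/shift scaling, the same convention the raw path below applies with mult_orig. A sketch under that assumption:

#include <linux/clocksource.h>

static inline s64 cyc2ns_sketch(struct clocksource *cs, cycle_t cycles)
{
	u64 ret = (u64)cycles;

	ret = (ret * cs->mult) >> cs->shift;	/* mult is the NTP-adjusted factor */
	return ret;
}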
*/ -static inline s64 __get_nsec_offset(void) +static void clocksource_forward_now(void)  {  	cycle_t cycle_now, cycle_delta; -	s64 ns_offset; +	s64 nsec; -	/* read clocksource: */  	cycle_now = clocksource_read(clock); - -	/* calculate the delta since the last update_wall_time: */  	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; +	clock->cycle_last = cycle_now; -	/* convert to nanoseconds: */ -	ns_offset = cyc2ns(clock, cycle_delta); +	nsec = cyc2ns(clock, cycle_delta); +	timespec_add_ns(&xtime, nsec); -	return ns_offset; +	nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; +	clock->raw_time.tv_nsec += nsec;  }  /** @@ -89,6 +88,7 @@ static inline s64 __get_nsec_offset(void)   */  void getnstimeofday(struct timespec *ts)  { +	cycle_t cycle_now, cycle_delta;  	unsigned long seq;  	s64 nsecs; @@ -96,7 +96,15 @@ void getnstimeofday(struct timespec *ts)  		seq = read_seqbegin(&xtime_lock);  		*ts = xtime; -		nsecs = __get_nsec_offset(); + +		/* read clocksource: */ +		cycle_now = clocksource_read(clock); + +		/* calculate the delta since the last update_wall_time: */ +		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + +		/* convert to nanoseconds: */ +		nsecs = cyc2ns(clock, cycle_delta);  	} while (read_seqretry(&xtime_lock, seq)); @@ -129,22 +137,22 @@ EXPORT_SYMBOL(do_gettimeofday);   */  int do_settimeofday(struct timespec *tv)  { +	struct timespec ts_delta;  	unsigned long flags; -	time_t wtm_sec, sec = tv->tv_sec; -	long wtm_nsec, nsec = tv->tv_nsec;  	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)  		return -EINVAL;  	write_seqlock_irqsave(&xtime_lock, flags); -	nsec -= __get_nsec_offset(); +	clocksource_forward_now(); + +	ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; +	ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; +	wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); -	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); -	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); +	xtime = *tv; -	set_normalized_timespec(&xtime, sec, nsec); -	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);  	update_xtime_cache(0);  	clock->error = 0; @@ -170,22 +178,19 @@ EXPORT_SYMBOL(do_settimeofday);  static void change_clocksource(void)  {  	struct clocksource *new; -	cycle_t now; -	u64 nsec;  	new = clocksource_get_next();  	if (clock == new)  		return; -	new->cycle_last = 0; -	now = clocksource_read(new); -	nsec =  __get_nsec_offset(); -	timespec_add_ns(&xtime, nsec); +	clocksource_forward_now(); -	clock = new; -	clock->cycle_last = now; +	new->raw_time = clock->raw_time; +	clock = new; +	clock->cycle_last = 0; +	clock->cycle_last = clocksource_read(new);  	clock->error = 0;  	clock->xtime_nsec = 0;  	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -200,11 +205,44 @@ static void change_clocksource(void)  	 */  }  #else +static inline void clocksource_forward_now(void) { }  static inline void change_clocksource(void) { } -static inline s64 __get_nsec_offset(void) { return 0; }  #endif  /** + * getrawmonotonic - Returns the raw monotonic time in a timespec + * @ts:		pointer to the timespec to be set + * + * Returns the raw monotonic time (completely un-modified by ntp) + */ +void getrawmonotonic(struct timespec *ts) +{ +	unsigned long seq; +	s64 nsecs; +	cycle_t cycle_now, cycle_delta; + +	do { +		seq = read_seqbegin(&xtime_lock); + +		/* read clocksource: */ +		cycle_now = clocksource_read(clock); + +		/* calculate the delta since the last update_wall_time: */ +		cycle_delta = (cycle_now - 
clock->cycle_last) & clock->mask; + +		/* convert to nanoseconds: */ +		nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + +		*ts = clock->raw_time; + +	} while (read_seqretry(&xtime_lock, seq)); + +	timespec_add_ns(ts, nsecs); +} +EXPORT_SYMBOL(getrawmonotonic); + + +/**   * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres   */  int timekeeping_valid_for_hres(void) @@ -265,8 +303,6 @@ void __init timekeeping_init(void)  static int timekeeping_suspended;  /* time in seconds when suspend began */  static unsigned long timekeeping_suspend_time; -/* xtime offset when we went into suspend */ -static s64 timekeeping_suspend_nsecs;  /**   * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -292,8 +328,6 @@ static int timekeeping_resume(struct sys_device *dev)  		wall_to_monotonic.tv_sec -= sleep_length;  		total_sleep_time += sleep_length;  	} -	/* Make sure that we have the correct xtime reference */ -	timespec_add_ns(&xtime, timekeeping_suspend_nsecs);  	update_xtime_cache(0);  	/* re-base the last cycle value */  	clock->cycle_last = 0; @@ -319,8 +353,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)  	timekeeping_suspend_time = read_persistent_clock();  	write_seqlock_irqsave(&xtime_lock, flags); -	/* Get the current xtime offset */ -	timekeeping_suspend_nsecs = __get_nsec_offset(); +	clocksource_forward_now();  	timekeeping_suspended = 1;  	write_sequnlock_irqrestore(&xtime_lock, flags); @@ -454,23 +487,29 @@ void update_wall_time(void)  #else  	offset = clock->cycle_interval;  #endif -	clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; +	clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;  	/* normally this loop will run just once, however in the  	 * case of lost or late ticks, it will accumulate correctly.  	 */  	while (offset >= clock->cycle_interval) {  		/* accumulate one interval */ -		clock->xtime_nsec += clock->xtime_interval; -		clock->cycle_last += clock->cycle_interval;  		offset -= clock->cycle_interval; +		clock->cycle_last += clock->cycle_interval; +		clock->xtime_nsec += clock->xtime_interval;  		if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {  			clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;  			xtime.tv_sec++;  			second_overflow();  		} +		clock->raw_time.tv_nsec += clock->raw_interval; +		if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { +			clock->raw_time.tv_nsec -= NSEC_PER_SEC; +			clock->raw_time.tv_sec++; +		} +  		/* accumulate error between NTP and clock interval */  		clock->error += tick_length;  		clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); @@ -479,9 +518,12 @@ void update_wall_time(void)  	/* correct the clock when NTP error is too big */  	clocksource_adjust(offset); -	/* store full nanoseconds into xtime */ -	xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; +	/* store full nanoseconds into xtime after rounding it up and +	 * add the remainder to the error difference. 
+	 */ +	xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;  	clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; +	clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);  	update_xtime_cache(cyc2ns(clock, offset)); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 122ee751d2d..a999b92a127 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym)  }  static void -print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, +	    int idx, u64 now)  {  #ifdef CONFIG_TIMER_STATS  	char tmp[TASK_COMM_LEN + 1];  #endif  	SEQ_printf(m, " #%d: ", idx); -	print_name_offset(m, timer); +	print_name_offset(m, taddr);  	SEQ_printf(m, ", ");  	print_name_offset(m, timer->function);  	SEQ_printf(m, ", S:%02lx", timer->state); @@ -101,7 +102,7 @@ next_one:  		tmp = *timer;  		spin_unlock_irqrestore(&base->cpu_base->lock, flags); -		print_timer(m, &tmp, i, now); +		print_timer(m, timer, &tmp, i, now);  		next++;  		goto next_one;  	} @@ -111,6 +112,7 @@ next_one:  static void  print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)  { +	SEQ_printf(m, "  .base:       %p\n", base);  	SEQ_printf(m, "  .index:      %d\n",  			base->index);  	SEQ_printf(m, "  .resolution: %Lu nsecs\n", @@ -185,12 +187,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)  #ifdef CONFIG_GENERIC_CLOCKEVENTS  static void -print_tickdevice(struct seq_file *m, struct tick_device *td) +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)  {  	struct clock_event_device *dev = td->evtdev;  	SEQ_printf(m, "\n");  	SEQ_printf(m, "Tick Device: mode:     %d\n", td->mode); +	if (cpu < 0) +		SEQ_printf(m, "Broadcast device\n"); +	else +		SEQ_printf(m, "Per CPU device: %d\n", cpu);  	SEQ_printf(m, "Clock Event Device: ");  	if (!dev) { @@ -224,7 +230,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)  	int cpu;  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST -	print_tickdevice(m, tick_get_broadcast_device()); +	print_tickdevice(m, tick_get_broadcast_device(), -1);  	SEQ_printf(m, "tick_broadcast_mask: %08lx\n",  		   tick_get_broadcast_mask()->bits[0]);  #ifdef CONFIG_TICK_ONESHOT @@ -234,7 +240,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)  	SEQ_printf(m, "\n");  #endif  	for_each_online_cpu(cpu) -		   print_tickdevice(m, tick_get_device(cpu)); +		print_tickdevice(m, tick_get_device(cpu), cpu);  	SEQ_printf(m, "\n");  }  #else @@ -246,7 +252,7 @@ static int timer_list_show(struct seq_file *m, void *v)  	u64 now = ktime_to_ns(ktime_get());  	int cpu; -	SEQ_printf(m, "Timer List Version: v0.3\n"); +	SEQ_printf(m, "Timer List Version: v0.4\n");  	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);  	SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/timer.c b/kernel/timer.c index 510fe69351c..56becf373c5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1436,9 +1436,11 @@ static void __cpuinit migrate_timers(int cpu)  	BUG_ON(cpu_online(cpu));  	old_base = per_cpu(tvec_bases, cpu);  	new_base = get_cpu_var(tvec_bases); - -	local_irq_disable(); -	spin_lock(&new_base->lock); +	/* +	 * The caller is globally serialized and nobody else +	 * takes two locks at once, deadlock is not possible. 
+	 */ +	spin_lock_irq(&new_base->lock);  	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);  	BUG_ON(old_base->running_timer); @@ -1453,8 +1455,7 @@ static void __cpuinit migrate_timers(int cpu)  	}  	spin_unlock(&old_base->lock); -	spin_unlock(&new_base->lock); -	local_irq_enable(); +	spin_unlock_irq(&new_base->lock);  	put_cpu_var(tvec_bases);  }  #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 263e9e6bbd6..1cb3e1f616a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,23 +1,37 @@  #  # Architectures that offer an FTRACE implementation should select HAVE_FTRACE:  # + +config NOP_TRACER +	bool +  config HAVE_FTRACE  	bool +	select NOP_TRACER  config HAVE_DYNAMIC_FTRACE  	bool +config HAVE_FTRACE_MCOUNT_RECORD +	bool +  config TRACER_MAX_TRACE  	bool +config RING_BUFFER +	bool +  config TRACING  	bool  	select DEBUG_FS +	select RING_BUFFER  	select STACKTRACE +	select TRACEPOINTS  config FTRACE  	bool "Kernel Function Tracer"  	depends on HAVE_FTRACE +	depends on DEBUG_KERNEL  	select FRAME_POINTER  	select TRACING  	select CONTEXT_SWITCH_TRACER @@ -36,6 +50,7 @@ config IRQSOFF_TRACER  	depends on TRACE_IRQFLAGS_SUPPORT  	depends on GENERIC_TIME  	depends on HAVE_FTRACE +	depends on DEBUG_KERNEL  	select TRACE_IRQFLAGS  	select TRACING  	select TRACER_MAX_TRACE @@ -59,6 +74,7 @@ config PREEMPT_TRACER  	depends on GENERIC_TIME  	depends on PREEMPT  	depends on HAVE_FTRACE +	depends on DEBUG_KERNEL  	select TRACING  	select TRACER_MAX_TRACE  	help @@ -86,6 +102,7 @@ config SYSPROF_TRACER  config SCHED_TRACER  	bool "Scheduling Latency Tracer"  	depends on HAVE_FTRACE +	depends on DEBUG_KERNEL  	select TRACING  	select CONTEXT_SWITCH_TRACER  	select TRACER_MAX_TRACE @@ -96,16 +113,56 @@ config SCHED_TRACER  config CONTEXT_SWITCH_TRACER  	bool "Trace process context switches"  	depends on HAVE_FTRACE +	depends on DEBUG_KERNEL  	select TRACING  	select MARKERS  	help  	  This tracer gets called from the context switch and records  	  all switching of tasks. +config BOOT_TRACER +	bool "Trace boot initcalls" +	depends on HAVE_FTRACE +	depends on DEBUG_KERNEL +	select TRACING +	help +	  This tracer helps developers to optimize boot times: it records +	  the timings of the initcalls and traces key events and the identity +	  of tasks that can cause boot delays, such as context-switches. + +	  Its aim is to be parsed by the /scripts/bootgraph.pl tool to +	  produce pretty graphics about boot inefficiencies, giving a visual +	  representation of the delays during initcalls - but the raw +	  /debug/tracing/trace text output is readable too. + +	  ( Note that tracing self tests can't be enabled if this tracer is +	    selected, because the self-tests are an initcall as well and that +	    would invalidate the boot trace. ) + +config STACK_TRACER +	bool "Trace max stack" +	depends on HAVE_FTRACE +	depends on DEBUG_KERNEL +	select FTRACE +	select STACKTRACE +	help +	  This special tracer records the maximum stack footprint of the +	  kernel and displays it in debugfs/tracing/stack_trace. + +	  This tracer works by hooking into every function call that the +	  kernel executes, and keeping a maximum stack depth value and +	  stack-trace saved. Because this logic has to execute in every +	  kernel function, all the time, this option can slow down the +	  kernel measurably and is generally intended for kernel +	  developers only. + +	  Say N if unsure. 
+  config DYNAMIC_FTRACE  	bool "enable/disable ftrace tracepoints dynamically"  	depends on FTRACE  	depends on HAVE_DYNAMIC_FTRACE +	depends on DEBUG_KERNEL  	default y  	help           This option will modify all the calls to ftrace dynamically @@ -121,12 +178,17 @@ config DYNAMIC_FTRACE  	 were made. If so, it runs stop_machine (stops all CPUS)  	 and modifies the code to jump over the call to ftrace. +config FTRACE_MCOUNT_RECORD +	def_bool y +	depends on DYNAMIC_FTRACE +	depends on HAVE_FTRACE_MCOUNT_RECORD +  config FTRACE_SELFTEST  	bool  config FTRACE_STARTUP_TEST  	bool "Perform a startup test on ftrace" -	depends on TRACING +	depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER  	select FTRACE_SELFTEST  	help  	  This option performs a series of startup tests on ftrace. On bootup diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 71d17de1728..a85dfba88ba 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o  endif  obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_RING_BUFFER) += ring_buffer.o  obj-$(CONFIG_TRACING) += trace.o  obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o @@ -19,6 +20,9 @@ obj-$(CONFIG_FTRACE) += trace_functions.o  obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_NOP_TRACER) += trace_nop.o +obj-$(CONFIG_STACK_TRACER) += trace_stack.o  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o +obj-$(CONFIG_BOOT_TRACER) += trace_boot.o  libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f6e3af31b40..4dda4f60a2a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -81,7 +81,7 @@ void clear_ftrace_function(void)  static int __register_ftrace_function(struct ftrace_ops *ops)  { -	/* Should never be called by interrupts */ +	/* should not be called from interrupt context */  	spin_lock(&ftrace_lock);  	ops->next = ftrace_list; @@ -115,6 +115,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)  	struct ftrace_ops **p;  	int ret = 0; +	/* should not be called from interrupt context */  	spin_lock(&ftrace_lock);  	/* @@ -153,6 +154,30 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)  #ifdef CONFIG_DYNAMIC_FTRACE +#ifndef CONFIG_FTRACE_MCOUNT_RECORD +/* + * The hash lock is only needed when the recording of the mcount + * callers are dynamic. That is, by the caller themselves and + * not recorded via the compilation. + */ +static DEFINE_SPINLOCK(ftrace_hash_lock); +#define ftrace_hash_lock(flags)	  spin_lock_irqsave(&ftrace_hash_lock, flags) +#define ftrace_hash_unlock(flags) \ +			spin_unlock_irqrestore(&ftrace_hash_lock, flags) +#else +/* This is protected via the ftrace_lock with MCOUNT_RECORD. */ +#define ftrace_hash_lock(flags)   do { (void)(flags); } while (0) +#define ftrace_hash_unlock(flags) do { } while(0) +#endif + +/* + * Since MCOUNT_ADDR may point to mcount itself, we do not want + * to get it confused by reading a reference in the code as we + * are parsing on objcopy output of text. Use a variable for + * it instead. 
+ */ +static unsigned long mcount_addr = MCOUNT_ADDR; +  static struct task_struct *ftraced_task;  enum { @@ -171,7 +196,6 @@ static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];  static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); -static DEFINE_SPINLOCK(ftrace_shutdown_lock);  static DEFINE_MUTEX(ftraced_lock);  static DEFINE_MUTEX(ftrace_regex_lock); @@ -294,13 +318,37 @@ static inline void ftrace_del_hash(struct dyn_ftrace *node)  static void ftrace_free_rec(struct dyn_ftrace *rec)  { -	/* no locking, only called from kstop_machine */ -  	rec->ip = (unsigned long)ftrace_free_records;  	ftrace_free_records = rec;  	rec->flags |= FTRACE_FL_FREE;  } +void ftrace_release(void *start, unsigned long size) +{ +	struct dyn_ftrace *rec; +	struct ftrace_page *pg; +	unsigned long s = (unsigned long)start; +	unsigned long e = s + size; +	int i; + +	if (ftrace_disabled || !start) +		return; + +	/* should not be called from interrupt context */ +	spin_lock(&ftrace_lock); + +	for (pg = ftrace_pages_start; pg; pg = pg->next) { +		for (i = 0; i < pg->index; i++) { +			rec = &pg->records[i]; + +			if ((rec->ip >= s) && (rec->ip < e)) +				ftrace_free_rec(rec); +		} +	} +	spin_unlock(&ftrace_lock); + +} +  static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)  {  	struct dyn_ftrace *rec; @@ -338,7 +386,6 @@ ftrace_record_ip(unsigned long ip)  	unsigned long flags;  	unsigned long key;  	int resched; -	int atomic;  	int cpu;  	if (!ftrace_enabled || ftrace_disabled) @@ -368,9 +415,7 @@ ftrace_record_ip(unsigned long ip)  	if (ftrace_ip_in_hash(ip, key))  		goto out; -	atomic = irqs_disabled(); - -	spin_lock_irqsave(&ftrace_shutdown_lock, flags); +	ftrace_hash_lock(flags);  	/* This ip may have hit the hash before the lock */  	if (ftrace_ip_in_hash(ip, key)) @@ -387,7 +432,7 @@ ftrace_record_ip(unsigned long ip)  	ftraced_trigger = 1;   out_unlock: -	spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); +	ftrace_hash_unlock(flags);   out:  	per_cpu(ftrace_shutdown_disable_cpu, cpu)--; @@ -531,6 +576,16 @@ static void ftrace_shutdown_replenish(void)  	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);  } +static void print_ip_ins(const char *fmt, unsigned char *p) +{ +	int i; + +	printk(KERN_CONT "%s", fmt); + +	for (i = 0; i < MCOUNT_INSN_SIZE; i++) +		printk(KERN_CONT "%s%02x", i ? 
":" : "", p[i]); +} +  static int  ftrace_code_disable(struct dyn_ftrace *rec)  { @@ -541,10 +596,27 @@ ftrace_code_disable(struct dyn_ftrace *rec)  	ip = rec->ip;  	nop = ftrace_nop_replace(); -	call = ftrace_call_replace(ip, MCOUNT_ADDR); +	call = ftrace_call_replace(ip, mcount_addr);  	failed = ftrace_modify_code(ip, call, nop);  	if (failed) { +		switch (failed) { +		case 1: +			WARN_ON_ONCE(1); +			pr_info("ftrace faulted on modifying "); +			print_ip_sym(ip); +			break; +		case 2: +			WARN_ON_ONCE(1); +			pr_info("ftrace failed to modify "); +			print_ip_sym(ip); +			print_ip_ins(" expected: ", call); +			print_ip_ins(" actual: ", (unsigned char *)ip); +			print_ip_ins(" replace: ", nop); +			printk(KERN_CONT "\n"); +			break; +		} +  		rec->flags |= FTRACE_FL_FAILED;  		return 0;  	} @@ -792,47 +864,7 @@ static int ftrace_update_code(void)  	return 1;  } -static int ftraced(void *ignore) -{ -	unsigned long usecs; - -	while (!kthread_should_stop()) { - -		set_current_state(TASK_INTERRUPTIBLE); - -		/* check once a second */ -		schedule_timeout(HZ); - -		if (unlikely(ftrace_disabled)) -			continue; - -		mutex_lock(&ftrace_sysctl_lock); -		mutex_lock(&ftraced_lock); -		if (!ftraced_suspend && !ftraced_stop && -		    ftrace_update_code()) { -			usecs = nsecs_to_usecs(ftrace_update_time); -			if (ftrace_update_tot_cnt > 100000) { -				ftrace_update_tot_cnt = 0; -				pr_info("hm, dftrace overflow: %lu change%s" -					" (%lu total) in %lu usec%s\n", -					ftrace_update_cnt, -					ftrace_update_cnt != 1 ? "s" : "", -					ftrace_update_tot_cnt, -					usecs, usecs != 1 ? "s" : ""); -				ftrace_disabled = 1; -				WARN_ON_ONCE(1); -			} -		} -		mutex_unlock(&ftraced_lock); -		mutex_unlock(&ftrace_sysctl_lock); - -		ftrace_shutdown_replenish(); -	} -	__set_current_state(TASK_RUNNING); -	return 0; -} - -static int __init ftrace_dyn_table_alloc(void) +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)  {  	struct ftrace_page *pg;  	int cnt; @@ -859,7 +891,9 @@ static int __init ftrace_dyn_table_alloc(void)  	pg = ftrace_pages = ftrace_pages_start; -	cnt = NR_TO_INIT / ENTRIES_PER_PAGE; +	cnt = num_to_init / ENTRIES_PER_PAGE; +	pr_info("ftrace: allocating %ld hash entries in %d pages\n", +		num_to_init, cnt);  	for (i = 0; i < cnt; i++) {  		pg->next = (void *)get_zeroed_page(GFP_KERNEL); @@ -901,6 +935,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)  	(*pos)++; +	/* should not be called from interrupt context */ +	spin_lock(&ftrace_lock);   retry:  	if (iter->idx >= iter->pg->index) {  		if (iter->pg->next) { @@ -910,15 +946,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)  		}  	} else {  		rec = &iter->pg->records[iter->idx++]; -		if ((!(iter->flags & FTRACE_ITER_FAILURES) && +		if ((rec->flags & FTRACE_FL_FREE) || + +		    (!(iter->flags & FTRACE_ITER_FAILURES) &&  		     (rec->flags & FTRACE_FL_FAILED)) ||  		    ((iter->flags & FTRACE_ITER_FAILURES) && -		     (!(rec->flags & FTRACE_FL_FAILED) || -		      (rec->flags & FTRACE_FL_FREE))) || - -		    ((iter->flags & FTRACE_ITER_FILTER) && -		     !(rec->flags & FTRACE_FL_FILTER)) || +		     !(rec->flags & FTRACE_FL_FAILED)) ||  		    ((iter->flags & FTRACE_ITER_NOTRACE) &&  		     !(rec->flags & FTRACE_FL_NOTRACE))) { @@ -926,6 +960,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)  			goto retry;  		}  	} +	spin_unlock(&ftrace_lock);  	iter->pos = *pos; @@ -1039,8 +1074,8 @@ static void ftrace_filter_reset(int enable)  	unsigned long type = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;  	unsigned i; -	/* keep kstop machine from running */ -	preempt_disable(); +	/* should not be called from interrupt context */ +	spin_lock(&ftrace_lock);  	if (enable)  		ftrace_filtered = 0;  	pg = ftrace_pages_start; @@ -1053,7 +1088,7 @@ static void ftrace_filter_reset(int enable)  		}  		pg = pg->next;  	} -	preempt_enable(); +	spin_unlock(&ftrace_lock);  }  static int @@ -1165,8 +1200,8 @@ ftrace_match(unsigned char *buff, int len, int enable)  		}  	} -	/* keep kstop machine from running */ -	preempt_disable(); +	/* should not be called from interrupt context */ +	spin_lock(&ftrace_lock);  	if (enable)  		ftrace_filtered = 1;  	pg = ftrace_pages_start; @@ -1203,7 +1238,7 @@ ftrace_match(unsigned char *buff, int len, int enable)  		}  		pg = pg->next;  	} -	preempt_enable(); +	spin_unlock(&ftrace_lock);  }  static ssize_t @@ -1556,6 +1591,114 @@ static __init int ftrace_init_debugfs(void)  fs_initcall(ftrace_init_debugfs); +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +static int ftrace_convert_nops(unsigned long *start, +			       unsigned long *end) +{ +	unsigned long *p; +	unsigned long addr; +	unsigned long flags; + +	p = start; +	while (p < end) { +		addr = ftrace_call_adjust(*p++); +		/* should not be called from interrupt context */ +		spin_lock(&ftrace_lock); +		ftrace_record_ip(addr); +		spin_unlock(&ftrace_lock); +		ftrace_shutdown_replenish(); +	} + +	/* p is ignored */ +	local_irq_save(flags); +	__ftrace_update_code(p); +	local_irq_restore(flags); + +	return 0; +} + +void ftrace_init_module(unsigned long *start, unsigned long *end) +{ +	if (ftrace_disabled || start == end) +		return; +	ftrace_convert_nops(start, end); +} + +extern unsigned long __start_mcount_loc[]; +extern unsigned long __stop_mcount_loc[]; + +void __init ftrace_init(void) +{ +	unsigned long count, addr, flags; +	int ret; + +	/* Keep the ftrace pointer to the stub */ +	addr = (unsigned long)ftrace_stub; + +	local_irq_save(flags); +	ftrace_dyn_arch_init(&addr); +	local_irq_restore(flags); + +	/* ftrace_dyn_arch_init places the return code in addr */ +	if (addr) +		goto failed; + +	count = __stop_mcount_loc - __start_mcount_loc; + +	ret = ftrace_dyn_table_alloc(count); +	if (ret) +		goto failed; + +	last_ftrace_enabled = ftrace_enabled = 1; + +	ret = ftrace_convert_nops(__start_mcount_loc, +				  __stop_mcount_loc); + +	return; + failed: +	ftrace_disabled = 1; +} +#else /* CONFIG_FTRACE_MCOUNT_RECORD */ +static int ftraced(void *ignore) +{ +	unsigned long usecs; + +	while (!kthread_should_stop()) { + +		set_current_state(TASK_INTERRUPTIBLE); + +		/* check once a second */ +		schedule_timeout(HZ); + +		if (unlikely(ftrace_disabled)) +			continue; + +		mutex_lock(&ftrace_sysctl_lock); +		mutex_lock(&ftraced_lock); +		if (!ftraced_suspend && !ftraced_stop && +		    ftrace_update_code()) { +			usecs = nsecs_to_usecs(ftrace_update_time); +			if (ftrace_update_tot_cnt > 100000) { +				ftrace_update_tot_cnt = 0; +				pr_info("hm, dftrace overflow: %lu change%s" +					" (%lu total) in %lu usec%s\n", +					ftrace_update_cnt, +					ftrace_update_cnt != 1 ? "s" : "", +					ftrace_update_tot_cnt, +					usecs, usecs != 1 ? 
"s" : ""); +				ftrace_disabled = 1; +				WARN_ON_ONCE(1); +			} +		} +		mutex_unlock(&ftraced_lock); +		mutex_unlock(&ftrace_sysctl_lock); + +		ftrace_shutdown_replenish(); +	} +	__set_current_state(TASK_RUNNING); +	return 0; +} +  static int __init ftrace_dynamic_init(void)  {  	struct task_struct *p; @@ -1572,7 +1715,7 @@ static int __init ftrace_dynamic_init(void)  		goto failed;  	} -	ret = ftrace_dyn_table_alloc(); +	ret = ftrace_dyn_table_alloc(NR_TO_INIT);  	if (ret)  		goto failed; @@ -1593,6 +1736,8 @@ static int __init ftrace_dynamic_init(void)  }  core_initcall(ftrace_dynamic_init); +#endif /* CONFIG_FTRACE_MCOUNT_RECORD */ +  #else  # define ftrace_startup()		do { } while (0)  # define ftrace_shutdown()		do { } while (0) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c new file mode 100644 index 00000000000..94af1fe56bb --- /dev/null +++ b/kernel/trace/ring_buffer.c @@ -0,0 +1,2014 @@ +/* + * Generic ring buffer + * + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/ring_buffer.h> +#include <linux/spinlock.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/mutex.h> +#include <linux/sched.h>	/* used for sched_clock() (for now) */ +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/list.h> +#include <linux/fs.h> + +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +/* FIXME!!! */ +u64 ring_buffer_time_stamp(int cpu) +{ +	/* shift to debug/test normalization and TIME_EXTENTS */ +	return sched_clock() << DEBUG_SHIFT; +} + +void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) +{ +	/* Just stupid testing the normalize function and deltas */ +	*ts >>= DEBUG_SHIFT; +} + +#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) +#define RB_ALIGNMENT_SHIFT	2 +#define RB_ALIGNMENT		(1 << RB_ALIGNMENT_SHIFT) +#define RB_MAX_SMALL_DATA	28 + +enum { +	RB_LEN_TIME_EXTEND = 8, +	RB_LEN_TIME_STAMP = 16, +}; + +/* inline for ring buffer fast paths */ +static inline unsigned +rb_event_length(struct ring_buffer_event *event) +{ +	unsigned length; + +	switch (event->type) { +	case RINGBUF_TYPE_PADDING: +		/* undefined */ +		return -1; + +	case RINGBUF_TYPE_TIME_EXTEND: +		return RB_LEN_TIME_EXTEND; + +	case RINGBUF_TYPE_TIME_STAMP: +		return RB_LEN_TIME_STAMP; + +	case RINGBUF_TYPE_DATA: +		if (event->len) +			length = event->len << RB_ALIGNMENT_SHIFT; +		else +			length = event->array[0]; +		return length + RB_EVNT_HDR_SIZE; +	default: +		BUG(); +	} +	/* not hit */ +	return 0; +} + +/** + * ring_buffer_event_length - return the length of the event + * @event: the event to get the length of + */ +unsigned ring_buffer_event_length(struct ring_buffer_event *event) +{ +	return rb_event_length(event); +} + +/* inline for ring buffer fast paths */ +static inline void * +rb_event_data(struct ring_buffer_event *event) +{ +	BUG_ON(event->type != RINGBUF_TYPE_DATA); +	/* If length is in len field, then array[0] has the data */ +	if (event->len) +		return (void *)&event->array[0]; +	/* Otherwise length is in array[0] and array[1] has the data */ +	return (void *)&event->array[1]; +} + +/** + * ring_buffer_event_data - return the data of the event + * @event: the event to get the data from + */ +void *ring_buffer_event_data(struct ring_buffer_event *event) +{ +	return rb_event_data(event); +} + +#define for_each_buffer_cpu(buffer, cpu)		\ +	for_each_cpu_mask(cpu, buffer->cpumask) + +#define TS_SHIFT	27 +#define 
TS_MASK		((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST	(~TS_MASK) + +/* + * This hack stolen from mm/slob.c. + * We can store per page timing information in the page frame of the page. + * Thanks to Peter Zijlstra for suggesting this idea. + */ +struct buffer_page { +	u64		 time_stamp;	/* page time stamp */ +	local_t		 write;		/* index for next write */ +	local_t		 commit;	/* write commited index */ +	unsigned	 read;		/* index for next read */ +	struct list_head list;		/* list of free pages */ +	void *page;			/* Actual data page */ +}; + +/* + * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing + * this issue out. + */ +static inline void free_buffer_page(struct buffer_page *bpage) +{ +	if (bpage->page) +		__free_page(bpage->page); +	kfree(bpage); +} + +/* + * We need to fit the time_stamp delta into 27 bits. + */ +static inline int test_time_stamp(u64 delta) +{ +	if (delta & TS_DELTA_TEST) +		return 1; +	return 0; +} + +#define BUF_PAGE_SIZE PAGE_SIZE + +/* + * head_page == tail_page && head == tail then buffer is empty. + */ +struct ring_buffer_per_cpu { +	int				cpu; +	struct ring_buffer		*buffer; +	spinlock_t			lock; +	struct lock_class_key		lock_key; +	struct list_head		pages; +	struct buffer_page		*head_page;	/* read from head */ +	struct buffer_page		*tail_page;	/* write to tail */ +	struct buffer_page		*commit_page;	/* commited pages */ +	struct buffer_page		*reader_page; +	unsigned long			overrun; +	unsigned long			entries; +	u64				write_stamp; +	u64				read_stamp; +	atomic_t			record_disabled; +}; + +struct ring_buffer { +	unsigned long			size; +	unsigned			pages; +	unsigned			flags; +	int				cpus; +	cpumask_t			cpumask; +	atomic_t			record_disabled; + +	struct mutex			mutex; + +	struct ring_buffer_per_cpu	**buffers; +}; + +struct ring_buffer_iter { +	struct ring_buffer_per_cpu	*cpu_buffer; +	unsigned long			head; +	struct buffer_page		*head_page; +	u64				read_stamp; +}; + +#define RB_WARN_ON(buffer, cond)				\ +	do {							\ +		if (unlikely(cond)) {				\ +			atomic_inc(&buffer->record_disabled);	\ +			WARN_ON(1);				\ +		}						\ +	} while (0) + +#define RB_WARN_ON_RET(buffer, cond)				\ +	do {							\ +		if (unlikely(cond)) {				\ +			atomic_inc(&buffer->record_disabled);	\ +			WARN_ON(1);				\ +			return -1;				\ +		}						\ +	} while (0) + +#define RB_WARN_ON_ONCE(buffer, cond)				\ +	do {							\ +		static int once;				\ +		if (unlikely(cond) && !once) {			\ +			once++;					\ +			atomic_inc(&buffer->record_disabled);	\ +			WARN_ON(1);				\ +		}						\ +	} while (0) + +/** + * check_pages - integrity check of buffer pages + * @cpu_buffer: CPU buffer with pages to test + * + * As a safty measure we check to make sure the data pages have not + * been corrupted. 
+ */ +static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ +	struct list_head *head = &cpu_buffer->pages; +	struct buffer_page *page, *tmp; + +	RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); +	RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); + +	list_for_each_entry_safe(page, tmp, head, list) { +		RB_WARN_ON_RET(cpu_buffer, +			       page->list.next->prev != &page->list); +		RB_WARN_ON_RET(cpu_buffer, +			       page->list.prev->next != &page->list); +	} + +	return 0; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, +			     unsigned nr_pages) +{ +	struct list_head *head = &cpu_buffer->pages; +	struct buffer_page *page, *tmp; +	unsigned long addr; +	LIST_HEAD(pages); +	unsigned i; + +	for (i = 0; i < nr_pages; i++) { +		page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), +				    GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); +		if (!page) +			goto free_pages; +		list_add(&page->list, &pages); + +		addr = __get_free_page(GFP_KERNEL); +		if (!addr) +			goto free_pages; +		page->page = (void *)addr; +	} + +	list_splice(&pages, head); + +	rb_check_pages(cpu_buffer); + +	return 0; + + free_pages: +	list_for_each_entry_safe(page, tmp, &pages, list) { +		list_del_init(&page->list); +		free_buffer_page(page); +	} +	return -ENOMEM; +} + +static struct ring_buffer_per_cpu * +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	struct buffer_page *page; +	unsigned long addr; +	int ret; + +	cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), +				  GFP_KERNEL, cpu_to_node(cpu)); +	if (!cpu_buffer) +		return NULL; + +	cpu_buffer->cpu = cpu; +	cpu_buffer->buffer = buffer; +	spin_lock_init(&cpu_buffer->lock); +	INIT_LIST_HEAD(&cpu_buffer->pages); + +	page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), +			    GFP_KERNEL, cpu_to_node(cpu)); +	if (!page) +		goto fail_free_buffer; + +	cpu_buffer->reader_page = page; +	addr = __get_free_page(GFP_KERNEL); +	if (!addr) +		goto fail_free_reader; +	page->page = (void *)addr; + +	INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + +	ret = rb_allocate_pages(cpu_buffer, buffer->pages); +	if (ret < 0) +		goto fail_free_reader; + +	cpu_buffer->head_page +		= list_entry(cpu_buffer->pages.next, struct buffer_page, list); +	cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + +	return cpu_buffer; + + fail_free_reader: +	free_buffer_page(cpu_buffer->reader_page); + + fail_free_buffer: +	kfree(cpu_buffer); +	return NULL; +} + +static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ +	struct list_head *head = &cpu_buffer->pages; +	struct buffer_page *page, *tmp; + +	list_del_init(&cpu_buffer->reader_page->list); +	free_buffer_page(cpu_buffer->reader_page); + +	list_for_each_entry_safe(page, tmp, head, list) { +		list_del_init(&page->list); +		free_buffer_page(page); +	} +	kfree(cpu_buffer); +} + +/* + * Causes compile errors if the struct buffer_page gets bigger + * than the struct page. + */ +extern int ring_buffer_page_too_big(void); + +/** + * ring_buffer_alloc - allocate a new ring_buffer + * @size: the size in bytes that is needed. + * @flags: attributes to set for the ring buffer. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. If this flag is not set, the buffer will + * drop data when the tail hits the head. 
+ */ +struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +{ +	struct ring_buffer *buffer; +	int bsize; +	int cpu; + +	/* Paranoid! Optimizes out when all is well */ +	if (sizeof(struct buffer_page) > sizeof(struct page)) +		ring_buffer_page_too_big(); + + +	/* keep it in its own cache line */ +	buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), +			 GFP_KERNEL); +	if (!buffer) +		return NULL; + +	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); +	buffer->flags = flags; + +	/* need at least two pages */ +	if (buffer->pages == 1) +		buffer->pages++; + +	buffer->cpumask = cpu_possible_map; +	buffer->cpus = nr_cpu_ids; + +	bsize = sizeof(void *) * nr_cpu_ids; +	buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), +				  GFP_KERNEL); +	if (!buffer->buffers) +		goto fail_free_buffer; + +	for_each_buffer_cpu(buffer, cpu) { +		buffer->buffers[cpu] = +			rb_allocate_cpu_buffer(buffer, cpu); +		if (!buffer->buffers[cpu]) +			goto fail_free_buffers; +	} + +	mutex_init(&buffer->mutex); + +	return buffer; + + fail_free_buffers: +	for_each_buffer_cpu(buffer, cpu) { +		if (buffer->buffers[cpu]) +			rb_free_cpu_buffer(buffer->buffers[cpu]); +	} +	kfree(buffer->buffers); + + fail_free_buffer: +	kfree(buffer); +	return NULL; +} + +/** + * ring_buffer_free - free a ring buffer. + * @buffer: the buffer to free. + */ +void +ring_buffer_free(struct ring_buffer *buffer) +{ +	int cpu; + +	for_each_buffer_cpu(buffer, cpu) +		rb_free_cpu_buffer(buffer->buffers[cpu]); + +	kfree(buffer); +} + +static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); + +static void +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) +{ +	struct buffer_page *page; +	struct list_head *p; +	unsigned i; + +	atomic_inc(&cpu_buffer->record_disabled); +	synchronize_sched(); + +	for (i = 0; i < nr_pages; i++) { +		BUG_ON(list_empty(&cpu_buffer->pages)); +		p = cpu_buffer->pages.next; +		page = list_entry(p, struct buffer_page, list); +		list_del_init(&page->list); +		free_buffer_page(page); +	} +	BUG_ON(list_empty(&cpu_buffer->pages)); + +	rb_reset_cpu(cpu_buffer); + +	rb_check_pages(cpu_buffer); + +	atomic_dec(&cpu_buffer->record_disabled); + +} + +static void +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, +		struct list_head *pages, unsigned nr_pages) +{ +	struct buffer_page *page; +	struct list_head *p; +	unsigned i; + +	atomic_inc(&cpu_buffer->record_disabled); +	synchronize_sched(); + +	for (i = 0; i < nr_pages; i++) { +		BUG_ON(list_empty(pages)); +		p = pages->next; +		page = list_entry(p, struct buffer_page, list); +		list_del_init(&page->list); +		list_add_tail(&page->list, &cpu_buffer->pages); +	} +	rb_reset_cpu(cpu_buffer); + +	rb_check_pages(cpu_buffer); + +	atomic_dec(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_resize - resize the ring buffer + * @buffer: the buffer to resize. + * @size: the new size. + * + * The tracer is responsible for making sure that the buffer is + * not being used while changing the size. + * Note: We may be able to change the above requirement by using + *  RCU synchronizations. + * + * Minimum size is 2 * BUF_PAGE_SIZE. + * + * Returns -1 on failure. 
+ */ +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	unsigned nr_pages, rm_pages, new_pages; +	struct buffer_page *page, *tmp; +	unsigned long buffer_size; +	unsigned long addr; +	LIST_HEAD(pages); +	int i, cpu; + +	size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); +	size *= BUF_PAGE_SIZE; +	buffer_size = buffer->pages * BUF_PAGE_SIZE; + +	/* we need a minimum of two pages */ +	if (size < BUF_PAGE_SIZE * 2) +		size = BUF_PAGE_SIZE * 2; + +	if (size == buffer_size) +		return size; + +	mutex_lock(&buffer->mutex); + +	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + +	if (size < buffer_size) { + +		/* easy case, just free pages */ +		BUG_ON(nr_pages >= buffer->pages); + +		rm_pages = buffer->pages - nr_pages; + +		for_each_buffer_cpu(buffer, cpu) { +			cpu_buffer = buffer->buffers[cpu]; +			rb_remove_pages(cpu_buffer, rm_pages); +		} +		goto out; +	} + +	/* +	 * This is a bit more difficult. We only want to add pages +	 * when we can allocate enough for all CPUs. We do this +	 * by allocating all the pages and storing them on a local +	 * link list. If we succeed in our allocation, then we +	 * add these pages to the cpu_buffers. Otherwise we just free +	 * them all and return -ENOMEM; +	 */ +	BUG_ON(nr_pages <= buffer->pages); +	new_pages = nr_pages - buffer->pages; + +	for_each_buffer_cpu(buffer, cpu) { +		for (i = 0; i < new_pages; i++) { +			page = kzalloc_node(ALIGN(sizeof(*page), +						  cache_line_size()), +					    GFP_KERNEL, cpu_to_node(cpu)); +			if (!page) +				goto free_pages; +			list_add(&page->list, &pages); +			addr = __get_free_page(GFP_KERNEL); +			if (!addr) +				goto free_pages; +			page->page = (void *)addr; +		} +	} + +	for_each_buffer_cpu(buffer, cpu) { +		cpu_buffer = buffer->buffers[cpu]; +		rb_insert_pages(cpu_buffer, &pages, new_pages); +	} + +	BUG_ON(!list_empty(&pages)); + + out: +	buffer->pages = nr_pages; +	mutex_unlock(&buffer->mutex); + +	return size; + + free_pages: +	list_for_each_entry_safe(page, tmp, &pages, list) { +		list_del_init(&page->list); +		free_buffer_page(page); +	} +	return -ENOMEM; +} + +static inline int rb_null_event(struct ring_buffer_event *event) +{ +	return event->type == RINGBUF_TYPE_PADDING; +} + +static inline void *__rb_page_index(struct buffer_page *page, unsigned index) +{ +	return page->page + index; +} + +static inline struct ring_buffer_event * +rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) +{ +	return __rb_page_index(cpu_buffer->reader_page, +			       cpu_buffer->reader_page->read); +} + +static inline struct ring_buffer_event * +rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) +{ +	return __rb_page_index(cpu_buffer->head_page, +			       cpu_buffer->head_page->read); +} + +static inline struct ring_buffer_event * +rb_iter_head_event(struct ring_buffer_iter *iter) +{ +	return __rb_page_index(iter->head_page, iter->head); +} + +static inline unsigned rb_page_write(struct buffer_page *bpage) +{ +	return local_read(&bpage->write); +} + +static inline unsigned rb_page_commit(struct buffer_page *bpage) +{ +	return local_read(&bpage->commit); +} + +/* Size is determined by what has been commited */ +static inline unsigned rb_page_size(struct buffer_page *bpage) +{ +	return rb_page_commit(bpage); +} + +static inline unsigned +rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) +{ +	return rb_page_commit(cpu_buffer->commit_page); +} + +static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) +{ +	return 
rb_page_commit(cpu_buffer->head_page); +} + +/* + * When the tail hits the head and the buffer is in overwrite mode, + * the head jumps to the next page and all content on the previous + * page is discarded. But before doing so, we update the overrun + * variable of the buffer. + */ +static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) +{ +	struct ring_buffer_event *event; +	unsigned long head; + +	for (head = 0; head < rb_head_size(cpu_buffer); +	     head += rb_event_length(event)) { + +		event = __rb_page_index(cpu_buffer->head_page, head); +		BUG_ON(rb_null_event(event)); +		/* Only count data entries */ +		if (event->type != RINGBUF_TYPE_DATA) +			continue; +		cpu_buffer->overrun++; +		cpu_buffer->entries--; +	} +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, +			       struct buffer_page **page) +{ +	struct list_head *p = (*page)->list.next; + +	if (p == &cpu_buffer->pages) +		p = p->next; + +	*page = list_entry(p, struct buffer_page, list); +} + +static inline unsigned +rb_event_index(struct ring_buffer_event *event) +{ +	unsigned long addr = (unsigned long)event; + +	return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); +} + +static inline int +rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, +	     struct ring_buffer_event *event) +{ +	unsigned long addr = (unsigned long)event; +	unsigned long index; + +	index = rb_event_index(event); +	addr &= PAGE_MASK; + +	return cpu_buffer->commit_page->page == (void *)addr && +		rb_commit_index(cpu_buffer) == index; +} + +static inline void +rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, +		    struct ring_buffer_event *event) +{ +	unsigned long addr = (unsigned long)event; +	unsigned long index; + +	index = rb_event_index(event); +	addr &= PAGE_MASK; + +	while (cpu_buffer->commit_page->page != (void *)addr) { +		RB_WARN_ON(cpu_buffer, +			   cpu_buffer->commit_page == cpu_buffer->tail_page); +		cpu_buffer->commit_page->commit = +			cpu_buffer->commit_page->write; +		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); +		cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; +	} + +	/* Now set the commit to the event's index */ +	local_set(&cpu_buffer->commit_page->commit, index); +} + +static inline void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ +	/* +	 * We only race with interrupts and NMIs on this CPU. +	 * If we own the commit event, then we can commit +	 * all others that interrupted us, since the interruptions +	 * are in stack format (they finish before they come +	 * back to us). This allows us to do a simple loop to +	 * assign the commit to the tail. 
+	 */ +	while (cpu_buffer->commit_page != cpu_buffer->tail_page) { +		cpu_buffer->commit_page->commit = +			cpu_buffer->commit_page->write; +		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); +		cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; +		/* add barrier to keep gcc from optimizing too much */ +		barrier(); +	} +	while (rb_commit_index(cpu_buffer) != +	       rb_page_write(cpu_buffer->commit_page)) { +		cpu_buffer->commit_page->commit = +			cpu_buffer->commit_page->write; +		barrier(); +	} +} + +static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ +	cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; +	cpu_buffer->reader_page->read = 0; +} + +static inline void rb_inc_iter(struct ring_buffer_iter *iter) +{ +	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + +	/* +	 * The iterator could be on the reader page (it starts there). +	 * But the head could have moved, since the reader was +	 * found. Check for this case and assign the iterator +	 * to the head page instead of next. +	 */ +	if (iter->head_page == cpu_buffer->reader_page) +		iter->head_page = cpu_buffer->head_page; +	else +		rb_inc_page(cpu_buffer, &iter->head_page); + +	iter->read_stamp = iter->head_page->time_stamp; +	iter->head = 0; +} + +/** + * ring_buffer_update_event - update event type and data + * @event: the even to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static inline void +rb_update_event(struct ring_buffer_event *event, +			 unsigned type, unsigned length) +{ +	event->type = type; + +	switch (type) { + +	case RINGBUF_TYPE_PADDING: +		break; + +	case RINGBUF_TYPE_TIME_EXTEND: +		event->len = +			(RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1)) +			>> RB_ALIGNMENT_SHIFT; +		break; + +	case RINGBUF_TYPE_TIME_STAMP: +		event->len = +			(RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1)) +			>> RB_ALIGNMENT_SHIFT; +		break; + +	case RINGBUF_TYPE_DATA: +		length -= RB_EVNT_HDR_SIZE; +		if (length > RB_MAX_SMALL_DATA) { +			event->len = 0; +			event->array[0] = length; +		} else +			event->len = +				(length + (RB_ALIGNMENT-1)) +				>> RB_ALIGNMENT_SHIFT; +		break; +	default: +		BUG(); +	} +} + +static inline unsigned rb_calculate_event_length(unsigned length) +{ +	struct ring_buffer_event event; /* Used only for sizeof array */ + +	/* zero length can cause confusions */ +	if (!length) +		length = 1; + +	if (length > RB_MAX_SMALL_DATA) +		length += sizeof(event.array[0]); + +	length += RB_EVNT_HDR_SIZE; +	length = ALIGN(length, RB_ALIGNMENT); + +	return length; +} + +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, +		  unsigned type, unsigned long length, u64 *ts) +{ +	struct buffer_page *tail_page, *head_page, *reader_page; +	unsigned long tail, write; +	struct ring_buffer *buffer = cpu_buffer->buffer; +	struct ring_buffer_event *event; +	unsigned long flags; + +	tail_page = cpu_buffer->tail_page; +	write = local_add_return(length, &tail_page->write); +	tail = write - length; + +	/* See if we shot pass the end of this buffer page */ +	if (write > BUF_PAGE_SIZE) { +		struct buffer_page *next_page = tail_page; + +		spin_lock_irqsave(&cpu_buffer->lock, flags); + +		rb_inc_page(cpu_buffer, &next_page); + +		head_page = cpu_buffer->head_page; +		reader_page = cpu_buffer->reader_page; + 
+		/* we grabbed the lock before incrementing */ +		RB_WARN_ON(cpu_buffer, next_page == reader_page); + +		/* +		 * If for some reason, we had an interrupt storm that made +		 * it all the way around the buffer, bail, and warn +		 * about it. +		 */ +		if (unlikely(next_page == cpu_buffer->commit_page)) { +			WARN_ON_ONCE(1); +			goto out_unlock; +		} + +		if (next_page == head_page) { +			if (!(buffer->flags & RB_FL_OVERWRITE)) { +				/* reset write */ +				if (tail <= BUF_PAGE_SIZE) +					local_set(&tail_page->write, tail); +				goto out_unlock; +			} + +			/* tail_page has not moved yet? */ +			if (tail_page == cpu_buffer->tail_page) { +				/* count overflows */ +				rb_update_overflow(cpu_buffer); + +				rb_inc_page(cpu_buffer, &head_page); +				cpu_buffer->head_page = head_page; +				cpu_buffer->head_page->read = 0; +			} +		} + +		/* +		 * If the tail page is still the same as what we think +		 * it is, then it is up to us to update the tail +		 * pointer. +		 */ +		if (tail_page == cpu_buffer->tail_page) { +			local_set(&next_page->write, 0); +			local_set(&next_page->commit, 0); +			cpu_buffer->tail_page = next_page; + +			/* reread the time stamp */ +			*ts = ring_buffer_time_stamp(cpu_buffer->cpu); +			cpu_buffer->tail_page->time_stamp = *ts; +		} + +		/* +		 * The actual tail page has moved forward. +		 */ +		if (tail < BUF_PAGE_SIZE) { +			/* Mark the rest of the page with padding */ +			event = __rb_page_index(tail_page, tail); +			event->type = RINGBUF_TYPE_PADDING; +		} + +		if (tail <= BUF_PAGE_SIZE) +			/* Set the write back to the previous setting */ +			local_set(&tail_page->write, tail); + +		/* +		 * If this was a commit entry that failed, +		 * increment that too +		 */ +		if (tail_page == cpu_buffer->commit_page && +		    tail == rb_commit_index(cpu_buffer)) { +			rb_set_commit_to_write(cpu_buffer); +		} + +		spin_unlock_irqrestore(&cpu_buffer->lock, flags); + +		/* fail and let the caller try again */ +		return ERR_PTR(-EAGAIN); +	} + +	/* We reserved something on the buffer */ + +	BUG_ON(write > BUF_PAGE_SIZE); + +	event = __rb_page_index(tail_page, tail); +	rb_update_event(event, type, length); + +	/* +	 * If this is a commit and the tail is zero, then update +	 * this page's time stamp. +	 */ +	if (!tail && rb_is_commit(cpu_buffer, event)) +		cpu_buffer->commit_page->time_stamp = *ts; + +	return event; + + out_unlock: +	spin_unlock_irqrestore(&cpu_buffer->lock, flags); +	return NULL; +} + +static int +rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, +		  u64 *ts, u64 *delta) +{ +	struct ring_buffer_event *event; +	static int once; +	int ret; + +	if (unlikely(*delta > (1ULL << 59) && !once++)) { +		printk(KERN_WARNING "Delta way too big! %llu" +		       " ts=%llu write stamp = %llu\n", +		       *delta, *ts, cpu_buffer->write_stamp); +		WARN_ON(1); +	} + +	/* +	 * The delta is too big, we to add a +	 * new timestamp. +	 */ +	event = __rb_reserve_next(cpu_buffer, +				  RINGBUF_TYPE_TIME_EXTEND, +				  RB_LEN_TIME_EXTEND, +				  ts); +	if (!event) +		return -EBUSY; + +	if (PTR_ERR(event) == -EAGAIN) +		return -EAGAIN; + +	/* Only a commited time event can update the write stamp */ +	if (rb_is_commit(cpu_buffer, event)) { +		/* +		 * If this is the first on the page, then we need to +		 * update the page itself, and just put in a zero. 
+		 */ +		if (rb_event_index(event)) { +			event->time_delta = *delta & TS_MASK; +			event->array[0] = *delta >> TS_SHIFT; +		} else { +			cpu_buffer->commit_page->time_stamp = *ts; +			event->time_delta = 0; +			event->array[0] = 0; +		} +		cpu_buffer->write_stamp = *ts; +		/* let the caller know this was the commit */ +		ret = 1; +	} else { +		/* Darn, this is just wasted space */ +		event->time_delta = 0; +		event->array[0] = 0; +		ret = 0; +	} + +	*delta = 0; + +	return ret; +} + +static struct ring_buffer_event * +rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, +		      unsigned type, unsigned long length) +{ +	struct ring_buffer_event *event; +	u64 ts, delta; +	int commit = 0; + + again: +	ts = ring_buffer_time_stamp(cpu_buffer->cpu); + +	/* +	 * Only the first commit can update the timestamp. +	 * Yes there is a race here. If an interrupt comes in +	 * just after the conditional and it traces too, then it +	 * will also check the deltas. More than one timestamp may +	 * also be made. But only the entry that did the actual +	 * commit will be something other than zero. +	 */ +	if (cpu_buffer->tail_page == cpu_buffer->commit_page && +	    rb_page_write(cpu_buffer->tail_page) == +	    rb_commit_index(cpu_buffer)) { + +		delta = ts - cpu_buffer->write_stamp; + +		/* make sure this delta is calculated here */ +		barrier(); + +		/* Did the write stamp get updated already? */ +		if (unlikely(ts < cpu_buffer->write_stamp)) +			goto again; + +		if (test_time_stamp(delta)) { + +			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); + +			if (commit == -EBUSY) +				return NULL; + +			if (commit == -EAGAIN) +				goto again; + +			RB_WARN_ON(cpu_buffer, commit < 0); +		} +	} else +		/* Non commits have zero deltas */ +		delta = 0; + +	event = __rb_reserve_next(cpu_buffer, type, length, &ts); +	if (PTR_ERR(event) == -EAGAIN) +		goto again; + +	if (!event) { +		if (unlikely(commit)) +			/* +			 * Ouch! We needed a timestamp and it was commited. But +			 * we didn't get our event reserved. +			 */ +			rb_set_commit_to_write(cpu_buffer); +		return NULL; +	} + +	/* +	 * If the timestamp was commited, make the commit our entry +	 * now so that we will update it when needed. +	 */ +	if (commit) +		rb_set_commit_event(cpu_buffer, event); +	else if (!rb_is_commit(cpu_buffer, event)) +		delta = 0; + +	event->time_delta = delta; + +	return event; +} + +static DEFINE_PER_CPU(int, rb_need_resched); + +/** + * ring_buffer_lock_reserve - reserve a part of the buffer + * @buffer: the ring buffer to reserve from + * @length: the length of the data to reserve (excluding event header) + * @flags: a pointer to save the interrupt flags + * + * Returns a reseverd event on the ring buffer to copy directly to. + * The user of this interface will need to get the body to write into + * and can use the ring_buffer_event_data() interface. + * + * The length is the length of the data needed, not the event length + * which also includes the event header. + * + * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. + * If NULL is returned, then nothing has been allocated or locked. 
+ */ +struct ring_buffer_event * +ring_buffer_lock_reserve(struct ring_buffer *buffer, +			 unsigned long length, +			 unsigned long *flags) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	struct ring_buffer_event *event; +	int cpu, resched; + +	if (atomic_read(&buffer->record_disabled)) +		return NULL; + +	/* If we are tracing schedule, we don't want to recurse */ +	resched = need_resched(); +	preempt_disable_notrace(); + +	cpu = raw_smp_processor_id(); + +	if (!cpu_isset(cpu, buffer->cpumask)) +		goto out; + +	cpu_buffer = buffer->buffers[cpu]; + +	if (atomic_read(&cpu_buffer->record_disabled)) +		goto out; + +	length = rb_calculate_event_length(length); +	if (length > BUF_PAGE_SIZE) +		goto out; + +	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); +	if (!event) +		goto out; + +	/* +	 * Need to store resched state on this cpu. +	 * Only the first needs to. +	 */ + +	if (preempt_count() == 1) +		per_cpu(rb_need_resched, cpu) = resched; + +	return event; + + out: +	if (resched) +		preempt_enable_notrace(); +	else +		preempt_enable_notrace(); +	return NULL; +} + +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, +		      struct ring_buffer_event *event) +{ +	cpu_buffer->entries++; + +	/* Only process further if we own the commit */ +	if (!rb_is_commit(cpu_buffer, event)) +		return; + +	cpu_buffer->write_stamp += event->time_delta; + +	rb_set_commit_to_write(cpu_buffer); +} + +/** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * @flags: the interrupt flags received from ring_buffer_lock_reserve. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, +			      struct ring_buffer_event *event, +			      unsigned long flags) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	int cpu = raw_smp_processor_id(); + +	cpu_buffer = buffer->buffers[cpu]; + +	rb_commit(cpu_buffer, event); + +	/* +	 * Only the last preempt count needs to restore preemption. +	 */ +	if (preempt_count() == 1) { +		if (per_cpu(rb_need_resched, cpu)) +			preempt_enable_no_resched_notrace(); +		else +			preempt_enable_notrace(); +	} else +		preempt_enable_no_resched_notrace(); + +	return 0; +} + +/** + * ring_buffer_write - write data to the buffer without reserving + * @buffer: The ring buffer to write to. + * @length: The length of the data being written (excluding the event header) + * @data: The data to write to the buffer. + * + * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as + * one function. If you already have the data to write to the buffer, it + * may be easier to simply call this function. + * + * Note, like ring_buffer_lock_reserve, the length is the length of the data + * and not the length of the event which would hold the header. 
+ */ +int ring_buffer_write(struct ring_buffer *buffer, +			unsigned long length, +			void *data) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	struct ring_buffer_event *event; +	unsigned long event_length; +	void *body; +	int ret = -EBUSY; +	int cpu, resched; + +	if (atomic_read(&buffer->record_disabled)) +		return -EBUSY; + +	resched = need_resched(); +	preempt_disable_notrace(); + +	cpu = raw_smp_processor_id(); + +	if (!cpu_isset(cpu, buffer->cpumask)) +		goto out; + +	cpu_buffer = buffer->buffers[cpu]; + +	if (atomic_read(&cpu_buffer->record_disabled)) +		goto out; + +	event_length = rb_calculate_event_length(length); +	event = rb_reserve_next_event(cpu_buffer, +				      RINGBUF_TYPE_DATA, event_length); +	if (!event) +		goto out; + +	body = rb_event_data(event); + +	memcpy(body, data, length); + +	rb_commit(cpu_buffer, event); + +	ret = 0; + out: +	if (resched) +		preempt_enable_no_resched_notrace(); +	else +		preempt_enable_notrace(); + +	return ret; +} + +static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ +	struct buffer_page *reader = cpu_buffer->reader_page; +	struct buffer_page *head = cpu_buffer->head_page; +	struct buffer_page *commit = cpu_buffer->commit_page; + +	return reader->read == rb_page_commit(reader) && +		(commit == reader || +		 (commit == head && +		  head->read == rb_page_commit(commit))); +} + +/** + * ring_buffer_record_disable - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. + */ +void ring_buffer_record_disable(struct ring_buffer *buffer) +{ +	atomic_inc(&buffer->record_disabled); +} + +/** + * ring_buffer_record_enable - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * + * Note, multiple disables will need the same number of enables + * to truely enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable(struct ring_buffer *buffer) +{ +	atomic_dec(&buffer->record_disabled); +} + +/** + * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer + * @buffer: The ring buffer to stop writes to. + * @cpu: The CPU buffer to stop + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. + */ +void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; + +	if (!cpu_isset(cpu, buffer->cpumask)) +		return; + +	cpu_buffer = buffer->buffers[cpu]; +	atomic_inc(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_record_enable_cpu - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * @cpu: The CPU to enable. + * + * Note, multiple disables will need the same number of enables + * to truely enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; + +	if (!cpu_isset(cpu, buffer->cpumask)) +		return; + +	cpu_buffer = buffer->buffers[cpu]; +	atomic_dec(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_entries_cpu - get the number of entries in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the entries from. 
+ */ +unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; + +	if (!cpu_isset(cpu, buffer->cpumask)) +		return 0; + +	cpu_buffer = buffer->buffers[cpu]; +	return cpu_buffer->entries; +} + +/** + * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; + +	if (!cpu_isset(cpu, buffer->cpumask)) +		return 0; + +	cpu_buffer = buffer->buffers[cpu]; +	return cpu_buffer->overrun; +} + +/** + * ring_buffer_entries - get the number of entries in a buffer + * @buffer: The ring buffer + * + * Returns the total number of entries in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_entries(struct ring_buffer *buffer) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	unsigned long entries = 0; +	int cpu; + +	/* if you care about this being correct, lock the buffer */ +	for_each_buffer_cpu(buffer, cpu) { +		cpu_buffer = buffer->buffers[cpu]; +		entries += cpu_buffer->entries; +	} + +	return entries; +} + +/** + * ring_buffer_overrun_cpu - get the number of overruns in buffer + * @buffer: The ring buffer + * + * Returns the total number of overruns in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_overruns(struct ring_buffer *buffer) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	unsigned long overruns = 0; +	int cpu; + +	/* if you care about this being correct, lock the buffer */ +	for_each_buffer_cpu(buffer, cpu) { +		cpu_buffer = buffer->buffers[cpu]; +		overruns += cpu_buffer->overrun; +	} + +	return overruns; +} + +/** + * ring_buffer_iter_reset - reset an iterator + * @iter: The iterator to reset + * + * Resets the iterator, so that it will start from the beginning + * again. 
+ */ +void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +{ +	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + +	/* Iterator usage is expected to have record disabled */ +	if (list_empty(&cpu_buffer->reader_page->list)) { +		iter->head_page = cpu_buffer->head_page; +		iter->head = cpu_buffer->head_page->read; +	} else { +		iter->head_page = cpu_buffer->reader_page; +		iter->head = cpu_buffer->reader_page->read; +	} +	if (iter->head) +		iter->read_stamp = cpu_buffer->read_stamp; +	else +		iter->read_stamp = iter->head_page->time_stamp; +} + +/** + * ring_buffer_iter_empty - check if an iterator has no more to read + * @iter: The iterator to check + */ +int ring_buffer_iter_empty(struct ring_buffer_iter *iter) +{ +	struct ring_buffer_per_cpu *cpu_buffer; + +	cpu_buffer = iter->cpu_buffer; + +	return iter->head_page == cpu_buffer->commit_page && +		iter->head == rb_commit_index(cpu_buffer); +} + +static void +rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, +		     struct ring_buffer_event *event) +{ +	u64 delta; + +	switch (event->type) { +	case RINGBUF_TYPE_PADDING: +		return; + +	case RINGBUF_TYPE_TIME_EXTEND: +		delta = event->array[0]; +		delta <<= TS_SHIFT; +		delta += event->time_delta; +		cpu_buffer->read_stamp += delta; +		return; + +	case RINGBUF_TYPE_TIME_STAMP: +		/* FIXME: not implemented */ +		return; + +	case RINGBUF_TYPE_DATA: +		cpu_buffer->read_stamp += event->time_delta; +		return; + +	default: +		BUG(); +	} +	return; +} + +static void +rb_update_iter_read_stamp(struct ring_buffer_iter *iter, +			  struct ring_buffer_event *event) +{ +	u64 delta; + +	switch (event->type) { +	case RINGBUF_TYPE_PADDING: +		return; + +	case RINGBUF_TYPE_TIME_EXTEND: +		delta = event->array[0]; +		delta <<= TS_SHIFT; +		delta += event->time_delta; +		iter->read_stamp += delta; +		return; + +	case RINGBUF_TYPE_TIME_STAMP: +		/* FIXME: not implemented */ +		return; + +	case RINGBUF_TYPE_DATA: +		iter->read_stamp += event->time_delta; +		return; + +	default: +		BUG(); +	} +	return; +} + +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ +	struct buffer_page *reader = NULL; +	unsigned long flags; + +	spin_lock_irqsave(&cpu_buffer->lock, flags); + + again: +	reader = cpu_buffer->reader_page; + +	/* If there's more to read, return this page */ +	if (cpu_buffer->reader_page->read < rb_page_size(reader)) +		goto out; + +	/* Never should we have an index greater than the size */ +	RB_WARN_ON(cpu_buffer, +		   cpu_buffer->reader_page->read > rb_page_size(reader)); + +	/* check if we caught up to the tail */ +	reader = NULL; +	if (cpu_buffer->commit_page == cpu_buffer->reader_page) +		goto out; + +	/* +	 * Splice the empty reader page into the list around the head. +	 * Reset the reader page to size zero. +	 */ + +	reader = cpu_buffer->head_page; +	cpu_buffer->reader_page->list.next = reader->list.next; +	cpu_buffer->reader_page->list.prev = reader->list.prev; + +	local_set(&cpu_buffer->reader_page->write, 0); +	local_set(&cpu_buffer->reader_page->commit, 0); + +	/* Make the reader page now replace the head */ +	reader->list.prev->next = &cpu_buffer->reader_page->list; +	reader->list.next->prev = &cpu_buffer->reader_page->list; + +	/* +	 * If the tail is on the reader, then we must set the head +	 * to the inserted page, otherwise we set it one before. 
+	 */ +	cpu_buffer->head_page = cpu_buffer->reader_page; + +	if (cpu_buffer->commit_page != reader) +		rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + +	/* Finally update the reader page to the new head */ +	cpu_buffer->reader_page = reader; +	rb_reset_reader_page(cpu_buffer); + +	goto again; + + out: +	spin_unlock_irqrestore(&cpu_buffer->lock, flags); + +	return reader; +} + +static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) +{ +	struct ring_buffer_event *event; +	struct buffer_page *reader; +	unsigned length; + +	reader = rb_get_reader_page(cpu_buffer); + +	/* This function should not be called when buffer is empty */ +	BUG_ON(!reader); + +	event = rb_reader_event(cpu_buffer); + +	if (event->type == RINGBUF_TYPE_DATA) +		cpu_buffer->entries--; + +	rb_update_read_stamp(cpu_buffer, event); + +	length = rb_event_length(event); +	cpu_buffer->reader_page->read += length; +} + +static void rb_advance_iter(struct ring_buffer_iter *iter) +{ +	struct ring_buffer *buffer; +	struct ring_buffer_per_cpu *cpu_buffer; +	struct ring_buffer_event *event; +	unsigned length; + +	cpu_buffer = iter->cpu_buffer; +	buffer = cpu_buffer->buffer; + +	/* +	 * Check if we are at the end of the buffer. +	 */ +	if (iter->head >= rb_page_size(iter->head_page)) { +		BUG_ON(iter->head_page == cpu_buffer->commit_page); +		rb_inc_iter(iter); +		return; +	} + +	event = rb_iter_head_event(iter); + +	length = rb_event_length(event); + +	/* +	 * This should not be called to advance the header if we are +	 * at the tail of the buffer. +	 */ +	BUG_ON((iter->head_page == cpu_buffer->commit_page) && +	       (iter->head + length > rb_commit_index(cpu_buffer))); + +	rb_update_iter_read_stamp(iter, event); + +	iter->head += length; + +	/* check for end of page padding */ +	if ((iter->head >= rb_page_size(iter->head_page)) && +	    (iter->head_page != cpu_buffer->commit_page)) +		rb_advance_iter(iter); +} + +/** + * ring_buffer_peek - peek at the next event to be read + * @buffer: The ring buffer to read + * @cpu: The cpu to peak at + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not consume the data. + */ +struct ring_buffer_event * +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	struct ring_buffer_event *event; +	struct buffer_page *reader; + +	if (!cpu_isset(cpu, buffer->cpumask)) +		return NULL; + +	cpu_buffer = buffer->buffers[cpu]; + + again: +	reader = rb_get_reader_page(cpu_buffer); +	if (!reader) +		return NULL; + +	event = rb_reader_event(cpu_buffer); + +	switch (event->type) { +	case RINGBUF_TYPE_PADDING: +		RB_WARN_ON(cpu_buffer, 1); +		rb_advance_reader(cpu_buffer); +		return NULL; + +	case RINGBUF_TYPE_TIME_EXTEND: +		/* Internal data, OK to advance */ +		rb_advance_reader(cpu_buffer); +		goto again; + +	case RINGBUF_TYPE_TIME_STAMP: +		/* FIXME: not implemented */ +		rb_advance_reader(cpu_buffer); +		goto again; + +	case RINGBUF_TYPE_DATA: +		if (ts) { +			*ts = cpu_buffer->read_stamp + event->time_delta; +			ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); +		} +		return event; + +	default: +		BUG(); +	} + +	return NULL; +} + +/** + * ring_buffer_iter_peek - peek at the next event to be read + * @iter: The ring buffer iterator + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not increment the iterator. 
+ */ +struct ring_buffer_event * +ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ +	struct ring_buffer *buffer; +	struct ring_buffer_per_cpu *cpu_buffer; +	struct ring_buffer_event *event; + +	if (ring_buffer_iter_empty(iter)) +		return NULL; + +	cpu_buffer = iter->cpu_buffer; +	buffer = cpu_buffer->buffer; + + again: +	if (rb_per_cpu_empty(cpu_buffer)) +		return NULL; + +	event = rb_iter_head_event(iter); + +	switch (event->type) { +	case RINGBUF_TYPE_PADDING: +		rb_inc_iter(iter); +		goto again; + +	case RINGBUF_TYPE_TIME_EXTEND: +		/* Internal data, OK to advance */ +		rb_advance_iter(iter); +		goto again; + +	case RINGBUF_TYPE_TIME_STAMP: +		/* FIXME: not implemented */ +		rb_advance_iter(iter); +		goto again; + +	case RINGBUF_TYPE_DATA: +		if (ts) { +			*ts = iter->read_stamp + event->time_delta; +			ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); +		} +		return event; + +	default: +		BUG(); +	} + +	return NULL; +} + +/** + * ring_buffer_consume - return an event and consume it + * @buffer: The ring buffer to get the next event from + * + * Returns the next event in the ring buffer, and that event is consumed. + * Meaning, that sequential reads will keep returning a different event, + * and eventually empty the ring buffer if the producer is slower. + */ +struct ring_buffer_event * +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	struct ring_buffer_event *event; + +	if (!cpu_isset(cpu, buffer->cpumask)) +		return NULL; + +	event = ring_buffer_peek(buffer, cpu, ts); +	if (!event) +		return NULL; + +	cpu_buffer = buffer->buffers[cpu]; +	rb_advance_reader(cpu_buffer); + +	return event; +} + +/** + * ring_buffer_read_start - start a non consuming read of the buffer + * @buffer: The ring buffer to read from + * @cpu: The cpu buffer to iterate over + * + * This starts up an iteration through the buffer. It also disables + * the recording to the buffer until the reading is finished. + * This prevents the reading from being corrupted. This is not + * a consuming read, so a producer is not expected. + * + * Must be paired with ring_buffer_finish. + */ +struct ring_buffer_iter * +ring_buffer_read_start(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	struct ring_buffer_iter *iter; +	unsigned long flags; + +	if (!cpu_isset(cpu, buffer->cpumask)) +		return NULL; + +	iter = kmalloc(sizeof(*iter), GFP_KERNEL); +	if (!iter) +		return NULL; + +	cpu_buffer = buffer->buffers[cpu]; + +	iter->cpu_buffer = cpu_buffer; + +	atomic_inc(&cpu_buffer->record_disabled); +	synchronize_sched(); + +	spin_lock_irqsave(&cpu_buffer->lock, flags); +	ring_buffer_iter_reset(iter); +	spin_unlock_irqrestore(&cpu_buffer->lock, flags); + +	return iter; +} + +/** + * ring_buffer_finish - finish reading the iterator of the buffer + * @iter: The iterator retrieved by ring_buffer_start + * + * This re-enables the recording to the buffer, and frees the + * iterator. + */ +void +ring_buffer_read_finish(struct ring_buffer_iter *iter) +{ +	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + +	atomic_dec(&cpu_buffer->record_disabled); +	kfree(iter); +} + +/** + * ring_buffer_read - read the next item in the ring buffer by the iterator + * @iter: The ring buffer iterator + * @ts: The time stamp of the event read. + * + * This reads the next event in the ring buffer and increments the iterator. 
+ */ +struct ring_buffer_event * +ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) +{ +	struct ring_buffer_event *event; + +	event = ring_buffer_iter_peek(iter, ts); +	if (!event) +		return NULL; + +	rb_advance_iter(iter); + +	return event; +} + +/** + * ring_buffer_size - return the size of the ring buffer (in bytes) + * @buffer: The ring buffer. + */ +unsigned long ring_buffer_size(struct ring_buffer *buffer) +{ +	return BUF_PAGE_SIZE * buffer->pages; +} + +static void +rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) +{ +	cpu_buffer->head_page +		= list_entry(cpu_buffer->pages.next, struct buffer_page, list); +	local_set(&cpu_buffer->head_page->write, 0); +	local_set(&cpu_buffer->head_page->commit, 0); + +	cpu_buffer->head_page->read = 0; + +	cpu_buffer->tail_page = cpu_buffer->head_page; +	cpu_buffer->commit_page = cpu_buffer->head_page; + +	INIT_LIST_HEAD(&cpu_buffer->reader_page->list); +	local_set(&cpu_buffer->reader_page->write, 0); +	local_set(&cpu_buffer->reader_page->commit, 0); +	cpu_buffer->reader_page->read = 0; + +	cpu_buffer->overrun = 0; +	cpu_buffer->entries = 0; +} + +/** + * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * @buffer: The ring buffer to reset a per cpu buffer of + * @cpu: The CPU buffer to be reset + */ +void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; +	unsigned long flags; + +	if (!cpu_isset(cpu, buffer->cpumask)) +		return; + +	spin_lock_irqsave(&cpu_buffer->lock, flags); + +	rb_reset_cpu(cpu_buffer); + +	spin_unlock_irqrestore(&cpu_buffer->lock, flags); +} + +/** + * ring_buffer_reset - reset a ring buffer + * @buffer: The ring buffer to reset all cpu buffers + */ +void ring_buffer_reset(struct ring_buffer *buffer) +{ +	int cpu; + +	for_each_buffer_cpu(buffer, cpu) +		ring_buffer_reset_cpu(buffer, cpu); +} + +/** + * rind_buffer_empty - is the ring buffer empty? + * @buffer: The ring buffer to test + */ +int ring_buffer_empty(struct ring_buffer *buffer) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	int cpu; + +	/* yes this is racy, but if you don't like the race, lock the buffer */ +	for_each_buffer_cpu(buffer, cpu) { +		cpu_buffer = buffer->buffers[cpu]; +		if (!rb_per_cpu_empty(cpu_buffer)) +			return 0; +	} +	return 1; +} + +/** + * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? + * @buffer: The ring buffer + * @cpu: The CPU buffer to test + */ +int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; + +	if (!cpu_isset(cpu, buffer->cpumask)) +		return 1; + +	cpu_buffer = buffer->buffers[cpu]; +	return rb_per_cpu_empty(cpu_buffer); +} + +/** + * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers + * @buffer_a: One buffer to swap with + * @buffer_b: The other buffer to swap with + * + * This function is useful for tracers that want to take a "snapshot" + * of a CPU buffer and has another back up buffer lying around. + * it is expected that the tracer handles the cpu buffer not being + * used at the moment. 
+ */ +int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, +			 struct ring_buffer *buffer_b, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer_a; +	struct ring_buffer_per_cpu *cpu_buffer_b; + +	if (!cpu_isset(cpu, buffer_a->cpumask) || +	    !cpu_isset(cpu, buffer_b->cpumask)) +		return -EINVAL; + +	/* At least make sure the two buffers are somewhat the same */ +	if (buffer_a->size != buffer_b->size || +	    buffer_a->pages != buffer_b->pages) +		return -EINVAL; + +	cpu_buffer_a = buffer_a->buffers[cpu]; +	cpu_buffer_b = buffer_b->buffers[cpu]; + +	/* +	 * We can't do a synchronize_sched here because this +	 * function can be called in atomic context. +	 * Normally this will be called from the same CPU as cpu. +	 * If not it's up to the caller to protect this. +	 */ +	atomic_inc(&cpu_buffer_a->record_disabled); +	atomic_inc(&cpu_buffer_b->record_disabled); + +	buffer_a->buffers[cpu] = cpu_buffer_b; +	buffer_b->buffers[cpu] = cpu_buffer_a; + +	cpu_buffer_b->buffer = buffer_a; +	cpu_buffer_a->buffer = buffer_b; + +	atomic_dec(&cpu_buffer_a->record_disabled); +	atomic_dec(&cpu_buffer_b->record_disabled); + +	return 0; +} + diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f3fb3db61c..d345d649d07 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -14,6 +14,7 @@  #include <linux/utsrelease.h>  #include <linux/kallsyms.h>  #include <linux/seq_file.h> +#include <linux/notifier.h>  #include <linux/debugfs.h>  #include <linux/pagemap.h>  #include <linux/hardirq.h> @@ -22,6 +23,7 @@  #include <linux/ftrace.h>  #include <linux/module.h>  #include <linux/percpu.h> +#include <linux/kdebug.h>  #include <linux/ctype.h>  #include <linux/init.h>  #include <linux/poll.h> @@ -31,25 +33,36 @@  #include <linux/writeback.h>  #include <linux/stacktrace.h> +#include <linux/ring_buffer.h>  #include "trace.h" +#define TRACE_BUFFER_FLAGS	(RB_FL_OVERWRITE) +  unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;  unsigned long __read_mostly	tracing_thresh; -static unsigned long __read_mostly	tracing_nr_buffers; +static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); + +static inline void ftrace_disable_cpu(void) +{ +	preempt_disable(); +	local_inc(&__get_cpu_var(ftrace_cpu_disabled)); +} + +static inline void ftrace_enable_cpu(void) +{ +	local_dec(&__get_cpu_var(ftrace_cpu_disabled)); +	preempt_enable(); +} +  static cpumask_t __read_mostly		tracing_buffer_mask;  #define for_each_tracing_cpu(cpu)	\  	for_each_cpu_mask(cpu, tracing_buffer_mask) -static int trace_alloc_page(void); -static int trace_free_page(void); -  static int tracing_disabled = 1; -static unsigned long tracing_pages_allocated; -  long  ns2usecs(cycle_t nsec)  { @@ -60,7 +73,9 @@ ns2usecs(cycle_t nsec)  cycle_t ftrace_now(int cpu)  { -	return cpu_clock(cpu); +	u64 ts = ring_buffer_time_stamp(cpu); +	ring_buffer_normalize_time_stamp(cpu, &ts); +	return ts;  }  /* @@ -100,11 +115,18 @@ static int			tracer_enabled = 1;  int				ftrace_function_enabled;  /* - * trace_nr_entries is the number of entries that is allocated - * for a buffer. Note, the number of entries is always rounded - * to ENTRIES_PER_PAGE. + * trace_buf_size is the size in bytes that is allocated + * for a buffer. Note, the number of bytes is always rounded + * to page size. + * + * This number is purposely set to a low number of 16384. + * If the dump on oops happens, it will be much appreciated + * to not have to wait for all that output. Anyway this can be + * boot time and run time configurable.   
*/ -static unsigned long		trace_nr_entries = 65536UL; +#define TRACE_BUF_SIZE_DEFAULT	1441792UL /* 16384 * 88 (sizeof(entry)) */ + +static unsigned long		trace_buf_size = TRACE_BUF_SIZE_DEFAULT;  /* trace_types holds a link list of available tracers. */  static struct tracer		*trace_types __read_mostly; @@ -133,24 +155,6 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);  /* trace_flags holds iter_ctrl options */  unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; -static notrace void no_trace_init(struct trace_array *tr) -{ -	int cpu; - -	ftrace_function_enabled = 0; -	if(tr->ctrl) -		for_each_online_cpu(cpu) -			tracing_reset(tr->data[cpu]); -	tracer_enabled = 0; -} - -/* dummy trace to disable tracing */ -static struct tracer no_tracer __read_mostly = { -	.name		= "none", -	.init		= no_trace_init -}; - -  /**   * trace_wake_up - wake up tasks waiting for trace input   * @@ -167,23 +171,21 @@ void trace_wake_up(void)  		wake_up(&trace_wait);  } -#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) - -static int __init set_nr_entries(char *str) +static int __init set_buf_size(char *str)  { -	unsigned long nr_entries; +	unsigned long buf_size;  	int ret;  	if (!str)  		return 0; -	ret = strict_strtoul(str, 0, &nr_entries); +	ret = strict_strtoul(str, 0, &buf_size);  	/* nr_entries can not be zero */ -	if (ret < 0 || nr_entries == 0) +	if (ret < 0 || buf_size == 0)  		return 0; -	trace_nr_entries = nr_entries; +	trace_buf_size = buf_size;  	return 1;  } -__setup("trace_entries=", set_nr_entries); +__setup("trace_buf_size=", set_buf_size);  unsigned long nsecs_to_usecs(unsigned long nsecs)  { @@ -191,21 +193,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)  }  /* - * trace_flag_type is an enumeration that holds different - * states when a trace occurs. These are: - *  IRQS_OFF	- interrupts were disabled - *  NEED_RESCED - reschedule is requested - *  HARDIRQ	- inside an interrupt handler - *  SOFTIRQ	- inside a softirq handler - */ -enum trace_flag_type { -	TRACE_FLAG_IRQS_OFF		= 0x01, -	TRACE_FLAG_NEED_RESCHED		= 0x02, -	TRACE_FLAG_HARDIRQ		= 0x04, -	TRACE_FLAG_SOFTIRQ		= 0x08, -}; - -/*   * TRACE_ITER_SYM_MASK masks the options in trace_flags that   * control the output of kernel symbols.   */ @@ -224,6 +211,7 @@ static const char *trace_options[] = {  	"block",  	"stacktrace",  	"sched-tree", +	"ftrace_printk",  	NULL  }; @@ -266,54 +254,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  	tracing_record_cmdline(current);  } -#define CHECK_COND(cond)			\ -	if (unlikely(cond)) {			\ -		tracing_disabled = 1;		\ -		WARN_ON(1);			\ -		return -1;			\ -	} - -/** - * check_pages - integrity check of trace buffers - * - * As a safty measure we check to make sure the data pages have not - * been corrupted. - */ -int check_pages(struct trace_array_cpu *data) -{ -	struct page *page, *tmp; - -	CHECK_COND(data->trace_pages.next->prev != &data->trace_pages); -	CHECK_COND(data->trace_pages.prev->next != &data->trace_pages); - -	list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { -		CHECK_COND(page->lru.next->prev != &page->lru); -		CHECK_COND(page->lru.prev->next != &page->lru); -	} - -	return 0; -} - -/** - * head_page - page address of the first page in per_cpu buffer. - * - * head_page returns the page address of the first page in - * a per_cpu buffer. This also preforms various consistency - * checks to make sure the buffer has not been corrupted. 
- */ -void *head_page(struct trace_array_cpu *data) -{ -	struct page *page; - -	if (list_empty(&data->trace_pages)) -		return NULL; - -	page = list_entry(data->trace_pages.next, struct page, lru); -	BUG_ON(&page->lru == &data->trace_pages); - -	return page_address(page); -} -  /**   * trace_seq_printf - sequence printing of trace information   * @s: trace sequence descriptor @@ -395,28 +335,23 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)  	return len;  } -#define HEX_CHARS 17 -static const char hex2asc[] = "0123456789abcdef"; +#define MAX_MEMHEX_BYTES	8 +#define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)  static int  trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)  {  	unsigned char hex[HEX_CHARS];  	unsigned char *data = mem; -	unsigned char byte;  	int i, j; -	BUG_ON(len >= HEX_CHARS); -  #ifdef __BIG_ENDIAN  	for (i = 0, j = 0; i < len; i++) {  #else  	for (i = len-1, j = 0; i >= 0; i--) {  #endif -		byte = data[i]; - -		hex[j++] = hex2asc[byte & 0x0f]; -		hex[j++] = hex2asc[byte >> 4]; +		hex[j++] = hex_asc_hi(data[i]); +		hex[j++] = hex_asc_lo(data[i]);  	}  	hex[j++] = ' '; @@ -460,34 +395,6 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s)  	trace_seq_reset(s);  } -/* - * flip the trace buffers between two trace descriptors. - * This usually is the buffers between the global_trace and - * the max_tr to record a snapshot of a current trace. - * - * The ftrace_max_lock must be held. - */ -static void -flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) -{ -	struct list_head flip_pages; - -	INIT_LIST_HEAD(&flip_pages); - -	memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx, -		sizeof(struct trace_array_cpu) - -		offsetof(struct trace_array_cpu, trace_head_idx)); - -	check_pages(tr1); -	check_pages(tr2); -	list_splice_init(&tr1->trace_pages, &flip_pages); -	list_splice_init(&tr2->trace_pages, &tr1->trace_pages); -	list_splice_init(&flip_pages, &tr2->trace_pages); -	BUG_ON(!list_empty(&flip_pages)); -	check_pages(tr1); -	check_pages(tr2); -} -  /**   * update_max_tr - snapshot all trace buffers from global_trace to max_tr   * @tr: tracer @@ -500,17 +407,17 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)  void  update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  { -	struct trace_array_cpu *data; -	int i; +	struct ring_buffer *buf = tr->buffer;  	WARN_ON_ONCE(!irqs_disabled());  	__raw_spin_lock(&ftrace_max_lock); -	/* clear out all the previous traces */ -	for_each_tracing_cpu(i) { -		data = tr->data[i]; -		flip_trace(max_tr.data[i], data); -		tracing_reset(data); -	} + +	tr->buffer = max_tr.buffer; +	max_tr.buffer = buf; + +	ftrace_disable_cpu(); +	ring_buffer_reset(tr->buffer); +	ftrace_enable_cpu();  	__update_max_tr(tr, tsk, cpu);  	__raw_spin_unlock(&ftrace_max_lock); @@ -527,16 +434,19 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  void  update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  { -	struct trace_array_cpu *data = tr->data[cpu]; -	int i; +	int ret;  	WARN_ON_ONCE(!irqs_disabled());  	__raw_spin_lock(&ftrace_max_lock); -	for_each_tracing_cpu(i) -		tracing_reset(max_tr.data[i]); -	flip_trace(max_tr.data[cpu], data); -	tracing_reset(data); +	ftrace_disable_cpu(); + +	ring_buffer_reset(max_tr.buffer); +	ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + +	ftrace_enable_cpu(); + +	WARN_ON_ONCE(ret);  	__update_max_tr(tr, tsk, cpu);  	__raw_spin_unlock(&ftrace_max_lock); @@ -573,7 +483,6 @@ int 
register_tracer(struct tracer *type)  #ifdef CONFIG_FTRACE_STARTUP_TEST  	if (type->selftest) {  		struct tracer *saved_tracer = current_trace; -		struct trace_array_cpu *data;  		struct trace_array *tr = &global_trace;  		int saved_ctrl = tr->ctrl;  		int i; @@ -585,10 +494,7 @@ int register_tracer(struct tracer *type)  		 * If we fail, we do not register this tracer.  		 */  		for_each_tracing_cpu(i) { -			data = tr->data[i]; -			if (!head_page(data)) -				continue; -			tracing_reset(data); +			tracing_reset(tr, i);  		}  		current_trace = type;  		tr->ctrl = 0; @@ -604,10 +510,7 @@ int register_tracer(struct tracer *type)  		}  		/* Only reset on passing, to avoid touching corrupted buffers */  		for_each_tracing_cpu(i) { -			data = tr->data[i]; -			if (!head_page(data)) -				continue; -			tracing_reset(data); +			tracing_reset(tr, i);  		}  		printk(KERN_CONT "PASSED\n");  	} @@ -653,13 +556,11 @@ void unregister_tracer(struct tracer *type)  	mutex_unlock(&trace_types_lock);  } -void tracing_reset(struct trace_array_cpu *data) +void tracing_reset(struct trace_array *tr, int cpu)  { -	data->trace_idx = 0; -	data->overrun = 0; -	data->trace_head = data->trace_tail = head_page(data); -	data->trace_head_idx = 0; -	data->trace_tail_idx = 0; +	ftrace_disable_cpu(); +	ring_buffer_reset_cpu(tr->buffer, cpu); +	ftrace_enable_cpu();  }  #define SAVED_CMDLINES 128 @@ -745,82 +646,16 @@ void tracing_record_cmdline(struct task_struct *tsk)  	trace_save_cmdline(tsk);  } -static inline struct list_head * -trace_next_list(struct trace_array_cpu *data, struct list_head *next) -{ -	/* -	 * Roundrobin - but skip the head (which is not a real page): -	 */ -	next = next->next; -	if (unlikely(next == &data->trace_pages)) -		next = next->next; -	BUG_ON(next == &data->trace_pages); - -	return next; -} - -static inline void * -trace_next_page(struct trace_array_cpu *data, void *addr) -{ -	struct list_head *next; -	struct page *page; - -	page = virt_to_page(addr); - -	next = trace_next_list(data, &page->lru); -	page = list_entry(next, struct page, lru); - -	return page_address(page); -} - -static inline struct trace_entry * -tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) -{ -	unsigned long idx, idx_next; -	struct trace_entry *entry; - -	data->trace_idx++; -	idx = data->trace_head_idx; -	idx_next = idx + 1; - -	BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); - -	entry = data->trace_head + idx * TRACE_ENTRY_SIZE; - -	if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { -		data->trace_head = trace_next_page(data, data->trace_head); -		idx_next = 0; -	} - -	if (data->trace_head == data->trace_tail && -	    idx_next == data->trace_tail_idx) { -		/* overrun */ -		data->overrun++; -		data->trace_tail_idx++; -		if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { -			data->trace_tail = -				trace_next_page(data, data->trace_tail); -			data->trace_tail_idx = 0; -		} -	} - -	data->trace_head_idx = idx_next; - -	return entry; -} - -static inline void -tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) +void +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, +			     int pc)  {  	struct task_struct *tsk = current; -	unsigned long pc; - -	pc = preempt_count(); -	entry->preempt_count	= pc & 0xff; -	entry->pid		= (tsk) ? tsk->pid : 0; -	entry->t		= ftrace_now(raw_smp_processor_id()); -	entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | +	entry->preempt_count		= pc & 0xff; +	entry->pid			= (tsk) ? 
tsk->pid : 0; +	entry->flags = +		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |  		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |  		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |  		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); @@ -828,145 +663,139 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)  void  trace_function(struct trace_array *tr, struct trace_array_cpu *data, -	       unsigned long ip, unsigned long parent_ip, unsigned long flags) +	       unsigned long ip, unsigned long parent_ip, unsigned long flags, +	       int pc)  { -	struct trace_entry *entry; +	struct ring_buffer_event *event; +	struct ftrace_entry *entry;  	unsigned long irq_flags; -	raw_local_irq_save(irq_flags); -	__raw_spin_lock(&data->lock); -	entry			= tracing_get_trace_entry(tr, data); -	tracing_generic_entry_update(entry, flags); -	entry->type		= TRACE_FN; -	entry->fn.ip		= ip; -	entry->fn.parent_ip	= parent_ip; -	__raw_spin_unlock(&data->lock); -	raw_local_irq_restore(irq_flags); +	/* If we are reading the ring buffer, don't trace */ +	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) +		return; + +	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +					 &irq_flags); +	if (!event) +		return; +	entry	= ring_buffer_event_data(event); +	tracing_generic_entry_update(&entry->ent, flags, pc); +	entry->ent.type			= TRACE_FN; +	entry->ip			= ip; +	entry->parent_ip		= parent_ip; +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);  }  void  ftrace(struct trace_array *tr, struct trace_array_cpu *data, -       unsigned long ip, unsigned long parent_ip, unsigned long flags) +       unsigned long ip, unsigned long parent_ip, unsigned long flags, +       int pc)  {  	if (likely(!atomic_read(&data->disabled))) -		trace_function(tr, data, ip, parent_ip, flags); +		trace_function(tr, data, ip, parent_ip, flags, pc);  } -#ifdef CONFIG_MMIOTRACE -void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, -						struct mmiotrace_rw *rw) +static void ftrace_trace_stack(struct trace_array *tr, +			       struct trace_array_cpu *data, +			       unsigned long flags, +			       int skip, int pc)  { -	struct trace_entry *entry; +	struct ring_buffer_event *event; +	struct stack_entry *entry; +	struct stack_trace trace;  	unsigned long irq_flags; -	raw_local_irq_save(irq_flags); -	__raw_spin_lock(&data->lock); - -	entry			= tracing_get_trace_entry(tr, data); -	tracing_generic_entry_update(entry, 0); -	entry->type		= TRACE_MMIO_RW; -	entry->mmiorw		= *rw; - -	__raw_spin_unlock(&data->lock); -	raw_local_irq_restore(irq_flags); - -	trace_wake_up(); -} - -void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, -						struct mmiotrace_map *map) -{ -	struct trace_entry *entry; -	unsigned long irq_flags; +	if (!(trace_flags & TRACE_ITER_STACKTRACE)) +		return; -	raw_local_irq_save(irq_flags); -	__raw_spin_lock(&data->lock); +	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +					 &irq_flags); +	if (!event) +		return; +	entry	= ring_buffer_event_data(event); +	tracing_generic_entry_update(&entry->ent, flags, pc); +	entry->ent.type		= TRACE_STACK; -	entry			= tracing_get_trace_entry(tr, data); -	tracing_generic_entry_update(entry, 0); -	entry->type		= TRACE_MMIO_MAP; -	entry->mmiomap		= *map; +	memset(&entry->caller, 0, sizeof(entry->caller)); -	__raw_spin_unlock(&data->lock); -	raw_local_irq_restore(irq_flags); +	trace.nr_entries	= 0; +	trace.max_entries	= FTRACE_STACK_ENTRIES; +	trace.skip		= skip; +	
trace.entries		= entry->caller; -	trace_wake_up(); +	save_stack_trace(&trace); +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);  } -#endif  void __trace_stack(struct trace_array *tr,  		   struct trace_array_cpu *data,  		   unsigned long flags,  		   int skip)  { -	struct trace_entry *entry; -	struct stack_trace trace; - -	if (!(trace_flags & TRACE_ITER_STACKTRACE)) -		return; - -	entry			= tracing_get_trace_entry(tr, data); -	tracing_generic_entry_update(entry, flags); -	entry->type		= TRACE_STACK; - -	memset(&entry->stack, 0, sizeof(entry->stack)); - -	trace.nr_entries	= 0; -	trace.max_entries	= FTRACE_STACK_ENTRIES; -	trace.skip		= skip; -	trace.entries		= entry->stack.caller; - -	save_stack_trace(&trace); +	ftrace_trace_stack(tr, data, flags, skip, preempt_count());  } -void -__trace_special(void *__tr, void *__data, -		unsigned long arg1, unsigned long arg2, unsigned long arg3) +static void +ftrace_trace_special(void *__tr, void *__data, +		     unsigned long arg1, unsigned long arg2, unsigned long arg3, +		     int pc)  { +	struct ring_buffer_event *event;  	struct trace_array_cpu *data = __data;  	struct trace_array *tr = __tr; -	struct trace_entry *entry; +	struct special_entry *entry;  	unsigned long irq_flags; -	raw_local_irq_save(irq_flags); -	__raw_spin_lock(&data->lock); -	entry			= tracing_get_trace_entry(tr, data); -	tracing_generic_entry_update(entry, 0); -	entry->type		= TRACE_SPECIAL; -	entry->special.arg1	= arg1; -	entry->special.arg2	= arg2; -	entry->special.arg3	= arg3; -	__trace_stack(tr, data, irq_flags, 4); -	__raw_spin_unlock(&data->lock); -	raw_local_irq_restore(irq_flags); +	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +					 &irq_flags); +	if (!event) +		return; +	entry	= ring_buffer_event_data(event); +	tracing_generic_entry_update(&entry->ent, 0, pc); +	entry->ent.type			= TRACE_SPECIAL; +	entry->arg1			= arg1; +	entry->arg2			= arg2; +	entry->arg3			= arg3; +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +	ftrace_trace_stack(tr, data, irq_flags, 4, pc);  	trace_wake_up();  }  void +__trace_special(void *__tr, void *__data, +		unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ +	ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count()); +} + +void  tracing_sched_switch_trace(struct trace_array *tr,  			   struct trace_array_cpu *data,  			   struct task_struct *prev,  			   struct task_struct *next, -			   unsigned long flags) +			   unsigned long flags, int pc)  { -	struct trace_entry *entry; +	struct ring_buffer_event *event; +	struct ctx_switch_entry *entry;  	unsigned long irq_flags; -	raw_local_irq_save(irq_flags); -	__raw_spin_lock(&data->lock); -	entry			= tracing_get_trace_entry(tr, data); -	tracing_generic_entry_update(entry, flags); -	entry->type		= TRACE_CTX; -	entry->ctx.prev_pid	= prev->pid; -	entry->ctx.prev_prio	= prev->prio; -	entry->ctx.prev_state	= prev->state; -	entry->ctx.next_pid	= next->pid; -	entry->ctx.next_prio	= next->prio; -	entry->ctx.next_state	= next->state; -	__trace_stack(tr, data, flags, 5); -	__raw_spin_unlock(&data->lock); -	raw_local_irq_restore(irq_flags); +	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +					   &irq_flags); +	if (!event) +		return; +	entry	= ring_buffer_event_data(event); +	tracing_generic_entry_update(&entry->ent, flags, pc); +	entry->ent.type			= TRACE_CTX; +	entry->prev_pid			= prev->pid; +	entry->prev_prio		= prev->prio; +	entry->prev_state		= prev->state; +	entry->next_pid			= next->pid; +	entry->next_prio		= next->prio; +	
entry->next_state		= next->state; +	entry->next_cpu	= task_cpu(next); +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +	ftrace_trace_stack(tr, data, flags, 5, pc);  }  void @@ -974,25 +803,28 @@ tracing_sched_wakeup_trace(struct trace_array *tr,  			   struct trace_array_cpu *data,  			   struct task_struct *wakee,  			   struct task_struct *curr, -			   unsigned long flags) +			   unsigned long flags, int pc)  { -	struct trace_entry *entry; +	struct ring_buffer_event *event; +	struct ctx_switch_entry *entry;  	unsigned long irq_flags; -	raw_local_irq_save(irq_flags); -	__raw_spin_lock(&data->lock); -	entry			= tracing_get_trace_entry(tr, data); -	tracing_generic_entry_update(entry, flags); -	entry->type		= TRACE_WAKE; -	entry->ctx.prev_pid	= curr->pid; -	entry->ctx.prev_prio	= curr->prio; -	entry->ctx.prev_state	= curr->state; -	entry->ctx.next_pid	= wakee->pid; -	entry->ctx.next_prio	= wakee->prio; -	entry->ctx.next_state	= wakee->state; -	__trace_stack(tr, data, flags, 6); -	__raw_spin_unlock(&data->lock); -	raw_local_irq_restore(irq_flags); +	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +					   &irq_flags); +	if (!event) +		return; +	entry	= ring_buffer_event_data(event); +	tracing_generic_entry_update(&entry->ent, flags, pc); +	entry->ent.type			= TRACE_WAKE; +	entry->prev_pid			= curr->pid; +	entry->prev_prio		= curr->prio; +	entry->prev_state		= curr->state; +	entry->next_pid			= wakee->pid; +	entry->next_prio		= wakee->prio; +	entry->next_state		= wakee->state; +	entry->next_cpu			= task_cpu(wakee); +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +	ftrace_trace_stack(tr, data, flags, 6, pc);  	trace_wake_up();  } @@ -1002,23 +834,21 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)  {  	struct trace_array *tr = &global_trace;  	struct trace_array_cpu *data; -	unsigned long flags; -	long disabled;  	int cpu; +	int pc; -	if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) +	if (tracing_disabled || !tr->ctrl)  		return; -	local_irq_save(flags); +	pc = preempt_count(); +	preempt_disable_notrace();  	cpu = raw_smp_processor_id();  	data = tr->data[cpu]; -	disabled = atomic_inc_return(&data->disabled); -	if (likely(disabled == 1)) -		__trace_special(tr, data, arg1, arg2, arg3); +	if (likely(!atomic_read(&data->disabled))) +		ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); -	atomic_dec(&data->disabled); -	local_irq_restore(flags); +	preempt_enable_notrace();  }  #ifdef CONFIG_FTRACE @@ -1029,7 +859,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)  	struct trace_array_cpu *data;  	unsigned long flags;  	long disabled; -	int cpu; +	int cpu, resched; +	int pc;  	if (unlikely(!ftrace_function_enabled))  		return; @@ -1037,16 +868,22 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)  	if (skip_trace(ip))  		return; -	local_irq_save(flags); +	pc = preempt_count(); +	resched = need_resched(); +	preempt_disable_notrace(); +	local_save_flags(flags);  	cpu = raw_smp_processor_id();  	data = tr->data[cpu];  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) -		trace_function(tr, data, ip, parent_ip, flags); +		trace_function(tr, data, ip, parent_ip, flags, pc);  	atomic_dec(&data->disabled); -	local_irq_restore(flags); +	if (resched) +		preempt_enable_no_resched_notrace(); +	else +		preempt_enable_notrace();  }  static struct ftrace_ops trace_ops __read_mostly = @@ -1073,111 +910,96 @@ enum trace_file_type {  	TRACE_FILE_LAT_FMT	= 1,  }; -static struct 
trace_entry * -trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, -		struct trace_iterator *iter, int cpu) +static void trace_iterator_increment(struct trace_iterator *iter, int cpu)  { -	struct page *page; -	struct trace_entry *array; +	/* Don't allow ftrace to trace into the ring buffers */ +	ftrace_disable_cpu(); -	if (iter->next_idx[cpu] >= tr->entries || -	    iter->next_idx[cpu] >= data->trace_idx || -	    (data->trace_head == data->trace_tail && -	     data->trace_head_idx == data->trace_tail_idx)) -		return NULL; +	iter->idx++; +	if (iter->buffer_iter[iter->cpu]) +		ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); -	if (!iter->next_page[cpu]) { -		/* Initialize the iterator for this cpu trace buffer */ -		WARN_ON(!data->trace_tail); -		page = virt_to_page(data->trace_tail); -		iter->next_page[cpu] = &page->lru; -		iter->next_page_idx[cpu] = data->trace_tail_idx; -	} +	ftrace_enable_cpu(); +} -	page = list_entry(iter->next_page[cpu], struct page, lru); -	BUG_ON(&data->trace_pages == &page->lru); +static struct trace_entry * +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) +{ +	struct ring_buffer_event *event; +	struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; -	array = page_address(page); +	/* Don't allow ftrace to trace into the ring buffers */ +	ftrace_disable_cpu(); -	WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); -	return &array[iter->next_page_idx[cpu]]; +	if (buf_iter) +		event = ring_buffer_iter_peek(buf_iter, ts); +	else +		event = ring_buffer_peek(iter->tr->buffer, cpu, ts); + +	ftrace_enable_cpu(); + +	return event ? ring_buffer_event_data(event) : NULL;  }  static struct trace_entry * -find_next_entry(struct trace_iterator *iter, int *ent_cpu) +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)  { -	struct trace_array *tr = iter->tr; +	struct ring_buffer *buffer = iter->tr->buffer;  	struct trace_entry *ent, *next = NULL; +	u64 next_ts = 0, ts;  	int next_cpu = -1;  	int cpu;  	for_each_tracing_cpu(cpu) { -		if (!head_page(tr->data[cpu])) + +		if (ring_buffer_empty_cpu(buffer, cpu))  			continue; -		ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); + +		ent = peek_next_entry(iter, cpu, &ts); +  		/*  		 * Pick the entry with the smallest timestamp:  		 */ -		if (ent && (!next || ent->t < next->t)) { +		if (ent && (!next || ts < next_ts)) {  			next = ent;  			next_cpu = cpu; +			next_ts = ts;  		}  	}  	if (ent_cpu)  		*ent_cpu = next_cpu; +	if (ent_ts) +		*ent_ts = next_ts; +  	return next;  } -static void trace_iterator_increment(struct trace_iterator *iter) +/* Find the next real entry, without updating the iterator itself */ +static struct trace_entry * +find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)  { -	iter->idx++; -	iter->next_idx[iter->cpu]++; -	iter->next_page_idx[iter->cpu]++; - -	if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { -		struct trace_array_cpu *data = iter->tr->data[iter->cpu]; - -		iter->next_page_idx[iter->cpu] = 0; -		iter->next_page[iter->cpu] = -			trace_next_list(data, iter->next_page[iter->cpu]); -	} +	return __find_next_entry(iter, ent_cpu, ent_ts);  } -static void trace_consume(struct trace_iterator *iter) +/* Find the next real entry, and increment the iterator to the next entry */ +static void *find_next_entry_inc(struct trace_iterator *iter)  { -	struct trace_array_cpu *data = iter->tr->data[iter->cpu]; +	iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); -	data->trace_tail_idx++; -	if (data->trace_tail_idx >= 
ENTRIES_PER_PAGE) { -		data->trace_tail = trace_next_page(data, data->trace_tail); -		data->trace_tail_idx = 0; -	} +	if (iter->ent) +		trace_iterator_increment(iter, iter->cpu); -	/* Check if we empty it, then reset the index */ -	if (data->trace_head == data->trace_tail && -	    data->trace_head_idx == data->trace_tail_idx) -		data->trace_idx = 0; +	return iter->ent ? iter : NULL;  } -static void *find_next_entry_inc(struct trace_iterator *iter) +static void trace_consume(struct trace_iterator *iter)  { -	struct trace_entry *next; -	int next_cpu = -1; - -	next = find_next_entry(iter, &next_cpu); - -	iter->prev_ent = iter->ent; -	iter->prev_cpu = iter->cpu; - -	iter->ent = next; -	iter->cpu = next_cpu; - -	if (next) -		trace_iterator_increment(iter); - -	return next ? iter : NULL; +	/* Don't allow ftrace to trace into the ring buffers */ +	ftrace_disable_cpu(); +	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); +	ftrace_enable_cpu();  }  static void *s_next(struct seq_file *m, void *v, loff_t *pos) @@ -1210,7 +1032,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)  	struct trace_iterator *iter = m->private;  	void *p = NULL;  	loff_t l = 0; -	int i; +	int cpu;  	mutex_lock(&trace_types_lock); @@ -1229,14 +1051,15 @@ static void *s_start(struct seq_file *m, loff_t *pos)  		iter->ent = NULL;  		iter->cpu = 0;  		iter->idx = -1; -		iter->prev_ent = NULL; -		iter->prev_cpu = -1; -		for_each_tracing_cpu(i) { -			iter->next_idx[i] = 0; -			iter->next_page[i] = NULL; +		ftrace_disable_cpu(); + +		for_each_tracing_cpu(cpu) { +			ring_buffer_iter_reset(iter->buffer_iter[cpu]);  		} +		ftrace_enable_cpu(); +  		for (p = iter; p && l < *pos; p = s_next(m, p, &l))  			; @@ -1330,21 +1153,21 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)  static void print_lat_help_header(struct seq_file *m)  { -	seq_puts(m, "#                _------=> CPU#            \n"); -	seq_puts(m, "#               / _-----=> irqs-off        \n"); -	seq_puts(m, "#              | / _----=> need-resched    \n"); -	seq_puts(m, "#              || / _---=> hardirq/softirq \n"); -	seq_puts(m, "#              ||| / _--=> preempt-depth   \n"); -	seq_puts(m, "#              |||| /                      \n"); -	seq_puts(m, "#              |||||     delay             \n"); -	seq_puts(m, "#  cmd     pid ||||| time  |   caller      \n"); -	seq_puts(m, "#     \\   /    |||||   \\   |   /           \n"); +	seq_puts(m, "#                  _------=> CPU#            \n"); +	seq_puts(m, "#                 / _-----=> irqs-off        \n"); +	seq_puts(m, "#                | / _----=> need-resched    \n"); +	seq_puts(m, "#                || / _---=> hardirq/softirq \n"); +	seq_puts(m, "#                ||| / _--=> preempt-depth   \n"); +	seq_puts(m, "#                |||| /                      \n"); +	seq_puts(m, "#                |||||     delay             \n"); +	seq_puts(m, "#  cmd     pid   ||||| time  |   caller      \n"); +	seq_puts(m, "#     \\   /      |||||   \\   |   /           \n");  }  static void print_func_help_header(struct seq_file *m)  { -	seq_puts(m, "#           TASK-PID   CPU#    TIMESTAMP  FUNCTION\n"); -	seq_puts(m, "#              | |      |          |         |\n"); +	seq_puts(m, "#           TASK-PID    CPU#    TIMESTAMP  FUNCTION\n"); +	seq_puts(m, "#              | |       |          |         |\n");  } @@ -1355,23 +1178,16 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)  	struct trace_array *tr = iter->tr;  	struct trace_array_cpu *data = 
tr->data[tr->cpu];  	struct tracer *type = current_trace; -	unsigned long total   = 0; -	unsigned long entries = 0; -	int cpu; +	unsigned long total; +	unsigned long entries;  	const char *name = "preemption";  	if (type)  		name = type->name; -	for_each_tracing_cpu(cpu) { -		if (head_page(tr->data[cpu])) { -			total += tr->data[cpu]->trace_idx; -			if (tr->data[cpu]->trace_idx > tr->entries) -				entries += tr->entries; -			else -				entries += tr->data[cpu]->trace_idx; -		} -	} +	entries = ring_buffer_entries(iter->tr->buffer); +	total = entries + +		ring_buffer_overruns(iter->tr->buffer);  	seq_printf(m, "%s latency trace v1.1.5 on %s\n",  		   name, UTS_RELEASE); @@ -1428,7 +1244,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)  	comm = trace_find_cmdline(entry->pid);  	trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); -	trace_seq_printf(s, "%d", cpu); +	trace_seq_printf(s, "%3d", cpu);  	trace_seq_printf(s, "%c%c",  			(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',  			((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); @@ -1457,7 +1273,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)  unsigned long preempt_mark_thresh = 100;  static void -lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, +lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,  		    unsigned long rel_usecs)  {  	trace_seq_printf(s, " %4lldus", abs_usecs); @@ -1471,34 +1287,76 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,  static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; -static int +/* + * The message is supposed to contain an ending newline. + * If the printing stops prematurely, try to add a newline of our own. + */ +void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) +{ +	struct trace_entry *ent; +	struct trace_field_cont *cont; +	bool ok = true; + +	ent = peek_next_entry(iter, iter->cpu, NULL); +	if (!ent || ent->type != TRACE_CONT) { +		trace_seq_putc(s, '\n'); +		return; +	} + +	do { +		cont = (struct trace_field_cont *)ent; +		if (ok) +			ok = (trace_seq_printf(s, "%s", cont->buf) > 0); + +		ftrace_disable_cpu(); + +		if (iter->buffer_iter[iter->cpu]) +			ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); +		else +			ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + +		ftrace_enable_cpu(); + +		ent = peek_next_entry(iter, iter->cpu, NULL); +	} while (ent && ent->type == TRACE_CONT); + +	if (!ok) +		trace_seq_putc(s, '\n'); +} + +static enum print_line_t  print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)  {  	struct trace_seq *s = &iter->seq;  	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); -	struct trace_entry *next_entry = find_next_entry(iter, NULL); +	struct trace_entry *next_entry;  	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);  	struct trace_entry *entry = iter->ent;  	unsigned long abs_usecs;  	unsigned long rel_usecs; +	u64 next_ts;  	char *comm;  	int S, T;  	int i;  	unsigned state; +	if (entry->type == TRACE_CONT) +		return TRACE_TYPE_HANDLED; + +	next_entry = find_next_entry(iter, NULL, &next_ts);  	if (!next_entry) -		next_entry = entry; -	rel_usecs = ns2usecs(next_entry->t - entry->t); -	abs_usecs = ns2usecs(entry->t - iter->tr->time_start); +		next_ts = iter->ts; +	rel_usecs = ns2usecs(next_ts - iter->ts); +	abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);  	if (verbose) {  		comm = trace_find_cmdline(entry->pid); -		trace_seq_printf(s, "%16s %5d %d %d %08x %08x 
[%08lx]" +		trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"  				 " %ld.%03ldms (+%ld.%03ldms): ",  				 comm,  				 entry->pid, cpu, entry->flags,  				 entry->preempt_count, trace_idx, -				 ns2usecs(entry->t), +				 ns2usecs(iter->ts),  				 abs_usecs/1000,  				 abs_usecs % 1000, rel_usecs/1000,  				 rel_usecs % 1000); @@ -1507,52 +1365,85 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)  		lat_print_timestamp(s, abs_usecs, rel_usecs);  	}  	switch (entry->type) { -	case TRACE_FN: -		seq_print_ip_sym(s, entry->fn.ip, sym_flags); +	case TRACE_FN: { +		struct ftrace_entry *field; + +		trace_assign_type(field, entry); + +		seq_print_ip_sym(s, field->ip, sym_flags);  		trace_seq_puts(s, " ("); -		if (kretprobed(entry->fn.parent_ip)) +		if (kretprobed(field->parent_ip))  			trace_seq_puts(s, KRETPROBE_MSG);  		else -			seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); +			seq_print_ip_sym(s, field->parent_ip, sym_flags);  		trace_seq_puts(s, ")\n");  		break; +	}  	case TRACE_CTX: -	case TRACE_WAKE: -		T = entry->ctx.next_state < sizeof(state_to_char) ? -			state_to_char[entry->ctx.next_state] : 'X'; +	case TRACE_WAKE: { +		struct ctx_switch_entry *field; + +		trace_assign_type(field, entry); -		state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; +		T = field->next_state < sizeof(state_to_char) ? +			state_to_char[field->next_state] : 'X'; + +		state = field->prev_state ? +			__ffs(field->prev_state) + 1 : 0;  		S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; -		comm = trace_find_cmdline(entry->ctx.next_pid); -		trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", -				 entry->ctx.prev_pid, -				 entry->ctx.prev_prio, +		comm = trace_find_cmdline(field->next_pid); +		trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", +				 field->prev_pid, +				 field->prev_prio,  				 S, entry->type == TRACE_CTX ? 
"==>" : "  +", -				 entry->ctx.next_pid, -				 entry->ctx.next_prio, +				 field->next_cpu, +				 field->next_pid, +				 field->next_prio,  				 T, comm);  		break; -	case TRACE_SPECIAL: +	} +	case TRACE_SPECIAL: { +		struct special_entry *field; + +		trace_assign_type(field, entry); +  		trace_seq_printf(s, "# %ld %ld %ld\n", -				 entry->special.arg1, -				 entry->special.arg2, -				 entry->special.arg3); +				 field->arg1, +				 field->arg2, +				 field->arg3);  		break; -	case TRACE_STACK: +	} +	case TRACE_STACK: { +		struct stack_entry *field; + +		trace_assign_type(field, entry); +  		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {  			if (i)  				trace_seq_puts(s, " <= "); -			seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); +			seq_print_ip_sym(s, field->caller[i], sym_flags);  		}  		trace_seq_puts(s, "\n");  		break; +	} +	case TRACE_PRINT: { +		struct print_entry *field; + +		trace_assign_type(field, entry); + +		seq_print_ip_sym(s, field->ip, sym_flags); +		trace_seq_printf(s, ": %s", field->buf); +		if (entry->flags & TRACE_FLAG_CONT) +			trace_seq_print_cont(s, iter); +		break; +	}  	default:  		trace_seq_printf(s, "Unknown type %d\n", entry->type);  	} -	return 1; +	return TRACE_TYPE_HANDLED;  } -static int print_trace_fmt(struct trace_iterator *iter) +static enum print_line_t print_trace_fmt(struct trace_iterator *iter)  {  	struct trace_seq *s = &iter->seq;  	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1567,90 +1458,126 @@ static int print_trace_fmt(struct trace_iterator *iter)  	entry = iter->ent; +	if (entry->type == TRACE_CONT) +		return TRACE_TYPE_HANDLED; +  	comm = trace_find_cmdline(iter->ent->pid); -	t = ns2usecs(entry->t); +	t = ns2usecs(iter->ts);  	usec_rem = do_div(t, 1000000ULL);  	secs = (unsigned long)t;  	ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);  	if (!ret) -		return 0; -	ret = trace_seq_printf(s, "[%02d] ", iter->cpu); +		return TRACE_TYPE_PARTIAL_LINE; +	ret = trace_seq_printf(s, "[%03d] ", iter->cpu);  	if (!ret) -		return 0; +		return TRACE_TYPE_PARTIAL_LINE;  	ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);  	if (!ret) -		return 0; +		return TRACE_TYPE_PARTIAL_LINE;  	switch (entry->type) { -	case TRACE_FN: -		ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); +	case TRACE_FN: { +		struct ftrace_entry *field; + +		trace_assign_type(field, entry); + +		ret = seq_print_ip_sym(s, field->ip, sym_flags);  		if (!ret) -			return 0; +			return TRACE_TYPE_PARTIAL_LINE;  		if ((sym_flags & TRACE_ITER_PRINT_PARENT) && -						entry->fn.parent_ip) { +						field->parent_ip) {  			ret = trace_seq_printf(s, " <-");  			if (!ret) -				return 0; -			if (kretprobed(entry->fn.parent_ip)) +				return TRACE_TYPE_PARTIAL_LINE; +			if (kretprobed(field->parent_ip))  				ret = trace_seq_puts(s, KRETPROBE_MSG);  			else -				ret = seq_print_ip_sym(s, entry->fn.parent_ip, +				ret = seq_print_ip_sym(s, +						       field->parent_ip,  						       sym_flags);  			if (!ret) -				return 0; +				return TRACE_TYPE_PARTIAL_LINE;  		}  		ret = trace_seq_printf(s, "\n");  		if (!ret) -			return 0; +			return TRACE_TYPE_PARTIAL_LINE;  		break; +	}  	case TRACE_CTX: -	case TRACE_WAKE: -		S = entry->ctx.prev_state < sizeof(state_to_char) ? -			state_to_char[entry->ctx.prev_state] : 'X'; -		T = entry->ctx.next_state < sizeof(state_to_char) ? 
-			state_to_char[entry->ctx.next_state] : 'X'; -		ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", -				       entry->ctx.prev_pid, -				       entry->ctx.prev_prio, +	case TRACE_WAKE: { +		struct ctx_switch_entry *field; + +		trace_assign_type(field, entry); + +		S = field->prev_state < sizeof(state_to_char) ? +			state_to_char[field->prev_state] : 'X'; +		T = field->next_state < sizeof(state_to_char) ? +			state_to_char[field->next_state] : 'X'; +		ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", +				       field->prev_pid, +				       field->prev_prio,  				       S,  				       entry->type == TRACE_CTX ? "==>" : "  +", -				       entry->ctx.next_pid, -				       entry->ctx.next_prio, +				       field->next_cpu, +				       field->next_pid, +				       field->next_prio,  				       T);  		if (!ret) -			return 0; +			return TRACE_TYPE_PARTIAL_LINE;  		break; -	case TRACE_SPECIAL: +	} +	case TRACE_SPECIAL: { +		struct special_entry *field; + +		trace_assign_type(field, entry); +  		ret = trace_seq_printf(s, "# %ld %ld %ld\n", -				 entry->special.arg1, -				 entry->special.arg2, -				 entry->special.arg3); +				 field->arg1, +				 field->arg2, +				 field->arg3);  		if (!ret) -			return 0; +			return TRACE_TYPE_PARTIAL_LINE;  		break; -	case TRACE_STACK: +	} +	case TRACE_STACK: { +		struct stack_entry *field; + +		trace_assign_type(field, entry); +  		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {  			if (i) {  				ret = trace_seq_puts(s, " <= ");  				if (!ret) -					return 0; +					return TRACE_TYPE_PARTIAL_LINE;  			} -			ret = seq_print_ip_sym(s, entry->stack.caller[i], +			ret = seq_print_ip_sym(s, field->caller[i],  					       sym_flags);  			if (!ret) -				return 0; +				return TRACE_TYPE_PARTIAL_LINE;  		}  		ret = trace_seq_puts(s, "\n");  		if (!ret) -			return 0; +			return TRACE_TYPE_PARTIAL_LINE;  		break;  	} -	return 1; +	case TRACE_PRINT: { +		struct print_entry *field; + +		trace_assign_type(field, entry); + +		seq_print_ip_sym(s, field->ip, sym_flags); +		trace_seq_printf(s, ": %s", field->buf); +		if (entry->flags & TRACE_FLAG_CONT) +			trace_seq_print_cont(s, iter); +		break; +	} +	} +	return TRACE_TYPE_HANDLED;  } -static int print_raw_fmt(struct trace_iterator *iter) +static enum print_line_t print_raw_fmt(struct trace_iterator *iter)  {  	struct trace_seq *s = &iter->seq;  	struct trace_entry *entry; @@ -1659,47 +1586,77 @@ static int print_raw_fmt(struct trace_iterator *iter)  	entry = iter->ent; +	if (entry->type == TRACE_CONT) +		return TRACE_TYPE_HANDLED; +  	ret = trace_seq_printf(s, "%d %d %llu ", -		entry->pid, iter->cpu, entry->t); +		entry->pid, iter->cpu, iter->ts);  	if (!ret) -		return 0; +		return TRACE_TYPE_PARTIAL_LINE;  	switch (entry->type) { -	case TRACE_FN: +	case TRACE_FN: { +		struct ftrace_entry *field; + +		trace_assign_type(field, entry); +  		ret = trace_seq_printf(s, "%x %x\n", -					entry->fn.ip, entry->fn.parent_ip); +					field->ip, +					field->parent_ip);  		if (!ret) -			return 0; +			return TRACE_TYPE_PARTIAL_LINE;  		break; +	}  	case TRACE_CTX: -	case TRACE_WAKE: -		S = entry->ctx.prev_state < sizeof(state_to_char) ? -			state_to_char[entry->ctx.prev_state] : 'X'; -		T = entry->ctx.next_state < sizeof(state_to_char) ? -			state_to_char[entry->ctx.next_state] : 'X'; +	case TRACE_WAKE: { +		struct ctx_switch_entry *field; + +		trace_assign_type(field, entry); + +		S = field->prev_state < sizeof(state_to_char) ? +			state_to_char[field->prev_state] : 'X'; +		T = field->next_state < sizeof(state_to_char) ? 
+			state_to_char[field->next_state] : 'X';  		if (entry->type == TRACE_WAKE)  			S = '+'; -		ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", -				       entry->ctx.prev_pid, -				       entry->ctx.prev_prio, +		ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", +				       field->prev_pid, +				       field->prev_prio,  				       S, -				       entry->ctx.next_pid, -				       entry->ctx.next_prio, +				       field->next_cpu, +				       field->next_pid, +				       field->next_prio,  				       T);  		if (!ret) -			return 0; +			return TRACE_TYPE_PARTIAL_LINE;  		break; +	}  	case TRACE_SPECIAL: -	case TRACE_STACK: +	case TRACE_STACK: { +		struct special_entry *field; + +		trace_assign_type(field, entry); +  		ret = trace_seq_printf(s, "# %ld %ld %ld\n", -				 entry->special.arg1, -				 entry->special.arg2, -				 entry->special.arg3); +				 field->arg1, +				 field->arg2, +				 field->arg3);  		if (!ret) -			return 0; +			return TRACE_TYPE_PARTIAL_LINE;  		break;  	} -	return 1; +	case TRACE_PRINT: { +		struct print_entry *field; + +		trace_assign_type(field, entry); + +		trace_seq_printf(s, "# %lx %s", field->ip, field->buf); +		if (entry->flags & TRACE_FLAG_CONT) +			trace_seq_print_cont(s, iter); +		break; +	} +	} +	return TRACE_TYPE_HANDLED;  }  #define SEQ_PUT_FIELD_RET(s, x)				\ @@ -1710,11 +1667,12 @@ do {							\  #define SEQ_PUT_HEX_FIELD_RET(s, x)			\  do {							\ +	BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES);	\  	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\  		return 0;				\  } while (0) -static int print_hex_fmt(struct trace_iterator *iter) +static enum print_line_t print_hex_fmt(struct trace_iterator *iter)  {  	struct trace_seq *s = &iter->seq;  	unsigned char newline = '\n'; @@ -1723,97 +1681,139 @@ static int print_hex_fmt(struct trace_iterator *iter)  	entry = iter->ent; +	if (entry->type == TRACE_CONT) +		return TRACE_TYPE_HANDLED; +  	SEQ_PUT_HEX_FIELD_RET(s, entry->pid);  	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); -	SEQ_PUT_HEX_FIELD_RET(s, entry->t); +	SEQ_PUT_HEX_FIELD_RET(s, iter->ts);  	switch (entry->type) { -	case TRACE_FN: -		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); -		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); +	case TRACE_FN: { +		struct ftrace_entry *field; + +		trace_assign_type(field, entry); + +		SEQ_PUT_HEX_FIELD_RET(s, field->ip); +		SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);  		break; +	}  	case TRACE_CTX: -	case TRACE_WAKE: -		S = entry->ctx.prev_state < sizeof(state_to_char) ? -			state_to_char[entry->ctx.prev_state] : 'X'; -		T = entry->ctx.next_state < sizeof(state_to_char) ? -			state_to_char[entry->ctx.next_state] : 'X'; +	case TRACE_WAKE: { +		struct ctx_switch_entry *field; + +		trace_assign_type(field, entry); + +		S = field->prev_state < sizeof(state_to_char) ? +			state_to_char[field->prev_state] : 'X'; +		T = field->next_state < sizeof(state_to_char) ? 
+			state_to_char[field->next_state] : 'X';  		if (entry->type == TRACE_WAKE)  			S = '+'; -		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); -		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); +		SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); +		SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);  		SEQ_PUT_HEX_FIELD_RET(s, S); -		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); -		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); -		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); +		SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); +		SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); +		SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);  		SEQ_PUT_HEX_FIELD_RET(s, T);  		break; +	}  	case TRACE_SPECIAL: -	case TRACE_STACK: -		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); -		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); -		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); +	case TRACE_STACK: { +		struct special_entry *field; + +		trace_assign_type(field, entry); + +		SEQ_PUT_HEX_FIELD_RET(s, field->arg1); +		SEQ_PUT_HEX_FIELD_RET(s, field->arg2); +		SEQ_PUT_HEX_FIELD_RET(s, field->arg3);  		break;  	} +	}  	SEQ_PUT_FIELD_RET(s, newline); -	return 1; +	return TRACE_TYPE_HANDLED;  } -static int print_bin_fmt(struct trace_iterator *iter) +static enum print_line_t print_bin_fmt(struct trace_iterator *iter)  {  	struct trace_seq *s = &iter->seq;  	struct trace_entry *entry;  	entry = iter->ent; +	if (entry->type == TRACE_CONT) +		return TRACE_TYPE_HANDLED; +  	SEQ_PUT_FIELD_RET(s, entry->pid); -	SEQ_PUT_FIELD_RET(s, entry->cpu); -	SEQ_PUT_FIELD_RET(s, entry->t); +	SEQ_PUT_FIELD_RET(s, iter->cpu); +	SEQ_PUT_FIELD_RET(s, iter->ts);  	switch (entry->type) { -	case TRACE_FN: -		SEQ_PUT_FIELD_RET(s, entry->fn.ip); -		SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); +	case TRACE_FN: { +		struct ftrace_entry *field; + +		trace_assign_type(field, entry); + +		SEQ_PUT_FIELD_RET(s, field->ip); +		SEQ_PUT_FIELD_RET(s, field->parent_ip);  		break; -	case TRACE_CTX: -		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); -		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); -		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); -		SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); -		SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); -		SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); +	} +	case TRACE_CTX: { +		struct ctx_switch_entry *field; + +		trace_assign_type(field, entry); + +		SEQ_PUT_FIELD_RET(s, field->prev_pid); +		SEQ_PUT_FIELD_RET(s, field->prev_prio); +		SEQ_PUT_FIELD_RET(s, field->prev_state); +		SEQ_PUT_FIELD_RET(s, field->next_pid); +		SEQ_PUT_FIELD_RET(s, field->next_prio); +		SEQ_PUT_FIELD_RET(s, field->next_state);  		break; +	}  	case TRACE_SPECIAL: -	case TRACE_STACK: -		SEQ_PUT_FIELD_RET(s, entry->special.arg1); -		SEQ_PUT_FIELD_RET(s, entry->special.arg2); -		SEQ_PUT_FIELD_RET(s, entry->special.arg3); +	case TRACE_STACK: { +		struct special_entry *field; + +		trace_assign_type(field, entry); + +		SEQ_PUT_FIELD_RET(s, field->arg1); +		SEQ_PUT_FIELD_RET(s, field->arg2); +		SEQ_PUT_FIELD_RET(s, field->arg3);  		break;  	} +	}  	return 1;  }  static int trace_empty(struct trace_iterator *iter)  { -	struct trace_array_cpu *data;  	int cpu;  	for_each_tracing_cpu(cpu) { -		data = iter->tr->data[cpu]; - -		if (head_page(data) && data->trace_idx && -		    (data->trace_tail != data->trace_head || -		     data->trace_tail_idx != data->trace_head_idx)) -			return 0; +		if (iter->buffer_iter[cpu]) { +			if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) +				return 0; +		} else { +			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) +				return 0; +		}  	} +  	return 1;  } 
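/*
 * [Editor's note -- not part of the commit.]  The hunks above convert the
 * trace iterator from the old page-list walker to the ring-buffer API added
 * in kernel/trace/ring_buffer.c earlier in this diff.  Below is a minimal
 * sketch of a non-consuming reader built on that API; the function name
 * my_dump_cpu and the printk formatting are illustrative only, and the code
 * assumes <linux/ring_buffer.h> plus the local trace.h for struct trace_entry.
 */
static void my_dump_cpu(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_iter *iter;
	struct ring_buffer_event *event;
	u64 ts;

	/* Starts the iteration and disables recording on this CPU buffer. */
	iter = ring_buffer_read_start(buffer, cpu);
	if (!iter)
		return;

	/* Each call returns the next event and advances the iterator. */
	while ((event = ring_buffer_read(iter, &ts))) {
		struct trace_entry *ent = ring_buffer_event_data(event);

		printk(KERN_INFO "cpu%d pid %d type %d ts %llu\n",
		       cpu, ent->pid, ent->type, (unsigned long long)ts);
	}

	/* Re-enables recording and frees the iterator. */
	ring_buffer_read_finish(iter);
}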
-static int print_trace_line(struct trace_iterator *iter) +static enum print_line_t print_trace_line(struct trace_iterator *iter)  { -	if (iter->trace && iter->trace->print_line) -		return iter->trace->print_line(iter); +	enum print_line_t ret; + +	if (iter->trace && iter->trace->print_line) { +		ret = iter->trace->print_line(iter); +		if (ret != TRACE_TYPE_UNHANDLED) +			return ret; +	}  	if (trace_flags & TRACE_ITER_BIN)  		return print_bin_fmt(iter); @@ -1869,6 +1869,8 @@ static struct trace_iterator *  __tracing_open(struct inode *inode, struct file *file, int *ret)  {  	struct trace_iterator *iter; +	struct seq_file *m; +	int cpu;  	if (tracing_disabled) {  		*ret = -ENODEV; @@ -1889,28 +1891,45 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)  	iter->trace = current_trace;  	iter->pos = -1; +	for_each_tracing_cpu(cpu) { + +		iter->buffer_iter[cpu] = +			ring_buffer_read_start(iter->tr->buffer, cpu); + +		if (!iter->buffer_iter[cpu]) +			goto fail_buffer; +	} +  	/* TODO stop tracer */  	*ret = seq_open(file, &tracer_seq_ops); -	if (!*ret) { -		struct seq_file *m = file->private_data; -		m->private = iter; +	if (*ret) +		goto fail_buffer; -		/* stop the trace while dumping */ -		if (iter->tr->ctrl) { -			tracer_enabled = 0; -			ftrace_function_enabled = 0; -		} +	m = file->private_data; +	m->private = iter; -		if (iter->trace && iter->trace->open) -			iter->trace->open(iter); -	} else { -		kfree(iter); -		iter = NULL; +	/* stop the trace while dumping */ +	if (iter->tr->ctrl) { +		tracer_enabled = 0; +		ftrace_function_enabled = 0;  	} + +	if (iter->trace && iter->trace->open) +			iter->trace->open(iter); +  	mutex_unlock(&trace_types_lock);   out:  	return iter; + + fail_buffer: +	for_each_tracing_cpu(cpu) { +		if (iter->buffer_iter[cpu]) +			ring_buffer_read_finish(iter->buffer_iter[cpu]); +	} +	mutex_unlock(&trace_types_lock); + +	return ERR_PTR(-ENOMEM);  }  int tracing_open_generic(struct inode *inode, struct file *filp) @@ -1926,8 +1945,14 @@ int tracing_release(struct inode *inode, struct file *file)  {  	struct seq_file *m = (struct seq_file *)file->private_data;  	struct trace_iterator *iter = m->private; +	int cpu;  	mutex_lock(&trace_types_lock); +	for_each_tracing_cpu(cpu) { +		if (iter->buffer_iter[cpu]) +			ring_buffer_read_finish(iter->buffer_iter[cpu]); +	} +  	if (iter->trace && iter->trace->close)  		iter->trace->close(iter); @@ -2352,9 +2377,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,  	struct tracer *t;  	char buf[max_tracer_type_len+1];  	int i; +	size_t ret;  	if (cnt > max_tracer_type_len)  		cnt = max_tracer_type_len; +	ret = cnt;  	if (copy_from_user(&buf, ubuf, cnt))  		return -EFAULT; @@ -2370,7 +2397,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,  		if (strcmp(t->name, buf) == 0)  			break;  	} -	if (!t || t == current_trace) +	if (!t) { +		ret = -EINVAL; +		goto out; +	} +	if (t == current_trace)  		goto out;  	if (current_trace && current_trace->reset) @@ -2383,9 +2414,10 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,   out:  	mutex_unlock(&trace_types_lock); -	filp->f_pos += cnt; +	if (ret == cnt) +		filp->f_pos += cnt; -	return cnt; +	return ret;  }  static ssize_t @@ -2500,20 +2532,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  		  size_t cnt, loff_t *ppos)  {  	struct trace_iterator *iter = filp->private_data; -	struct trace_array_cpu *data; -	static cpumask_t mask; -	unsigned long flags; -#ifdef CONFIG_FTRACE -	int ftrace_save; -#endif 
-	int cpu;  	ssize_t sret;  	/* return any leftover data */  	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);  	if (sret != -EBUSY)  		return sret; -	sret = 0;  	trace_seq_reset(&iter->seq); @@ -2524,6 +2548,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  			goto out;  	} +waitagain: +	sret = 0;  	while (trace_empty(iter)) {  		if ((filp->f_flags & O_NONBLOCK)) { @@ -2588,46 +2614,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  	       offsetof(struct trace_iterator, seq));  	iter->pos = -1; -	/* -	 * We need to stop all tracing on all CPUS to read the -	 * the next buffer. This is a bit expensive, but is -	 * not done often. We fill all what we can read, -	 * and then release the locks again. -	 */ - -	cpus_clear(mask); -	local_irq_save(flags); -#ifdef CONFIG_FTRACE -	ftrace_save = ftrace_enabled; -	ftrace_enabled = 0; -#endif -	smp_wmb(); -	for_each_tracing_cpu(cpu) { -		data = iter->tr->data[cpu]; - -		if (!head_page(data) || !data->trace_idx) -			continue; - -		atomic_inc(&data->disabled); -		cpu_set(cpu, mask); -	} - -	for_each_cpu_mask(cpu, mask) { -		data = iter->tr->data[cpu]; -		__raw_spin_lock(&data->lock); - -		if (data->overrun > iter->last_overrun[cpu]) -			iter->overrun[cpu] += -				data->overrun - iter->last_overrun[cpu]; -		iter->last_overrun[cpu] = data->overrun; -	} -  	while (find_next_entry_inc(iter) != NULL) { -		int ret; +		enum print_line_t ret;  		int len = iter->seq.len;  		ret = print_trace_line(iter); -		if (!ret) { +		if (ret == TRACE_TYPE_PARTIAL_LINE) {  			/* don't print partial lines */  			iter->seq.len = len;  			break; @@ -2639,26 +2631,17 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  			break;  	} -	for_each_cpu_mask(cpu, mask) { -		data = iter->tr->data[cpu]; -		__raw_spin_unlock(&data->lock); -	} - -	for_each_cpu_mask(cpu, mask) { -		data = iter->tr->data[cpu]; -		atomic_dec(&data->disabled); -	} -#ifdef CONFIG_FTRACE -	ftrace_enabled = ftrace_save; -#endif -	local_irq_restore(flags); -  	/* Now copy what we have to the user */  	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);  	if (iter->seq.readpos >= iter->seq.len)  		trace_seq_reset(&iter->seq); + +	/* +	 * If there was nothing to send to user, inspite of consuming trace +	 * entries, go back to wait for more entries. 
+	 */  	if (sret == -EBUSY) -		sret = 0; +		goto waitagain;  out:  	mutex_unlock(&trace_types_lock); @@ -2684,7 +2667,8 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  {  	unsigned long val;  	char buf[64]; -	int i, ret; +	int ret; +	struct trace_array *tr = filp->private_data;  	if (cnt >= sizeof(buf))  		return -EINVAL; @@ -2704,59 +2688,38 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  	mutex_lock(&trace_types_lock); -	if (current_trace != &no_tracer) { +	if (tr->ctrl) {  		cnt = -EBUSY; -		pr_info("ftrace: set current_tracer to none" +		pr_info("ftrace: please disable tracing"  			" before modifying buffer size\n");  		goto out;  	} -	if (val > global_trace.entries) { -		long pages_requested; -		unsigned long freeable_pages; - -		/* make sure we have enough memory before mapping */ -		pages_requested = -			(val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; - -		/* account for each buffer (and max_tr) */ -		pages_requested *= tracing_nr_buffers * 2; - -		/* Check for overflow */ -		if (pages_requested < 0) { -			cnt = -ENOMEM; -			goto out; -		} - -		freeable_pages = determine_dirtyable_memory(); - -		/* we only allow to request 1/4 of useable memory */ -		if (pages_requested > -		    ((freeable_pages + tracing_pages_allocated) / 4)) { -			cnt = -ENOMEM; +	if (val != global_trace.entries) { +		ret = ring_buffer_resize(global_trace.buffer, val); +		if (ret < 0) { +			cnt = ret;  			goto out;  		} -		while (global_trace.entries < val) { -			if (trace_alloc_page()) { -				cnt = -ENOMEM; -				goto out; +		ret = ring_buffer_resize(max_tr.buffer, val); +		if (ret < 0) { +			int r; +			cnt = ret; +			r = ring_buffer_resize(global_trace.buffer, +					       global_trace.entries); +			if (r < 0) { +				/* AARGH! We are left with different +				 * size max buffer!!!! */ +				WARN_ON(1); +				tracing_disabled = 1;  			} -			/* double check that we don't go over the known pages */ -			if (tracing_pages_allocated > pages_requested) -				break; +			goto out;  		} -	} else { -		/* include the number of entries in val (inc of page entries) */ -		while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) -			trace_free_page(); +		global_trace.entries = val;  	} -	/* check integrity */ -	for_each_tracing_cpu(i) -		check_pages(global_trace.data[i]); -  	filp->f_pos += cnt;  	/* If check pages failed, return ENOMEM */ @@ -2769,6 +2732,52 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  	return cnt;  } +static int mark_printk(const char *fmt, ...) +{ +	int ret; +	va_list args; +	va_start(args, fmt); +	ret = trace_vprintk(0, fmt, args); +	va_end(args); +	return ret; +} + +static ssize_t +tracing_mark_write(struct file *filp, const char __user *ubuf, +					size_t cnt, loff_t *fpos) +{ +	char *buf; +	char *end; +	struct trace_array *tr = &global_trace; + +	if (!tr->ctrl || tracing_disabled) +		return -EINVAL; + +	if (cnt > TRACE_BUF_SIZE) +		cnt = TRACE_BUF_SIZE; + +	buf = kmalloc(cnt + 1, GFP_KERNEL); +	if (buf == NULL) +		return -ENOMEM; + +	if (copy_from_user(buf, ubuf, cnt)) { +		kfree(buf); +		return -EFAULT; +	} + +	/* Cut from the first nil or newline. 
*/ +	buf[cnt] = '\0'; +	end = strchr(buf, '\n'); +	if (end) +		*end = '\0'; + +	cnt = mark_printk("%s\n", buf); +	kfree(buf); +	*fpos += cnt; + +	return cnt; +} +  static struct file_operations tracing_max_lat_fops = {  	.open		= tracing_open_generic,  	.read		= tracing_max_lat_read, @@ -2800,6 +2809,11 @@ static struct file_operations tracing_entries_fops = {  	.write		= tracing_entries_write,  }; +static struct file_operations tracing_mark_fops = { +	.open		= tracing_open_generic, +	.write		= tracing_mark_write, +}; +  #ifdef CONFIG_DYNAMIC_FTRACE  static ssize_t @@ -2846,7 +2860,7 @@ struct dentry *tracing_init_dentry(void)  #include "trace_selftest.c"  #endif -static __init void tracer_init_debugfs(void) +static __init int tracer_init_debugfs(void)  {  	struct dentry *d_tracer;  	struct dentry *entry; @@ -2881,12 +2895,12 @@ static __init void tracer_init_debugfs(void)  	entry = debugfs_create_file("available_tracers", 0444, d_tracer,  				    &global_trace, &show_traces_fops);  	if (!entry) -		pr_warning("Could not create debugfs 'trace' entry\n"); +		pr_warning("Could not create debugfs 'available_tracers' entry\n");  	entry = debugfs_create_file("current_tracer", 0444, d_tracer,  				    &global_trace, &set_tracer_fops);  	if (!entry) -		pr_warning("Could not create debugfs 'trace' entry\n"); +		pr_warning("Could not create debugfs 'current_tracer' entry\n");  	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,  				    &tracing_max_latency, @@ -2899,7 +2913,7 @@ static __init void tracer_init_debugfs(void)  				    &tracing_thresh, &tracing_max_lat_fops);  	if (!entry)  		pr_warning("Could not create debugfs " -			   "'tracing_threash' entry\n"); +			   "'tracing_thresh' entry\n");  	entry = debugfs_create_file("README", 0644, d_tracer,  				    NULL, &tracing_readme_fops);  	if (!entry) @@ -2909,13 +2923,19 @@ static __init void tracer_init_debugfs(void)  				    NULL, &tracing_pipe_fops);  	if (!entry)  		pr_warning("Could not create debugfs " -			   "'tracing_threash' entry\n"); +			   "'trace_pipe' entry\n");  	entry = debugfs_create_file("trace_entries", 0644, d_tracer,  				    &global_trace, &tracing_entries_fops);  	if (!entry)  		pr_warning("Could not create debugfs " -			   "'tracing_threash' entry\n"); +			   "'trace_entries' entry\n"); + +	entry = debugfs_create_file("trace_marker", 0220, d_tracer, +				    NULL, &tracing_mark_fops); +	if (!entry) +		pr_warning("Could not create debugfs " +			   "'trace_marker' entry\n");  #ifdef CONFIG_DYNAMIC_FTRACE  	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, @@ -2928,230 +2948,263 @@ static __init void tracer_init_debugfs(void)  #ifdef CONFIG_SYSPROF_TRACER  	init_tracer_sysprof_debugfs(d_tracer);  #endif +	return 0;  } -static int trace_alloc_page(void) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args)  { +	static DEFINE_SPINLOCK(trace_buf_lock); +	static char trace_buf[TRACE_BUF_SIZE]; + +	struct ring_buffer_event *event; +	struct trace_array *tr = &global_trace;  	struct trace_array_cpu *data; -	struct page *page, *tmp; -	LIST_HEAD(pages); -	void *array; -	unsigned pages_allocated = 0; -	int i; +	struct print_entry *entry; +	unsigned long flags, irq_flags; +	int cpu, len = 0, size, pc; -	/* first allocate a page for each CPU */ -	for_each_tracing_cpu(i) { -		array = (void *)__get_free_page(GFP_KERNEL); -		if (array == NULL) { -			printk(KERN_ERR "tracer: failed to allocate page" -			       "for trace buffer!\n"); -			goto free_pages; -		} +	if (!tr->ctrl || 
tracing_disabled) +		return 0; -		pages_allocated++; -		page = virt_to_page(array); -		list_add(&page->lru, &pages); +	pc = preempt_count(); +	preempt_disable_notrace(); +	cpu = raw_smp_processor_id(); +	data = tr->data[cpu]; -/* Only allocate if we are actually using the max trace */ -#ifdef CONFIG_TRACER_MAX_TRACE -		array = (void *)__get_free_page(GFP_KERNEL); -		if (array == NULL) { -			printk(KERN_ERR "tracer: failed to allocate page" -			       "for trace buffer!\n"); -			goto free_pages; -		} -		pages_allocated++; -		page = virt_to_page(array); -		list_add(&page->lru, &pages); -#endif -	} +	if (unlikely(atomic_read(&data->disabled))) +		goto out; -	/* Now that we successfully allocate a page per CPU, add them */ -	for_each_tracing_cpu(i) { -		data = global_trace.data[i]; -		page = list_entry(pages.next, struct page, lru); -		list_del_init(&page->lru); -		list_add_tail(&page->lru, &data->trace_pages); -		ClearPageLRU(page); +	spin_lock_irqsave(&trace_buf_lock, flags); +	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); -#ifdef CONFIG_TRACER_MAX_TRACE -		data = max_tr.data[i]; -		page = list_entry(pages.next, struct page, lru); -		list_del_init(&page->lru); -		list_add_tail(&page->lru, &data->trace_pages); -		SetPageLRU(page); -#endif -	} -	tracing_pages_allocated += pages_allocated; -	global_trace.entries += ENTRIES_PER_PAGE; +	len = min(len, TRACE_BUF_SIZE-1); +	trace_buf[len] = 0; -	return 0; +	size = sizeof(*entry) + len + 1; +	event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags); +	if (!event) +		goto out_unlock; +	entry = ring_buffer_event_data(event); +	tracing_generic_entry_update(&entry->ent, flags, pc); +	entry->ent.type			= TRACE_PRINT; +	entry->ip			= ip; - free_pages: -	list_for_each_entry_safe(page, tmp, &pages, lru) { -		list_del_init(&page->lru); -		__free_page(page); -	} -	return -ENOMEM; +	memcpy(&entry->buf, trace_buf, len); +	entry->buf[len] = 0; +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + out_unlock: +	spin_unlock_irqrestore(&trace_buf_lock, flags); + + out: +	preempt_enable_notrace(); + +	return len;  } +EXPORT_SYMBOL_GPL(trace_vprintk); -static int trace_free_page(void) +int __ftrace_printk(unsigned long ip, const char *fmt, ...)  
{ -	struct trace_array_cpu *data; -	struct page *page; -	struct list_head *p; -	int i; -	int ret = 0; +	int ret; +	va_list ap; -	/* free one page from each buffer */ -	for_each_tracing_cpu(i) { -		data = global_trace.data[i]; -		p = data->trace_pages.next; -		if (p == &data->trace_pages) { -			/* should never happen */ -			WARN_ON(1); -			tracing_disabled = 1; -			ret = -1; -			break; -		} -		page = list_entry(p, struct page, lru); -		ClearPageLRU(page); -		list_del(&page->lru); -		tracing_pages_allocated--; -		tracing_pages_allocated--; -		__free_page(page); +	if (!(trace_flags & TRACE_ITER_PRINTK)) +		return 0; -		tracing_reset(data); +	va_start(ap, fmt); +	ret = trace_vprintk(ip, fmt, ap); +	va_end(ap); +	return ret; +} +EXPORT_SYMBOL_GPL(__ftrace_printk); -#ifdef CONFIG_TRACER_MAX_TRACE -		data = max_tr.data[i]; -		p = data->trace_pages.next; -		if (p == &data->trace_pages) { -			/* should never happen */ -			WARN_ON(1); -			tracing_disabled = 1; -			ret = -1; -			break; -		} -		page = list_entry(p, struct page, lru); -		ClearPageLRU(page); -		list_del(&page->lru); -		__free_page(page); +static int trace_panic_handler(struct notifier_block *this, +			       unsigned long event, void *unused) +{ +	ftrace_dump(); +	return NOTIFY_OK; +} -		tracing_reset(data); -#endif -	} -	global_trace.entries -= ENTRIES_PER_PAGE; +static struct notifier_block trace_panic_notifier = { +	.notifier_call  = trace_panic_handler, +	.next           = NULL, +	.priority       = 150   /* priority: INT_MAX >= x >= 0 */ +}; -	return ret; +static int trace_die_handler(struct notifier_block *self, +			     unsigned long val, +			     void *data) +{ +	switch (val) { +	case DIE_OOPS: +		ftrace_dump(); +		break; +	default: +		break; +	} +	return NOTIFY_OK;  } -__init static int tracer_alloc_buffers(void) +static struct notifier_block trace_die_notifier = { +	.notifier_call = trace_die_handler, +	.priority = 200 +}; + +/* + * printk is set to max of 1024, we really don't need it that big. + * Nothing should be printing 1000 characters anyway. + */ +#define TRACE_MAX_PRINT		1000 + +/* + * Define here KERN_TRACE so that we have one place to modify + * it if we decide to change what log level the ftrace dump + * should be at. + */ +#define KERN_TRACE		KERN_INFO + +static void +trace_printk_seq(struct trace_seq *s)  { -	struct trace_array_cpu *data; -	void *array; -	struct page *page; -	int pages = 0; -	int ret = -ENOMEM; -	int i; +	/* Probably should print a warning here. */ +	if (s->len >= 1000) +		s->len = 1000; -	/* TODO: make the number of buffers hot pluggable with CPUS */ -	tracing_nr_buffers = num_possible_cpus(); -	tracing_buffer_mask = cpu_possible_map; +	/* should be zero ended, but we are paranoid. 
*/ +	s->buffer[s->len] = 0; -	/* Allocate the first page for all buffers */ -	for_each_tracing_cpu(i) { -		data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); -		max_tr.data[i] = &per_cpu(max_data, i); +	printk(KERN_TRACE "%s", s->buffer); -		array = (void *)__get_free_page(GFP_KERNEL); -		if (array == NULL) { -			printk(KERN_ERR "tracer: failed to allocate page" -			       "for trace buffer!\n"); -			goto free_buffers; -		} +	trace_seq_reset(s); +} + + +void ftrace_dump(void) +{ +	static DEFINE_SPINLOCK(ftrace_dump_lock); +	/* use static because iter can be a bit big for the stack */ +	static struct trace_iterator iter; +	static cpumask_t mask; +	static int dump_ran; +	unsigned long flags; +	int cnt = 0, cpu; -		/* set the array to the list */ -		INIT_LIST_HEAD(&data->trace_pages); -		page = virt_to_page(array); -		list_add(&page->lru, &data->trace_pages); -		/* use the LRU flag to differentiate the two buffers */ -		ClearPageLRU(page); +	/* only one dump */ +	spin_lock_irqsave(&ftrace_dump_lock, flags); +	if (dump_ran) +		goto out; -		data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; -		max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +	dump_ran = 1; -/* Only allocate if we are actually using the max trace */ -#ifdef CONFIG_TRACER_MAX_TRACE -		array = (void *)__get_free_page(GFP_KERNEL); -		if (array == NULL) { -			printk(KERN_ERR "tracer: failed to allocate page" -			       "for trace buffer!\n"); -			goto free_buffers; -		} +	/* No turning back! */ +	ftrace_kill_atomic(); -		INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); -		page = virt_to_page(array); -		list_add(&page->lru, &max_tr.data[i]->trace_pages); -		SetPageLRU(page); -#endif +	for_each_tracing_cpu(cpu) { +		atomic_inc(&global_trace.data[cpu]->disabled);  	} +	printk(KERN_TRACE "Dumping ftrace buffer:\n"); + +	iter.tr = &global_trace; +	iter.trace = current_trace; +  	/* -	 * Since we allocate by orders of pages, we may be able to -	 * round up a bit. +	 * We need to stop all tracing on all CPUS to read the +	 * the next buffer. This is a bit expensive, but is +	 * not done often. We fill all what we can read, +	 * and then release the locks again.  	 
*/ -	global_trace.entries = ENTRIES_PER_PAGE; -	pages++; -	while (global_trace.entries < trace_nr_entries) { -		if (trace_alloc_page()) -			break; -		pages++; +	cpus_clear(mask); + +	while (!trace_empty(&iter)) { + +		if (!cnt) +			printk(KERN_TRACE "---------------------------------\n"); + +		cnt++; + +		/* reset all but tr, trace, and overruns */ +		memset(&iter.seq, 0, +		       sizeof(struct trace_iterator) - +		       offsetof(struct trace_iterator, seq)); +		iter.iter_flags |= TRACE_FILE_LAT_FMT; +		iter.pos = -1; + +		if (find_next_entry_inc(&iter) != NULL) { +			print_trace_line(&iter); +			trace_consume(&iter); +		} + +		trace_printk_seq(&iter.seq);  	} -	max_tr.entries = global_trace.entries; -	pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n", -		pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE); -	pr_info("   actual entries %ld\n", global_trace.entries); +	if (!cnt) +		printk(KERN_TRACE "   (ftrace buffer empty)\n"); +	else +		printk(KERN_TRACE "---------------------------------\n"); + + out: +	spin_unlock_irqrestore(&ftrace_dump_lock, flags); +} + +__init static int tracer_alloc_buffers(void) +{ +	struct trace_array_cpu *data; +	int i; + +	/* TODO: make the number of buffers hot pluggable with CPUS */ +	tracing_buffer_mask = cpu_possible_map; + +	global_trace.buffer = ring_buffer_alloc(trace_buf_size, +						   TRACE_BUFFER_FLAGS); +	if (!global_trace.buffer) { +		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); +		WARN_ON(1); +		return 0; +	} +	global_trace.entries = ring_buffer_size(global_trace.buffer); -	tracer_init_debugfs(); +#ifdef CONFIG_TRACER_MAX_TRACE +	max_tr.buffer = ring_buffer_alloc(trace_buf_size, +					     TRACE_BUFFER_FLAGS); +	if (!max_tr.buffer) { +		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); +		WARN_ON(1); +		ring_buffer_free(global_trace.buffer); +		return 0; +	} +	max_tr.entries = ring_buffer_size(max_tr.buffer); +	WARN_ON(max_tr.entries != global_trace.entries); +#endif + +	/* Allocate the first page for all buffers */ +	for_each_tracing_cpu(i) { +		data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); +		max_tr.data[i] = &per_cpu(max_data, i); +	}  	trace_init_cmdlines(); -	register_tracer(&no_tracer); -	current_trace = &no_tracer; +	register_tracer(&nop_trace); +#ifdef CONFIG_BOOT_TRACER +	register_tracer(&boot_tracer); +	current_trace = &boot_tracer; +	current_trace->init(&global_trace); +#else +	current_trace = &nop_trace; +#endif  	/* All seems OK, enable tracing */  	global_trace.ctrl = tracer_enabled;  	tracing_disabled = 0; -	return 0; +	atomic_notifier_chain_register(&panic_notifier_list, +				       &trace_panic_notifier); - free_buffers: -	for (i-- ; i >= 0; i--) { -		struct page *page, *tmp; -		struct trace_array_cpu *data = global_trace.data[i]; +	register_die_notifier(&trace_die_notifier); -		if (data) { -			list_for_each_entry_safe(page, tmp, -						 &data->trace_pages, lru) { -				list_del_init(&page->lru); -				__free_page(page); -			} -		} - -#ifdef CONFIG_TRACER_MAX_TRACE -		data = max_tr.data[i]; -		if (data) { -			list_for_each_entry_safe(page, tmp, -						 &data->trace_pages, lru) { -				list_del_init(&page->lru); -				__free_page(page); -			} -		} -#endif -	} -	return ret; +	return 0;  } -fs_initcall(tracer_alloc_buffers); +early_initcall(tracer_alloc_buffers); +fs_initcall(tracer_init_debugfs); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f69f86788c2..f1f99572cde 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -5,7 +5,9 @@  #include 
<asm/atomic.h>  #include <linux/sched.h>  #include <linux/clocksource.h> +#include <linux/ring_buffer.h>  #include <linux/mmiotrace.h> +#include <linux/ftrace.h>  enum trace_type {  	__TRACE_FIRST_TYPE = 0, @@ -13,38 +15,60 @@ enum trace_type {  	TRACE_FN,  	TRACE_CTX,  	TRACE_WAKE, +	TRACE_CONT,  	TRACE_STACK, +	TRACE_PRINT,  	TRACE_SPECIAL,  	TRACE_MMIO_RW,  	TRACE_MMIO_MAP, +	TRACE_BOOT,  	__TRACE_LAST_TYPE  };  /* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { +	unsigned char		type; +	unsigned char		cpu; +	unsigned char		flags; +	unsigned char		preempt_count; +	int			pid; +}; + +/*   * Function trace entry - function address and parent function addres:   */  struct ftrace_entry { +	struct trace_entry	ent;  	unsigned long		ip;  	unsigned long		parent_ip;  }; +extern struct tracer boot_tracer;  /*   * Context switch trace entry - which task (and prio) we switched from/to:   */  struct ctx_switch_entry { +	struct trace_entry	ent;  	unsigned int		prev_pid;  	unsigned char		prev_prio;  	unsigned char		prev_state;  	unsigned int		next_pid;  	unsigned char		next_prio;  	unsigned char		next_state; +	unsigned int		next_cpu;  };  /*   * Special (free-form) trace entry:   */  struct special_entry { +	struct trace_entry	ent;  	unsigned long		arg1;  	unsigned long		arg2;  	unsigned long		arg3; @@ -57,33 +81,60 @@ struct special_entry {  #define FTRACE_STACK_ENTRIES	8  struct stack_entry { +	struct trace_entry	ent;  	unsigned long		caller[FTRACE_STACK_ENTRIES];  };  /* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter + * ftrace_printk entry:   */ -struct trace_entry { -	char			type; -	char			cpu; -	char			flags; -	char			preempt_count; -	int			pid; -	cycle_t			t; -	union { -		struct ftrace_entry		fn; -		struct ctx_switch_entry		ctx; -		struct special_entry		special; -		struct stack_entry		stack; -		struct mmiotrace_rw		mmiorw; -		struct mmiotrace_map		mmiomap; -	}; +struct print_entry { +	struct trace_entry	ent; +	unsigned long		ip; +	char			buf[]; +}; + +#define TRACE_OLD_SIZE		88 + +struct trace_field_cont { +	unsigned char		type; +	/* Temporary till we get rid of this completely */ +	char			buf[TRACE_OLD_SIZE - 1]; +}; + +struct trace_mmiotrace_rw { +	struct trace_entry	ent; +	struct mmiotrace_rw	rw; +}; + +struct trace_mmiotrace_map { +	struct trace_entry	ent; +	struct mmiotrace_map	map; +}; + +struct trace_boot { +	struct trace_entry	ent; +	struct boot_trace	initcall; +}; + +/* + * trace_flag_type is an enumeration that holds different + * states when a trace occurs. These are: + *  IRQS_OFF	- interrupts were disabled + *  NEED_RESCED - reschedule is requested + *  HARDIRQ	- inside an interrupt handler + *  SOFTIRQ	- inside a softirq handler + *  CONT	- multiple entries hold the trace item + */ +enum trace_flag_type { +	TRACE_FLAG_IRQS_OFF		= 0x01, +	TRACE_FLAG_NEED_RESCHED		= 0x02, +	TRACE_FLAG_HARDIRQ		= 0x04, +	TRACE_FLAG_SOFTIRQ		= 0x08, +	TRACE_FLAG_CONT			= 0x10,  }; -#define TRACE_ENTRY_SIZE	sizeof(struct trace_entry) +#define TRACE_BUF_SIZE		1024  /*   * The CPU trace array - it consists of thousands of trace entries @@ -91,16 +142,9 @@ struct trace_entry {   * the trace, etc.)   
*/  struct trace_array_cpu { -	struct list_head	trace_pages;  	atomic_t		disabled; -	raw_spinlock_t		lock; -	struct lock_class_key	lock_key;  	/* these fields get copied into max-trace: */ -	unsigned		trace_head_idx; -	unsigned		trace_tail_idx; -	void			*trace_head; /* producer */ -	void			*trace_tail; /* consumer */  	unsigned long		trace_idx;  	unsigned long		overrun;  	unsigned long		saved_latency; @@ -124,6 +168,7 @@ struct trace_iterator;   * They have on/off state as well:   */  struct trace_array { +	struct ring_buffer	*buffer;  	unsigned long		entries;  	long			ctrl;  	int			cpu; @@ -132,6 +177,56 @@ struct trace_array {  	struct trace_array_cpu	*data[NR_CPUS];  }; +#define FTRACE_CMP_TYPE(var, type) \ +	__builtin_types_compatible_p(typeof(var), type *) + +#undef IF_ASSIGN +#define IF_ASSIGN(var, entry, etype, id)		\ +	if (FTRACE_CMP_TYPE(var, etype)) {		\ +		var = (typeof(var))(entry);		\ +		WARN_ON(id && (entry)->type != id);	\ +		break;					\ +	} + +/* Will cause compile errors if type is not found. */ +extern void __ftrace_bad_type(void); + +/* + * The trace_assign_type is a verifier that the entry type is + * the same as the type being assigned. To add new types simply + * add a line with the following format: + * + * IF_ASSIGN(var, ent, type, id); + * + *  Where "type" is the trace type that includes the trace_entry + *  as the "ent" item. And "id" is the trace identifier that is + *  used in the trace_type enum. + * + *  If the type can have more than one id, then use zero. + */ +#define trace_assign_type(var, ent)					\ +	do {								\ +		IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN);	\ +		IF_ASSIGN(var, ent, struct ctx_switch_entry, 0);	\ +		IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ +		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\ +		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\ +		IF_ASSIGN(var, ent, struct special_entry, 0);		\ +		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\ +			  TRACE_MMIO_RW);				\ +		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\ +			  TRACE_MMIO_MAP);				\ +		IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT);	\ +		__ftrace_bad_type();					\ +	} while (0) + +/* Return values for print_line callback */ +enum print_line_t { +	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */ +	TRACE_TYPE_HANDLED	= 1, +	TRACE_TYPE_UNHANDLED	= 2	/* Relay to other output functions */ +}; +  /*   * A specific tracer, represented by methods that operate on a trace array:   */ @@ -152,7 +247,7 @@ struct tracer {  	int			(*selftest)(struct tracer *trace,  					    struct trace_array *tr);  #endif -	int			(*print_line)(struct trace_iterator *iter); +	enum print_line_t	(*print_line)(struct trace_iterator *iter);  	struct tracer		*next;  	int			print_max;  }; @@ -171,57 +266,58 @@ struct trace_iterator {  	struct trace_array	*tr;  	struct tracer		*trace;  	void			*private; -	long			last_overrun[NR_CPUS]; -	long			overrun[NR_CPUS]; +	struct ring_buffer_iter	*buffer_iter[NR_CPUS];  	/* The below is zeroed out in pipe_read */  	struct trace_seq	seq;  	struct trace_entry	*ent;  	int			cpu; - -	struct trace_entry	*prev_ent; -	int			prev_cpu; +	u64			ts;  	unsigned long		iter_flags;  	loff_t			pos; -	unsigned long		next_idx[NR_CPUS]; -	struct list_head	*next_page[NR_CPUS]; -	unsigned		next_page_idx[NR_CPUS];  	long			idx;  }; -void tracing_reset(struct trace_array_cpu *data); +void trace_wake_up(void); +void tracing_reset(struct trace_array *tr, int cpu);  int tracing_open_generic(struct inode *inode, struct file 
*filp);  struct dentry *tracing_init_dentry(void);  void init_tracer_sysprof_debugfs(struct dentry *d_tracer); +struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, +						struct trace_array_cpu *data); +void tracing_generic_entry_update(struct trace_entry *entry, +				  unsigned long flags, +				  int pc); +  void ftrace(struct trace_array *tr,  			    struct trace_array_cpu *data,  			    unsigned long ip,  			    unsigned long parent_ip, -			    unsigned long flags); +			    unsigned long flags, int pc);  void tracing_sched_switch_trace(struct trace_array *tr,  				struct trace_array_cpu *data,  				struct task_struct *prev,  				struct task_struct *next, -				unsigned long flags); +				unsigned long flags, int pc);  void tracing_record_cmdline(struct task_struct *tsk);  void tracing_sched_wakeup_trace(struct trace_array *tr,  				struct trace_array_cpu *data,  				struct task_struct *wakee,  				struct task_struct *cur, -				unsigned long flags); +				unsigned long flags, int pc);  void trace_special(struct trace_array *tr,  		   struct trace_array_cpu *data,  		   unsigned long arg1,  		   unsigned long arg2, -		   unsigned long arg3); +		   unsigned long arg3, int pc);  void trace_function(struct trace_array *tr,  		    struct trace_array_cpu *data,  		    unsigned long ip,  		    unsigned long parent_ip, -		    unsigned long flags); +		    unsigned long flags, int pc);  void tracing_start_cmdline_record(void);  void tracing_stop_cmdline_record(void); @@ -268,51 +364,33 @@ extern unsigned long ftrace_update_tot_cnt;  extern int DYN_FTRACE_TEST_NAME(void);  #endif -#ifdef CONFIG_MMIOTRACE -extern void __trace_mmiotrace_rw(struct trace_array *tr, -				struct trace_array_cpu *data, -				struct mmiotrace_rw *rw); -extern void __trace_mmiotrace_map(struct trace_array *tr, -				struct trace_array_cpu *data, -				struct mmiotrace_map *map); -#endif -  #ifdef CONFIG_FTRACE_STARTUP_TEST -#ifdef CONFIG_FTRACE  extern int trace_selftest_startup_function(struct tracer *trace,  					   struct trace_array *tr); -#endif -#ifdef CONFIG_IRQSOFF_TRACER  extern int trace_selftest_startup_irqsoff(struct tracer *trace,  					  struct trace_array *tr); -#endif -#ifdef CONFIG_PREEMPT_TRACER  extern int trace_selftest_startup_preemptoff(struct tracer *trace,  					     struct trace_array *tr); -#endif -#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)  extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,  						 struct trace_array *tr); -#endif -#ifdef CONFIG_SCHED_TRACER  extern int trace_selftest_startup_wakeup(struct tracer *trace,  					 struct trace_array *tr); -#endif -#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern int trace_selftest_startup_nop(struct tracer *trace, +					 struct trace_array *tr);  extern int trace_selftest_startup_sched_switch(struct tracer *trace,  					       struct trace_array *tr); -#endif -#ifdef CONFIG_SYSPROF_TRACER  extern int trace_selftest_startup_sysprof(struct tracer *trace,  					       struct trace_array *tr); -#endif  #endif /* CONFIG_FTRACE_STARTUP_TEST */  extern void *head_page(struct trace_array_cpu *data);  extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern void trace_seq_print_cont(struct trace_seq *s, +				 struct trace_iterator *iter);  extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,  				 size_t cnt);  extern long ns2usecs(cycle_t nsec); +extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);  extern unsigned long trace_flags; @@ -334,6 
+412,9 @@ enum trace_iterator_flags {  	TRACE_ITER_BLOCK		= 0x80,  	TRACE_ITER_STACKTRACE		= 0x100,  	TRACE_ITER_SCHED_TREE		= 0x200, +	TRACE_ITER_PRINTK		= 0x400,  }; +extern struct tracer nop_trace; +  #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c new file mode 100644 index 00000000000..d0a5e50eeff --- /dev/null +++ b/kernel/trace/trace_boot.c @@ -0,0 +1,126 @@ +/* + * ring buffer based initcalls tracer + * + * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * + */ + +#include <linux/init.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/kallsyms.h> + +#include "trace.h" + +static struct trace_array *boot_trace; +static int trace_boot_enabled; + + +/* Should be started after do_pre_smp_initcalls() in init/main.c */ +void start_boot_trace(void) +{ +	trace_boot_enabled = 1; +} + +void stop_boot_trace(void) +{ +	trace_boot_enabled = 0; +} + +void reset_boot_trace(struct trace_array *tr) +{ +	stop_boot_trace(); +} + +static void boot_trace_init(struct trace_array *tr) +{ +	int cpu; +	boot_trace = tr; + +	trace_boot_enabled = 0; + +	for_each_cpu_mask(cpu, cpu_possible_map) +		tracing_reset(tr, cpu); +} + +static void boot_trace_ctrl_update(struct trace_array *tr) +{ +	if (tr->ctrl) +		start_boot_trace(); +	else +		stop_boot_trace(); +} + +static enum print_line_t initcall_print_line(struct trace_iterator *iter) +{ +	int ret; +	struct trace_entry *entry = iter->ent; +	struct trace_boot *field = (struct trace_boot *)entry; +	struct boot_trace *it = &field->initcall; +	struct trace_seq *s = &iter->seq; +	struct timespec calltime = ktime_to_timespec(it->calltime); +	struct timespec rettime = ktime_to_timespec(it->rettime); + +	if (entry->type == TRACE_BOOT) { +		ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n", +					  calltime.tv_sec, +					  calltime.tv_nsec, +					  it->func, it->caller); +		if (!ret) +			return TRACE_TYPE_PARTIAL_LINE; + +		ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " +					  "returned %d after %lld msecs\n", +					  rettime.tv_sec, +					  rettime.tv_nsec, +					  it->func, it->result, it->duration); + +		if (!ret) +			return TRACE_TYPE_PARTIAL_LINE; +		return TRACE_TYPE_HANDLED; +	} +	return TRACE_TYPE_UNHANDLED; +} + +struct tracer boot_tracer __read_mostly = +{ +	.name		= "initcall", +	.init		= boot_trace_init, +	.reset		= reset_boot_trace, +	.ctrl_update	= boot_trace_ctrl_update, +	.print_line	= initcall_print_line, +}; + +void trace_boot(struct boot_trace *it, initcall_t fn) +{ +	struct ring_buffer_event *event; +	struct trace_boot *entry; +	struct trace_array_cpu *data; +	unsigned long irq_flags; +	struct trace_array *tr = boot_trace; + +	if (!trace_boot_enabled) +		return; + +	/* Get its name now since this function could +	 * disappear because it is in the .init section. 
+	 */ +	sprint_symbol(it->func, (unsigned long)fn); +	preempt_disable(); +	data = tr->data[smp_processor_id()]; + +	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +					 &irq_flags); +	if (!event) +		goto out; +	entry	= ring_buffer_event_data(event); +	tracing_generic_entry_update(&entry->ent, 0, 0); +	entry->ent.type = TRACE_BOOT; +	entry->initcall = *it; +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + +	trace_wake_up(); + + out: +	preempt_enable(); +} diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 31214489797..e90eb0c2c56 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -23,7 +23,7 @@ static void function_reset(struct trace_array *tr)  	tr->time_start = ftrace_now(tr->cpu);  	for_each_online_cpu(cpu) -		tracing_reset(tr->data[cpu]); +		tracing_reset(tr, cpu);  }  static void start_function_trace(struct trace_array *tr) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index ece6cfb649f..a7db7f040ae 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) -		trace_function(tr, data, ip, parent_ip, flags); +		trace_function(tr, data, ip, parent_ip, flags, preempt_count());  	atomic_dec(&data->disabled);  } @@ -130,6 +130,7 @@ check_critical_timing(struct trace_array *tr,  	unsigned long latency, t0, t1;  	cycle_t T0, T1, delta;  	unsigned long flags; +	int pc;  	/*  	 * usecs conversion is slow so we try to delay the conversion @@ -141,6 +142,8 @@ check_critical_timing(struct trace_array *tr,  	local_save_flags(flags); +	pc = preempt_count(); +  	if (!report_latency(delta))  		goto out; @@ -150,7 +153,7 @@ check_critical_timing(struct trace_array *tr,  	if (!report_latency(delta))  		goto out_unlock; -	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); +	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);  	latency = nsecs_to_usecs(delta); @@ -173,8 +176,8 @@ out_unlock:  out:  	data->critical_sequence = max_sequence;  	data->preempt_timestamp = ftrace_now(cpu); -	tracing_reset(data); -	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); +	tracing_reset(tr, cpu); +	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);  }  static inline void @@ -203,11 +206,11 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)  	data->critical_sequence = max_sequence;  	data->preempt_timestamp = ftrace_now(cpu);  	data->critical_start = parent_ip ? : ip; -	tracing_reset(data); +	tracing_reset(tr, cpu);  	local_save_flags(flags); -	trace_function(tr, data, ip, parent_ip, flags); +	trace_function(tr, data, ip, parent_ip, flags, preempt_count());  	per_cpu(tracing_cpu, cpu) = 1; @@ -234,14 +237,14 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)  	data = tr->data[cpu]; -	if (unlikely(!data) || unlikely(!head_page(data)) || +	if (unlikely(!data) ||  	    !data->critical_start || atomic_read(&data->disabled))  		return;  	atomic_inc(&data->disabled);  	local_save_flags(flags); -	trace_function(tr, data, ip, parent_ip, flags); +	trace_function(tr, data, ip, parent_ip, flags, preempt_count());  	check_critical_timing(tr, data, parent_ip ? 
: ip, cpu);  	data->critical_start = 0;  	atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index b13dc19dcbb..f28484618ff 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -27,7 +27,7 @@ static void mmio_reset_data(struct trace_array *tr)  	tr->time_start = ftrace_now(tr->cpu);  	for_each_online_cpu(cpu) -		tracing_reset(tr->data[cpu]); +		tracing_reset(tr, cpu);  }  static void mmio_trace_init(struct trace_array *tr) @@ -130,10 +130,14 @@ static unsigned long count_overruns(struct trace_iterator *iter)  {  	int cpu;  	unsigned long cnt = 0; +/* FIXME: */ +#if 0  	for_each_online_cpu(cpu) {  		cnt += iter->overrun[cpu];  		iter->overrun[cpu] = 0;  	} +#endif +	(void)cpu;  	return cnt;  } @@ -171,17 +175,21 @@ print_out:  	return (ret == -EBUSY) ? 0 : ret;  } -static int mmio_print_rw(struct trace_iterator *iter) +static enum print_line_t mmio_print_rw(struct trace_iterator *iter)  {  	struct trace_entry *entry = iter->ent; -	struct mmiotrace_rw *rw	= &entry->mmiorw; +	struct trace_mmiotrace_rw *field; +	struct mmiotrace_rw *rw;  	struct trace_seq *s	= &iter->seq; -	unsigned long long t	= ns2usecs(entry->t); +	unsigned long long t	= ns2usecs(iter->ts);  	unsigned long usec_rem	= do_div(t, 1000000ULL);  	unsigned secs		= (unsigned long)t;  	int ret = 1; -	switch (entry->mmiorw.opcode) { +	trace_assign_type(field, entry); +	rw = &field->rw; + +	switch (rw->opcode) {  	case MMIO_READ:  		ret = trace_seq_printf(s,  			"R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", @@ -209,21 +217,25 @@ static int mmio_print_rw(struct trace_iterator *iter)  		break;  	}  	if (ret) -		return 1; -	return 0; +		return TRACE_TYPE_HANDLED; +	return TRACE_TYPE_PARTIAL_LINE;  } -static int mmio_print_map(struct trace_iterator *iter) +static enum print_line_t mmio_print_map(struct trace_iterator *iter)  {  	struct trace_entry *entry = iter->ent; -	struct mmiotrace_map *m	= &entry->mmiomap; +	struct trace_mmiotrace_map *field; +	struct mmiotrace_map *m;  	struct trace_seq *s	= &iter->seq; -	unsigned long long t	= ns2usecs(entry->t); +	unsigned long long t	= ns2usecs(iter->ts);  	unsigned long usec_rem	= do_div(t, 1000000ULL);  	unsigned secs		= (unsigned long)t; -	int ret = 1; +	int ret; -	switch (entry->mmiorw.opcode) { +	trace_assign_type(field, entry); +	m = &field->map; + +	switch (m->opcode) {  	case MMIO_PROBE:  		ret = trace_seq_printf(s,  			"MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", @@ -241,20 +253,43 @@ static int mmio_print_map(struct trace_iterator *iter)  		break;  	}  	if (ret) -		return 1; -	return 0; +		return TRACE_TYPE_HANDLED; +	return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t mmio_print_mark(struct trace_iterator *iter) +{ +	struct trace_entry *entry = iter->ent; +	struct print_entry *print = (struct print_entry *)entry; +	const char *msg		= print->buf; +	struct trace_seq *s	= &iter->seq; +	unsigned long long t	= ns2usecs(iter->ts); +	unsigned long usec_rem	= do_div(t, 1000000ULL); +	unsigned secs		= (unsigned long)t; +	int ret; + +	/* The trailing newline must be in the message. 
*/ +	ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE; + +	if (entry->flags & TRACE_FLAG_CONT) +		trace_seq_print_cont(s, iter); + +	return TRACE_TYPE_HANDLED;  } -/* return 0 to abort printing without consuming current entry in pipe mode */ -static int mmio_print_line(struct trace_iterator *iter) +static enum print_line_t mmio_print_line(struct trace_iterator *iter)  {  	switch (iter->ent->type) {  	case TRACE_MMIO_RW:  		return mmio_print_rw(iter);  	case TRACE_MMIO_MAP:  		return mmio_print_map(iter); +	case TRACE_PRINT: +		return mmio_print_mark(iter);  	default: -		return 1; /* ignore unknown entries */ +		return TRACE_TYPE_HANDLED; /* ignore unknown entries */  	}  } @@ -276,6 +311,27 @@ __init static int init_mmio_trace(void)  }  device_initcall(init_mmio_trace); +static void __trace_mmiotrace_rw(struct trace_array *tr, +				struct trace_array_cpu *data, +				struct mmiotrace_rw *rw) +{ +	struct ring_buffer_event *event; +	struct trace_mmiotrace_rw *entry; +	unsigned long irq_flags; + +	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +					   &irq_flags); +	if (!event) +		return; +	entry	= ring_buffer_event_data(event); +	tracing_generic_entry_update(&entry->ent, 0, preempt_count()); +	entry->ent.type			= TRACE_MMIO_RW; +	entry->rw			= *rw; +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + +	trace_wake_up(); +} +  void mmio_trace_rw(struct mmiotrace_rw *rw)  {  	struct trace_array *tr = mmio_trace_array; @@ -283,6 +339,27 @@ void mmio_trace_rw(struct mmiotrace_rw *rw)  	__trace_mmiotrace_rw(tr, data, rw);  } +static void __trace_mmiotrace_map(struct trace_array *tr, +				struct trace_array_cpu *data, +				struct mmiotrace_map *map) +{ +	struct ring_buffer_event *event; +	struct trace_mmiotrace_map *entry; +	unsigned long irq_flags; + +	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +					   &irq_flags); +	if (!event) +		return; +	entry	= ring_buffer_event_data(event); +	tracing_generic_entry_update(&entry->ent, 0, preempt_count()); +	entry->ent.type			= TRACE_MMIO_MAP; +	entry->map			= *map; +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + +	trace_wake_up(); +} +  void mmio_trace_mapping(struct mmiotrace_map *map)  {  	struct trace_array *tr = mmio_trace_array; @@ -293,3 +370,8 @@ void mmio_trace_mapping(struct mmiotrace_map *map)  	__trace_mmiotrace_map(tr, data, map);  	preempt_enable();  } + +int mmio_trace_printk(const char *fmt, va_list args) +{ +	return trace_vprintk(0, fmt, args); +} diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c new file mode 100644 index 00000000000..4592b486251 --- /dev/null +++ b/kernel/trace/trace_nop.c @@ -0,0 +1,64 @@ +/* + * nop tracer + * + * Copyright (C) 2008 Steven Noonan <steven@uplinklabs.net> + * + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> + +#include "trace.h" + +static struct trace_array	*ctx_trace; + +static void start_nop_trace(struct trace_array *tr) +{ +	/* Nothing to do! */ +} + +static void stop_nop_trace(struct trace_array *tr) +{ +	/* Nothing to do! 
*/ +} + +static void nop_trace_init(struct trace_array *tr) +{ +	int cpu; +	ctx_trace = tr; + +	for_each_online_cpu(cpu) +		tracing_reset(tr, cpu); + +	if (tr->ctrl) +		start_nop_trace(tr); +} + +static void nop_trace_reset(struct trace_array *tr) +{ +	if (tr->ctrl) +		stop_nop_trace(tr); +} + +static void nop_trace_ctrl_update(struct trace_array *tr) +{ +	/* When starting a new trace, reset the buffers */ +	if (tr->ctrl) +		start_nop_trace(tr); +	else +		stop_nop_trace(tr); +} + +struct tracer nop_trace __read_mostly = +{ +	.name		= "nop", +	.init		= nop_trace_init, +	.reset		= nop_trace_reset, +	.ctrl_update	= nop_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST +	.selftest	= trace_selftest_startup_nop, +#endif +}; + diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index cb817a209aa..b8f56beb1a6 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -9,8 +9,8 @@  #include <linux/debugfs.h>  #include <linux/kallsyms.h>  #include <linux/uaccess.h> -#include <linux/marker.h>  #include <linux/ftrace.h> +#include <trace/sched.h>  #include "trace.h" @@ -19,15 +19,16 @@ static int __read_mostly	tracer_enabled;  static atomic_t			sched_ref;  static void -sched_switch_func(void *private, void *__rq, struct task_struct *prev, +probe_sched_switch(struct rq *__rq, struct task_struct *prev,  			struct task_struct *next)  { -	struct trace_array **ptr = private; -	struct trace_array *tr = *ptr;  	struct trace_array_cpu *data;  	unsigned long flags; -	long disabled;  	int cpu; +	int pc; + +	if (!atomic_read(&sched_ref)) +		return;  	tracing_record_cmdline(prev);  	tracing_record_cmdline(next); @@ -35,97 +36,41 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev,  	if (!tracer_enabled)  		return; +	pc = preempt_count();  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; -	disabled = atomic_inc_return(&data->disabled); +	data = ctx_trace->data[cpu]; -	if (likely(disabled == 1)) -		tracing_sched_switch_trace(tr, data, prev, next, flags); +	if (likely(!atomic_read(&data->disabled))) +		tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc); -	atomic_dec(&data->disabled);  	local_irq_restore(flags);  } -static notrace void -sched_switch_callback(void *probe_data, void *call_data, -		      const char *format, va_list *args) -{ -	struct task_struct *prev; -	struct task_struct *next; -	struct rq *__rq; - -	if (!atomic_read(&sched_ref)) -		return; - -	/* skip prev_pid %d next_pid %d prev_state %ld */ -	(void)va_arg(*args, int); -	(void)va_arg(*args, int); -	(void)va_arg(*args, long); -	__rq = va_arg(*args, typeof(__rq)); -	prev = va_arg(*args, typeof(prev)); -	next = va_arg(*args, typeof(next)); - -	/* -	 * If tracer_switch_func only points to the local -	 * switch func, it still needs the ptr passed to it. 
-	 */ -	sched_switch_func(probe_data, __rq, prev, next); -} -  static void -wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct -			task_struct *curr) +probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee)  { -	struct trace_array **ptr = private; -	struct trace_array *tr = *ptr;  	struct trace_array_cpu *data;  	unsigned long flags; -	long disabled; -	int cpu; +	int cpu, pc; -	if (!tracer_enabled) +	if (!likely(tracer_enabled))  		return; -	tracing_record_cmdline(curr); +	pc = preempt_count(); +	tracing_record_cmdline(current);  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; -	disabled = atomic_inc_return(&data->disabled); +	data = ctx_trace->data[cpu]; -	if (likely(disabled == 1)) -		tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); +	if (likely(!atomic_read(&data->disabled))) +		tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, +					   flags, pc); -	atomic_dec(&data->disabled);  	local_irq_restore(flags);  } -static notrace void -wake_up_callback(void *probe_data, void *call_data, -		 const char *format, va_list *args) -{ -	struct task_struct *curr; -	struct task_struct *task; -	struct rq *__rq; - -	if (likely(!tracer_enabled)) -		return; - -	/* Skip pid %d state %ld */ -	(void)va_arg(*args, int); -	(void)va_arg(*args, long); -	/* now get the meat: "rq %p task %p rq->curr %p" */ -	__rq = va_arg(*args, typeof(__rq)); -	task = va_arg(*args, typeof(task)); -	curr = va_arg(*args, typeof(curr)); - -	tracing_record_cmdline(task); -	tracing_record_cmdline(curr); - -	wakeup_func(probe_data, __rq, task, curr); -} -  static void sched_switch_reset(struct trace_array *tr)  {  	int cpu; @@ -133,67 +78,47 @@ static void sched_switch_reset(struct trace_array *tr)  	tr->time_start = ftrace_now(tr->cpu);  	for_each_online_cpu(cpu) -		tracing_reset(tr->data[cpu]); +		tracing_reset(tr, cpu);  }  static int tracing_sched_register(void)  {  	int ret; -	ret = marker_probe_register("kernel_sched_wakeup", -			"pid %d state %ld ## rq %p task %p rq->curr %p", -			wake_up_callback, -			&ctx_trace); +	ret = register_trace_sched_wakeup(probe_sched_wakeup);  	if (ret) { -		pr_info("wakeup trace: Couldn't add marker" +		pr_info("wakeup trace: Couldn't activate tracepoint"  			" probe to kernel_sched_wakeup\n");  		return ret;  	} -	ret = marker_probe_register("kernel_sched_wakeup_new", -			"pid %d state %ld ## rq %p task %p rq->curr %p", -			wake_up_callback, -			&ctx_trace); +	ret = register_trace_sched_wakeup_new(probe_sched_wakeup);  	if (ret) { -		pr_info("wakeup trace: Couldn't add marker" +		pr_info("wakeup trace: Couldn't activate tracepoint"  			" probe to kernel_sched_wakeup_new\n");  		goto fail_deprobe;  	} -	ret = marker_probe_register("kernel_sched_schedule", -		"prev_pid %d next_pid %d prev_state %ld " -		"## rq %p prev %p next %p", -		sched_switch_callback, -		&ctx_trace); +	ret = register_trace_sched_switch(probe_sched_switch);  	if (ret) { -		pr_info("sched trace: Couldn't add marker" +		pr_info("sched trace: Couldn't activate tracepoint"  			" probe to kernel_sched_schedule\n");  		goto fail_deprobe_wake_new;  	}  	return ret;  fail_deprobe_wake_new: -	marker_probe_unregister("kernel_sched_wakeup_new", -				wake_up_callback, -				&ctx_trace); +	unregister_trace_sched_wakeup_new(probe_sched_wakeup);  fail_deprobe: -	marker_probe_unregister("kernel_sched_wakeup", -				wake_up_callback, -				&ctx_trace); +	unregister_trace_sched_wakeup(probe_sched_wakeup);  	return ret;  }  static void tracing_sched_unregister(void)  { -	
marker_probe_unregister("kernel_sched_schedule", -				sched_switch_callback, -				&ctx_trace); -	marker_probe_unregister("kernel_sched_wakeup_new", -				wake_up_callback, -				&ctx_trace); -	marker_probe_unregister("kernel_sched_wakeup", -				wake_up_callback, -				&ctx_trace); +	unregister_trace_sched_switch(probe_sched_switch); +	unregister_trace_sched_wakeup_new(probe_sched_wakeup); +	unregister_trace_sched_wakeup(probe_sched_wakeup);  }  static void tracing_start_sched_switch(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e303ccb62cd..fe4a252c236 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@  #include <linux/kallsyms.h>  #include <linux/uaccess.h>  #include <linux/ftrace.h> -#include <linux/marker.h> +#include <trace/sched.h>  #include "trace.h" @@ -44,10 +44,12 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)  	long disabled;  	int resched;  	int cpu; +	int pc;  	if (likely(!wakeup_task))  		return; +	pc = preempt_count();  	resched = need_resched();  	preempt_disable_notrace(); @@ -70,7 +72,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)  	if (task_cpu(wakeup_task) != cpu)  		goto unlock; -	trace_function(tr, data, ip, parent_ip, flags); +	trace_function(tr, data, ip, parent_ip, flags, pc);   unlock:  	__raw_spin_unlock(&wakeup_lock); @@ -112,17 +114,18 @@ static int report_latency(cycle_t delta)  }  static void notrace -wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, +probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,  	struct task_struct *next)  {  	unsigned long latency = 0, t0 = 0, t1 = 0; -	struct trace_array **ptr = private; -	struct trace_array *tr = *ptr;  	struct trace_array_cpu *data;  	cycle_t T0, T1, delta;  	unsigned long flags;  	long disabled;  	int cpu; +	int pc; + +	tracing_record_cmdline(prev);  	if (unlikely(!tracer_enabled))  		return; @@ -139,12 +142,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,  	if (next != wakeup_task)  		return; +	pc = preempt_count(); +  	/* The task we are waiting for is waking up */ -	data = tr->data[wakeup_cpu]; +	data = wakeup_trace->data[wakeup_cpu];  	/* disable local data, not wakeup_cpu data */  	cpu = raw_smp_processor_id(); -	disabled = atomic_inc_return(&tr->data[cpu]->disabled); +	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);  	if (likely(disabled != 1))  		goto out; @@ -155,7 +160,7 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,  	if (unlikely(!tracer_enabled || next != wakeup_task))  		goto out_unlock; -	trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); +	trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);  	/*  	 * usecs conversion is slow so we try to delay the conversion @@ -174,39 +179,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,  	t0 = nsecs_to_usecs(T0);  	t1 = nsecs_to_usecs(T1); -	update_max_tr(tr, wakeup_task, wakeup_cpu); +	update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);  out_unlock: -	__wakeup_reset(tr); +	__wakeup_reset(wakeup_trace);  	__raw_spin_unlock(&wakeup_lock);  	local_irq_restore(flags);  out: -	atomic_dec(&tr->data[cpu]->disabled); -} - -static notrace void -sched_switch_callback(void *probe_data, void *call_data, -		      const char *format, va_list *args) -{ -	struct task_struct *prev; -	struct task_struct *next; -	struct rq *__rq; - -	/* skip prev_pid %d next_pid %d prev_state 
%ld */ -	(void)va_arg(*args, int); -	(void)va_arg(*args, int); -	(void)va_arg(*args, long); -	__rq = va_arg(*args, typeof(__rq)); -	prev = va_arg(*args, typeof(prev)); -	next = va_arg(*args, typeof(next)); - -	tracing_record_cmdline(prev); - -	/* -	 * If tracer_switch_func only points to the local -	 * switch func, it still needs the ptr passed to it. -	 */ -	wakeup_sched_switch(probe_data, __rq, prev, next); +	atomic_dec(&wakeup_trace->data[cpu]->disabled);  }  static void __wakeup_reset(struct trace_array *tr) @@ -216,7 +196,7 @@ static void __wakeup_reset(struct trace_array *tr)  	for_each_possible_cpu(cpu) {  		data = tr->data[cpu]; -		tracing_reset(data); +		tracing_reset(tr, cpu);  	}  	wakeup_cpu = -1; @@ -240,19 +220,26 @@ static void wakeup_reset(struct trace_array *tr)  }  static void -wakeup_check_start(struct trace_array *tr, struct task_struct *p, -		   struct task_struct *curr) +probe_wakeup(struct rq *rq, struct task_struct *p)  {  	int cpu = smp_processor_id();  	unsigned long flags;  	long disabled; +	int pc; + +	if (likely(!tracer_enabled)) +		return; + +	tracing_record_cmdline(p); +	tracing_record_cmdline(current);  	if (likely(!rt_task(p)) ||  			p->prio >= wakeup_prio || -			p->prio >= curr->prio) +			p->prio >= current->prio)  		return; -	disabled = atomic_inc_return(&tr->data[cpu]->disabled); +	pc = preempt_count(); +	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);  	if (unlikely(disabled != 1))  		goto out; @@ -264,7 +251,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,  		goto out_locked;  	/* reset the trace */ -	__wakeup_reset(tr); +	__wakeup_reset(wakeup_trace);  	wakeup_cpu = task_cpu(p);  	wakeup_prio = p->prio; @@ -274,74 +261,37 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,  	local_save_flags(flags); -	tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); -	trace_function(tr, tr->data[wakeup_cpu], -		       CALLER_ADDR1, CALLER_ADDR2, flags); +	wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); +	trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], +		       CALLER_ADDR1, CALLER_ADDR2, flags, pc);  out_locked:  	__raw_spin_unlock(&wakeup_lock);  out: -	atomic_dec(&tr->data[cpu]->disabled); -} - -static notrace void -wake_up_callback(void *probe_data, void *call_data, -		 const char *format, va_list *args) -{ -	struct trace_array **ptr = probe_data; -	struct trace_array *tr = *ptr; -	struct task_struct *curr; -	struct task_struct *task; -	struct rq *__rq; - -	if (likely(!tracer_enabled)) -		return; - -	/* Skip pid %d state %ld */ -	(void)va_arg(*args, int); -	(void)va_arg(*args, long); -	/* now get the meat: "rq %p task %p rq->curr %p" */ -	__rq = va_arg(*args, typeof(__rq)); -	task = va_arg(*args, typeof(task)); -	curr = va_arg(*args, typeof(curr)); - -	tracing_record_cmdline(task); -	tracing_record_cmdline(curr); - -	wakeup_check_start(tr, task, curr); +	atomic_dec(&wakeup_trace->data[cpu]->disabled);  }  static void start_wakeup_tracer(struct trace_array *tr)  {  	int ret; -	ret = marker_probe_register("kernel_sched_wakeup", -			"pid %d state %ld ## rq %p task %p rq->curr %p", -			wake_up_callback, -			&wakeup_trace); +	ret = register_trace_sched_wakeup(probe_wakeup);  	if (ret) { -		pr_info("wakeup trace: Couldn't add marker" +		pr_info("wakeup trace: Couldn't activate tracepoint"  			" probe to kernel_sched_wakeup\n");  		return;  	} -	ret = marker_probe_register("kernel_sched_wakeup_new", -			"pid %d state %ld ## rq %p task %p rq->curr %p", -			
wake_up_callback, -			&wakeup_trace); +	ret = register_trace_sched_wakeup_new(probe_wakeup);  	if (ret) { -		pr_info("wakeup trace: Couldn't add marker" +		pr_info("wakeup trace: Couldn't activate tracepoint"  			" probe to kernel_sched_wakeup_new\n");  		goto fail_deprobe;  	} -	ret = marker_probe_register("kernel_sched_schedule", -		"prev_pid %d next_pid %d prev_state %ld " -		"## rq %p prev %p next %p", -		sched_switch_callback, -		&wakeup_trace); +	ret = register_trace_sched_switch(probe_wakeup_sched_switch);  	if (ret) { -		pr_info("sched trace: Couldn't add marker" +		pr_info("sched trace: Couldn't activate tracepoint"  			" probe to kernel_sched_schedule\n");  		goto fail_deprobe_wake_new;  	} @@ -363,28 +313,18 @@ static void start_wakeup_tracer(struct trace_array *tr)  	return;  fail_deprobe_wake_new: -	marker_probe_unregister("kernel_sched_wakeup_new", -				wake_up_callback, -				&wakeup_trace); +	unregister_trace_sched_wakeup_new(probe_wakeup);  fail_deprobe: -	marker_probe_unregister("kernel_sched_wakeup", -				wake_up_callback, -				&wakeup_trace); +	unregister_trace_sched_wakeup(probe_wakeup);  }  static void stop_wakeup_tracer(struct trace_array *tr)  {  	tracer_enabled = 0;  	unregister_ftrace_function(&trace_ops); -	marker_probe_unregister("kernel_sched_schedule", -				sched_switch_callback, -				&wakeup_trace); -	marker_probe_unregister("kernel_sched_wakeup_new", -				wake_up_callback, -				&wakeup_trace); -	marker_probe_unregister("kernel_sched_wakeup", -				wake_up_callback, -				&wakeup_trace); +	unregister_trace_sched_switch(probe_wakeup_sched_switch); +	unregister_trace_sched_wakeup_new(probe_wakeup); +	unregister_trace_sched_wakeup(probe_wakeup);  }  static void wakeup_tracer_init(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 0911b7e073b..09cf230d7ec 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -9,65 +9,29 @@ static inline int trace_valid_entry(struct trace_entry *entry)  	case TRACE_FN:  	case TRACE_CTX:  	case TRACE_WAKE: +	case TRACE_CONT:  	case TRACE_STACK: +	case TRACE_PRINT:  	case TRACE_SPECIAL:  		return 1;  	}  	return 0;  } -static int -trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) +static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)  { -	struct trace_entry *entries; -	struct page *page; -	int idx = 0; -	int i; +	struct ring_buffer_event *event; +	struct trace_entry *entry; -	BUG_ON(list_empty(&data->trace_pages)); -	page = list_entry(data->trace_pages.next, struct page, lru); -	entries = page_address(page); +	while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { +		entry = ring_buffer_event_data(event); -	check_pages(data); -	if (head_page(data) != entries) -		goto failed; - -	/* -	 * The starting trace buffer always has valid elements, -	 * if any element exists. -	 */ -	entries = head_page(data); - -	for (i = 0; i < tr->entries; i++) { - -		if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { +		if (!trace_valid_entry(entry)) {  			printk(KERN_CONT ".. invalid entry %d ", -				entries[idx].type); +				entry->type);  			goto failed;  		} - -		idx++; -		if (idx >= ENTRIES_PER_PAGE) { -			page = virt_to_page(entries); -			if (page->lru.next == &data->trace_pages) { -				if (i != tr->entries - 1) { -					printk(KERN_CONT ".. 
entries buffer mismatch"); -					goto failed; -				} -			} else { -				page = list_entry(page->lru.next, struct page, lru); -				entries = page_address(page); -			} -			idx = 0; -		}  	} - -	page = virt_to_page(entries); -	if (page->lru.next != &data->trace_pages) { -		printk(KERN_CONT ".. too many entries"); -		goto failed; -	} -  	return 0;   failed: @@ -89,13 +53,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)  	/* Don't allow flipping of max traces now */  	raw_local_irq_save(flags);  	__raw_spin_lock(&ftrace_max_lock); -	for_each_possible_cpu(cpu) { -		if (!head_page(tr->data[cpu])) -			continue; -		cnt += tr->data[cpu]->trace_idx; +	cnt = ring_buffer_entries(tr->buffer); -		ret = trace_test_buffer_cpu(tr, tr->data[cpu]); +	for_each_possible_cpu(cpu) { +		ret = trace_test_buffer_cpu(tr, cpu);  		if (ret)  			break;  	} @@ -120,11 +82,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  					   struct trace_array *tr,  					   int (*func)(void))  { -	unsigned long count; -	int ret;  	int save_ftrace_enabled = ftrace_enabled;  	int save_tracer_enabled = tracer_enabled; +	unsigned long count;  	char *func_name; +	int ret;  	/* The ftrace test PASSED */  	printk(KERN_CONT "PASSED\n"); @@ -157,6 +119,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	/* enable tracing */  	tr->ctrl = 1;  	trace->init(tr); +  	/* Sleep for a 1/10 of a second */  	msleep(100); @@ -212,10 +175,10 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  int  trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  { -	unsigned long count; -	int ret;  	int save_ftrace_enabled = ftrace_enabled;  	int save_tracer_enabled = tracer_enabled; +	unsigned long count; +	int ret;  	/* make sure msleep has been recorded */  	msleep(1); @@ -415,6 +378,15 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  }  #endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ +#ifdef CONFIG_NOP_TRACER +int +trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) +{ +	/* What could possibly go wrong? */ +	return 0; +} +#endif +  #ifdef CONFIG_SCHED_TRACER  static int trace_wakeup_test_thread(void *data)  { @@ -486,6 +458,9 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)  	wake_up_process(p); +	/* give a little time to let the thread wake up */ +	msleep(100); +  	/* stop the tracing. */  	tr->ctrl = 0;  	trace->ctrl_update(tr); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c new file mode 100644 index 00000000000..74c5d9a3afa --- /dev/null +++ b/kernel/trace/trace_stack.c @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/stacktrace.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/fs.h> +#include "trace.h" + +#define STACK_TRACE_ENTRIES 500 + +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = +	 { [0 ... 
(STACK_TRACE_ENTRIES)] = ULONG_MAX }; +static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; + +static struct stack_trace max_stack_trace = { +	.max_entries		= STACK_TRACE_ENTRIES, +	.entries		= stack_dump_trace, +}; + +static unsigned long max_stack_size; +static raw_spinlock_t max_stack_lock = +	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +static int stack_trace_disabled __read_mostly; +static DEFINE_PER_CPU(int, trace_active); + +static inline void check_stack(void) +{ +	unsigned long this_size, flags; +	unsigned long *p, *top, *start; +	int i; + +	this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); +	this_size = THREAD_SIZE - this_size; + +	if (this_size <= max_stack_size) +		return; + +	raw_local_irq_save(flags); +	__raw_spin_lock(&max_stack_lock); + +	/* a race could have already updated it */ +	if (this_size <= max_stack_size) +		goto out; + +	max_stack_size = this_size; + +	max_stack_trace.nr_entries	= 0; +	max_stack_trace.skip		= 3; + +	save_stack_trace(&max_stack_trace); + +	/* +	 * Now find where in the stack these are. +	 */ +	i = 0; +	start = &this_size; +	top = (unsigned long *) +		(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); + +	/* +	 * Loop through all the entries. One of the entries may +	 * for some reason be missed on the stack, so we may +	 * have to account for them. If they are all there, this +	 * loop will only happen once. This code only takes place +	 * on a new max, so it is far from a fast path. +	 */ +	while (i < max_stack_trace.nr_entries) { + +		stack_dump_index[i] = this_size; +		p = start; + +		for (; p < top && i < max_stack_trace.nr_entries; p++) { +			if (*p == stack_dump_trace[i]) { +				this_size = stack_dump_index[i++] = +					(top - p) * sizeof(unsigned long); +				/* Start the search from here */ +				start = p + 1; +			} +		} + +		i++; +	} + + out: +	__raw_spin_unlock(&max_stack_lock); +	raw_local_irq_restore(flags); +} + +static void +stack_trace_call(unsigned long ip, unsigned long parent_ip) +{ +	int cpu, resched; + +	if (unlikely(!ftrace_enabled || stack_trace_disabled)) +		return; + +	resched = need_resched(); +	preempt_disable_notrace(); + +	cpu = raw_smp_processor_id(); +	/* no atomic needed, we only modify this variable by this cpu */ +	if (per_cpu(trace_active, cpu)++ != 0) +		goto out; + +	check_stack(); + + out: +	per_cpu(trace_active, cpu)--; +	/* prevent recursion in schedule */ +	if (resched) +		preempt_enable_no_resched_notrace(); +	else +		preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ +	.func = stack_trace_call, +}; + +static ssize_t +stack_max_size_read(struct file *filp, char __user *ubuf, +		    size_t count, loff_t *ppos) +{ +	unsigned long *ptr = filp->private_data; +	char buf[64]; +	int r; + +	r = snprintf(buf, sizeof(buf), "%ld\n", *ptr); +	if (r > sizeof(buf)) +		r = sizeof(buf); +	return simple_read_from_buffer(ubuf, count, ppos, buf, r); +} + +static ssize_t +stack_max_size_write(struct file *filp, const char __user *ubuf, +		     size_t count, loff_t *ppos) +{ +	long *ptr = filp->private_data; +	unsigned long val, flags; +	char buf[64]; +	int ret; + +	if (count >= sizeof(buf)) +		return -EINVAL; + +	if (copy_from_user(&buf, ubuf, count)) +		return -EFAULT; + +	buf[count] = 0; + +	ret = strict_strtoul(buf, 10, &val); +	if (ret < 0) +		return ret; + +	raw_local_irq_save(flags); +	__raw_spin_lock(&max_stack_lock); +	*ptr = val; +	__raw_spin_unlock(&max_stack_lock); +	raw_local_irq_restore(flags); + +	return count; +} + +static struct file_operations 
stack_max_size_fops = { +	.open		= tracing_open_generic, +	.read		= stack_max_size_read, +	.write		= stack_max_size_write, +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ +	long i = (long)m->private; + +	(*pos)++; + +	i++; + +	if (i >= max_stack_trace.nr_entries || +	    stack_dump_trace[i] == ULONG_MAX) +		return NULL; + +	m->private = (void *)i; + +	return &m->private; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ +	void *t = &m->private; +	loff_t l = 0; + +	local_irq_disable(); +	__raw_spin_lock(&max_stack_lock); + +	for (; t && l < *pos; t = t_next(m, t, &l)) +		; + +	return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ +	__raw_spin_unlock(&max_stack_lock); +	local_irq_enable(); +} + +static int trace_lookup_stack(struct seq_file *m, long i) +{ +	unsigned long addr = stack_dump_trace[i]; +#ifdef CONFIG_KALLSYMS +	char str[KSYM_SYMBOL_LEN]; + +	sprint_symbol(str, addr); + +	return seq_printf(m, "%s\n", str); +#else +	return seq_printf(m, "%p\n", (void*)addr); +#endif +} + +static int t_show(struct seq_file *m, void *v) +{ +	long i = *(long *)v; +	int size; + +	if (i < 0) { +		seq_printf(m, "        Depth   Size      Location" +			   "    (%d entries)\n" +			   "        -----   ----      --------\n", +			   max_stack_trace.nr_entries); +		return 0; +	} + +	if (i >= max_stack_trace.nr_entries || +	    stack_dump_trace[i] == ULONG_MAX) +		return 0; + +	if (i+1 == max_stack_trace.nr_entries || +	    stack_dump_trace[i+1] == ULONG_MAX) +		size = stack_dump_index[i]; +	else +		size = stack_dump_index[i] - stack_dump_index[i+1]; + +	seq_printf(m, "%3ld) %8d   %5d   ", i, stack_dump_index[i], size); + +	trace_lookup_stack(m, i); + +	return 0; +} + +static struct seq_operations stack_trace_seq_ops = { +	.start		= t_start, +	.next		= t_next, +	.stop		= t_stop, +	.show		= t_show, +}; + +static int stack_trace_open(struct inode *inode, struct file *file) +{ +	int ret; + +	ret = seq_open(file, &stack_trace_seq_ops); +	if (!ret) { +		struct seq_file *m = file->private_data; +		m->private = (void *)-1; +	} + +	return ret; +} + +static struct file_operations stack_trace_fops = { +	.open		= stack_trace_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +}; + +static __init int stack_trace_init(void) +{ +	struct dentry *d_tracer; +	struct dentry *entry; + +	d_tracer = tracing_init_dentry(); + +	entry = debugfs_create_file("stack_max_size", 0644, d_tracer, +				    &max_stack_size, &stack_max_size_fops); +	if (!entry) +		pr_warning("Could not create debugfs 'stack_max_size' entry\n"); + +	entry = debugfs_create_file("stack_trace", 0444, d_tracer, +				    NULL, &stack_trace_fops); +	if (!entry) +		pr_warning("Could not create debugfs 'stack_trace' entry\n"); + +	register_ftrace_function(&trace_ops); + +	return 0; +} + +device_initcall(stack_trace_init); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index db58fb66a13..9587d3bcba5 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -241,7 +241,7 @@ static void stack_reset(struct trace_array *tr)  	tr->time_start = ftrace_now(tr->cpu);  	for_each_online_cpu(cpu) -		tracing_reset(tr->data[cpu]); +		tracing_reset(tr, cpu);  }  static void start_stack_trace(struct trace_array *tr) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c new file mode 100644 index 00000000000..f2b7c28a470 --- /dev/null +++ b/kernel/tracepoint.c @@ -0,0 +1,477 @@ +/* + * Copyright (C) 2008 Mathieu Desnoyers + * + * This program is free software; you can redistribute it 
and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/jhash.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/tracepoint.h> +#include <linux/err.h> +#include <linux/slab.h> + +extern struct tracepoint __start___tracepoints[]; +extern struct tracepoint __stop___tracepoints[]; + +/* Set to 1 to enable tracepoint debug output */ +static const int tracepoint_debug; + +/* + * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the + * builtin and module tracepoints and the hash table. + */ +static DEFINE_MUTEX(tracepoints_mutex); + +/* + * Tracepoint hash table, containing the active tracepoints. + * Protected by tracepoints_mutex. + */ +#define TRACEPOINT_HASH_BITS 6 +#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) + +/* + * Note about RCU : + * It is used to to delay the free of multiple probes array until a quiescent + * state is reached. + * Tracepoint entries modifications are protected by the tracepoints_mutex. + */ +struct tracepoint_entry { +	struct hlist_node hlist; +	void **funcs; +	int refcount;	/* Number of times armed. 0 if disarmed. 
*/ +	struct rcu_head rcu; +	void *oldptr; +	unsigned char rcu_pending:1; +	char name[0]; +}; + +static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; + +static void free_old_closure(struct rcu_head *head) +{ +	struct tracepoint_entry *entry = container_of(head, +		struct tracepoint_entry, rcu); +	kfree(entry->oldptr); +	/* Make sure we free the data before setting the pending flag to 0 */ +	smp_wmb(); +	entry->rcu_pending = 0; +} + +static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) +{ +	if (!old) +		return; +	entry->oldptr = old; +	entry->rcu_pending = 1; +	/* write rcu_pending before calling the RCU callback */ +	smp_wmb(); +	call_rcu_sched(&entry->rcu, free_old_closure); +} + +static void debug_print_probes(struct tracepoint_entry *entry) +{ +	int i; + +	if (!tracepoint_debug) +		return; + +	for (i = 0; entry->funcs[i]; i++) +		printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); +} + +static void * +tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) +{ +	int nr_probes = 0; +	void **old, **new; + +	WARN_ON(!probe); + +	debug_print_probes(entry); +	old = entry->funcs; +	if (old) { +		/* (N -> N+1), (N != 0, 1) probes */ +		for (nr_probes = 0; old[nr_probes]; nr_probes++) +			if (old[nr_probes] == probe) +				return ERR_PTR(-EEXIST); +	} +	/* + 2 : one for new probe, one for NULL func */ +	new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); +	if (new == NULL) +		return ERR_PTR(-ENOMEM); +	if (old) +		memcpy(new, old, nr_probes * sizeof(void *)); +	new[nr_probes] = probe; +	entry->refcount = nr_probes + 1; +	entry->funcs = new; +	debug_print_probes(entry); +	return old; +} + +static void * +tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) +{ +	int nr_probes = 0, nr_del = 0, i; +	void **old, **new; + +	old = entry->funcs; + +	debug_print_probes(entry); +	/* (N -> M), (N > 1, M >= 0) probes */ +	for (nr_probes = 0; old[nr_probes]; nr_probes++) { +		if ((!probe || old[nr_probes] == probe)) +			nr_del++; +	} + +	if (nr_probes - nr_del == 0) { +		/* N -> 0, (N > 1) */ +		entry->funcs = NULL; +		entry->refcount = 0; +		debug_print_probes(entry); +		return old; +	} else { +		int j = 0; +		/* N -> M, (N > 1, M > 0) */ +		/* + 1 for NULL */ +		new = kzalloc((nr_probes - nr_del + 1) +			* sizeof(void *), GFP_KERNEL); +		if (new == NULL) +			return ERR_PTR(-ENOMEM); +		for (i = 0; old[i]; i++) +			if ((probe && old[i] != probe)) +				new[j++] = old[i]; +		entry->refcount = nr_probes - nr_del; +		entry->funcs = new; +	} +	debug_print_probes(entry); +	return old; +} + +/* + * Get tracepoint if the tracepoint is present in the tracepoint hash table. + * Must be called with tracepoints_mutex held. + * Returns NULL if not present. + */ +static struct tracepoint_entry *get_tracepoint(const char *name) +{ +	struct hlist_head *head; +	struct hlist_node *node; +	struct tracepoint_entry *e; +	u32 hash = jhash(name, strlen(name), 0); + +	head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; +	hlist_for_each_entry(e, node, head, hlist) { +		if (!strcmp(name, e->name)) +			return e; +	} +	return NULL; +} + +/* + * Add the tracepoint to the tracepoint hash table. Must be called with + * tracepoints_mutex held. 
+ */ +static struct tracepoint_entry *add_tracepoint(const char *name) +{ +	struct hlist_head *head; +	struct hlist_node *node; +	struct tracepoint_entry *e; +	size_t name_len = strlen(name) + 1; +	u32 hash = jhash(name, name_len-1, 0); + +	head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; +	hlist_for_each_entry(e, node, head, hlist) { +		if (!strcmp(name, e->name)) { +			printk(KERN_NOTICE +				"tracepoint %s busy\n", name); +			return ERR_PTR(-EEXIST);	/* Already there */ +		} +	} +	/* +	 * Using kmalloc here to allocate a variable length element. Could +	 * cause some memory fragmentation if overused. +	 */ +	e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL); +	if (!e) +		return ERR_PTR(-ENOMEM); +	memcpy(&e->name[0], name, name_len); +	e->funcs = NULL; +	e->refcount = 0; +	e->rcu_pending = 0; +	hlist_add_head(&e->hlist, head); +	return e; +} + +/* + * Remove the tracepoint from the tracepoint hash table. Must be called with + * mutex_lock held. + */ +static int remove_tracepoint(const char *name) +{ +	struct hlist_head *head; +	struct hlist_node *node; +	struct tracepoint_entry *e; +	int found = 0; +	size_t len = strlen(name) + 1; +	u32 hash = jhash(name, len-1, 0); + +	head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; +	hlist_for_each_entry(e, node, head, hlist) { +		if (!strcmp(name, e->name)) { +			found = 1; +			break; +		} +	} +	if (!found) +		return -ENOENT; +	if (e->refcount) +		return -EBUSY; +	hlist_del(&e->hlist); +	/* Make sure the call_rcu_sched has been executed */ +	if (e->rcu_pending) +		rcu_barrier_sched(); +	kfree(e); +	return 0; +} + +/* + * Sets the probe callback corresponding to one tracepoint. + */ +static void set_tracepoint(struct tracepoint_entry **entry, +	struct tracepoint *elem, int active) +{ +	WARN_ON(strcmp((*entry)->name, elem->name) != 0); + +	/* +	 * rcu_assign_pointer has a smp_wmb() which makes sure that the new +	 * probe callbacks array is consistent before setting a pointer to it. +	 * This array is referenced by __DO_TRACE from +	 * include/linux/tracepoints.h. A matching smp_read_barrier_depends() +	 * is used. +	 */ +	rcu_assign_pointer(elem->funcs, (*entry)->funcs); +	elem->state = active; +} + +/* + * Disable a tracepoint and its probe callback. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by preempt_disable around the call site. + */ +static void disable_tracepoint(struct tracepoint *elem) +{ +	elem->state = 0; +} + +/** + * tracepoint_update_probe_range - Update a probe range + * @begin: beginning of the range + * @end: end of the range + * + * Updates the probe callback corresponding to a range of tracepoints. + */ +void tracepoint_update_probe_range(struct tracepoint *begin, +	struct tracepoint *end) +{ +	struct tracepoint *iter; +	struct tracepoint_entry *mark_entry; + +	mutex_lock(&tracepoints_mutex); +	for (iter = begin; iter < end; iter++) { +		mark_entry = get_tracepoint(iter->name); +		if (mark_entry) { +			set_tracepoint(&mark_entry, iter, +					!!mark_entry->refcount); +		} else { +			disable_tracepoint(iter); +		} +	} +	mutex_unlock(&tracepoints_mutex); +} + +/* + * Update probes, removing the faulty probes. + */ +static void tracepoint_update_probes(void) +{ +	/* Core kernel tracepoints */ +	tracepoint_update_probe_range(__start___tracepoints, +		__stop___tracepoints); +	/* tracepoints in modules. 
*/ +	module_update_tracepoints(); +} + +/** + * tracepoint_probe_register -  Connect a probe to a tracepoint + * @name: tracepoint name + * @probe: probe handler + * + * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. + */ +int tracepoint_probe_register(const char *name, void *probe) +{ +	struct tracepoint_entry *entry; +	int ret = 0; +	void *old; + +	mutex_lock(&tracepoints_mutex); +	entry = get_tracepoint(name); +	if (!entry) { +		entry = add_tracepoint(name); +		if (IS_ERR(entry)) { +			ret = PTR_ERR(entry); +			goto end; +		} +	} +	/* +	 * If we detect that a call_rcu_sched is pending for this tracepoint, +	 * make sure it's executed now. +	 */ +	if (entry->rcu_pending) +		rcu_barrier_sched(); +	old = tracepoint_entry_add_probe(entry, probe); +	if (IS_ERR(old)) { +		ret = PTR_ERR(old); +		goto end; +	} +	mutex_unlock(&tracepoints_mutex); +	tracepoint_update_probes();		/* may update entry */ +	mutex_lock(&tracepoints_mutex); +	entry = get_tracepoint(name); +	WARN_ON(!entry); +	if (entry->rcu_pending) +		rcu_barrier_sched(); +	tracepoint_entry_free_old(entry, old); +end: +	mutex_unlock(&tracepoints_mutex); +	return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_register); + +/** + * tracepoint_probe_unregister -  Disconnect a probe from a tracepoint + * @name: tracepoint name + * @probe: probe function pointer + * + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. + */ +int tracepoint_probe_unregister(const char *name, void *probe) +{ +	struct tracepoint_entry *entry; +	void *old; +	int ret = -ENOENT; + +	mutex_lock(&tracepoints_mutex); +	entry = get_tracepoint(name); +	if (!entry) +		goto end; +	if (entry->rcu_pending) +		rcu_barrier_sched(); +	old = tracepoint_entry_remove_probe(entry, probe); +	mutex_unlock(&tracepoints_mutex); +	tracepoint_update_probes();		/* may update entry */ +	mutex_lock(&tracepoints_mutex); +	entry = get_tracepoint(name); +	if (!entry) +		goto end; +	if (entry->rcu_pending) +		rcu_barrier_sched(); +	tracepoint_entry_free_old(entry, old); +	remove_tracepoint(name);	/* Ignore busy error message */ +	ret = 0; +end: +	mutex_unlock(&tracepoints_mutex); +	return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); + +/** + * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. + * @tracepoint: current tracepoints (in), next tracepoint (out) + * @begin: beginning of the range + * @end: end of the range + * + * Returns whether a next tracepoint has been found (1) or not (0). + * Will return the first tracepoint in the range if the input tracepoint is + * NULL. + */ +int tracepoint_get_iter_range(struct tracepoint **tracepoint, +	struct tracepoint *begin, struct tracepoint *end) +{ +	if (!*tracepoint && begin != end) { +		*tracepoint = begin; +		return 1; +	} +	if (*tracepoint >= begin && *tracepoint < end) +		return 1; +	return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_get_iter_range); + +static void tracepoint_get_iter(struct tracepoint_iter *iter) +{ +	int found = 0; + +	/* Core kernel tracepoints */ +	if (!iter->module) { +		found = tracepoint_get_iter_range(&iter->tracepoint, +				__start___tracepoints, __stop___tracepoints); +		if (found) +			goto end; +	} +	/* tracepoints in modules. 
*/ +	found = module_get_iter_tracepoints(iter); +end: +	if (!found) +		tracepoint_iter_reset(iter); +} + +void tracepoint_iter_start(struct tracepoint_iter *iter) +{ +	tracepoint_get_iter(iter); +} +EXPORT_SYMBOL_GPL(tracepoint_iter_start); + +void tracepoint_iter_next(struct tracepoint_iter *iter) +{ +	iter->tracepoint++; +	/* +	 * iter->tracepoint may be invalid because we blindly incremented it. +	 * Make sure it is valid by marshalling on the tracepoints, getting the +	 * tracepoints from following modules if necessary. +	 */ +	tracepoint_get_iter(iter); +} +EXPORT_SYMBOL_GPL(tracepoint_iter_next); + +void tracepoint_iter_stop(struct tracepoint_iter *iter) +{ +} +EXPORT_SYMBOL_GPL(tracepoint_iter_stop); + +void tracepoint_iter_reset(struct tracepoint_iter *iter) +{ +	iter->module = NULL; +	iter->tracepoint = NULL; +} +EXPORT_SYMBOL_GPL(tracepoint_iter_reset);  |
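
A note on using the tracepoint infrastructure added in kernel/tracepoint.c above: probes are attached and detached by tracepoint name through tracepoint_probe_register() and tracepoint_probe_unregister(), while in-tree users such as the converted wakeup tracer go through the type-checked register_trace_<name>()/unregister_trace_<name>() helpers generated from the tracepoint declarations. The sketch below shows only the generic register/unregister flow from a module; the tracepoint name "my_subsys_my_event", its single int argument, and the probe itself are illustrative assumptions, not part of this patch, and a real probe's signature has to match whatever the corresponding tracepoint declaration actually passes.

/*
 * Minimal sketch, not from this patch: attach a probe to a tracepoint by
 * name using the generic API from kernel/tracepoint.c.  The tracepoint
 * "my_subsys_my_event" and its int argument are hypothetical.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/tracepoint.h>

/* Called with the traced event's arguments whenever the tracepoint fires. */
static void my_probe(int value)
{
	printk(KERN_INFO "my_subsys_my_event fired: %d\n", value);
}

static int __init my_probe_init(void)
{
	/* Adds my_probe to the tracepoint's RCU-managed probe array. */
	return tracepoint_probe_register("my_subsys_my_event",
					 (void *)my_probe);
}

static void __exit my_probe_exit(void)
{
	/*
	 * Drops the probe again; as the comment on
	 * tracepoint_probe_unregister() notes, module unload itself relies
	 * on stop_machine() to guarantee no probe is still running.
	 */
	tracepoint_probe_unregister("my_subsys_my_event",
				    (void *)my_probe);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");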
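
The new stack tracer in kernel/trace/trace_stack.c reports its results through the two debugfs files it creates in the tracing directory: stack_max_size (the deepest kernel stack usage seen so far, writable to reset it) and stack_trace (the per-frame depth/size/location table produced by the seq_file code above). A small user-space sketch that dumps both follows; the /sys/kernel/debug mount point is the conventional debugfs location and is assumed here, as is a kernel built with this tracer and with ftrace enabled.

/*
 * User-space sketch under the assumptions above: read back the worst-case
 * kernel stack usage recorded by trace_stack.c.
 */
#include <stdio.h>

static void dump_file(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	/* Largest stack depth observed so far, in bytes. */
	dump_file("/sys/kernel/debug/tracing/stack_max_size");
	/* Depth/size/location breakdown of that worst-case stack. */
	dump_file("/sys/kernel/debug/tracing/stack_trace");
	return 0;
}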