Diffstat (limited to 'kernel')
65 files changed, 4971 insertions, 1710 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index ab4f1090f43..b3097bde4e9 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -4,3 +4,4 @@  config_data.h  config_data.gz  timeconst.h +hz.bc diff --git a/kernel/audit.c b/kernel/audit.c index d596e5355f1..9816a1b96cf 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	/* As soon as there's any sign of userspace auditd,  	 * start kauditd to talk to it */ -	if (!kauditd_task) +	if (!kauditd_task) {  		kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); -	if (IS_ERR(kauditd_task)) { -		err = PTR_ERR(kauditd_task); -		kauditd_task = NULL; -		return err; +		if (IS_ERR(kauditd_task)) { +			err = PTR_ERR(kauditd_task); +			kauditd_task = NULL; +			return err; +		}  	} -  	loginuid = audit_get_loginuid(current);  	sessionid = audit_get_sessionid(current);  	security_task_getsecid(current, &sid); diff --git a/kernel/audit.h b/kernel/audit.h index d51cba868e1..11468d99dad 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -59,10 +59,7 @@ struct audit_entry {  	struct audit_krule	rule;  }; -#ifdef CONFIG_AUDIT -extern int audit_enabled;  extern int audit_ever_enabled; -#endif  extern int audit_pid; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 642a89c4f3d..a291aa23fb3 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -617,9 +617,9 @@ void audit_trim_trees(void)  		}  		spin_unlock(&hash_lock);  		trim_marked(tree); -		put_tree(tree);  		drop_collected_mounts(root_mnt);  skip_it: +		put_tree(tree);  		mutex_lock(&audit_filter_mutex);  	}  	list_del(&cursor); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f9fc54bbe06..267436826c3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -594,6 +594,10 @@ exit_nofree:  	return entry;  exit_free: +	if (entry->rule.watch) +		audit_put_watch(entry->rule.watch); /* matches initial get */ +	if (entry->rule.tree) +		audit_put_tree(entry->rule.tree); /* that's the temporary one */  	audit_free_rule(entry);  	return ERR_PTR(err);  } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a371f857a0a..c68229411a7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context)  	}  } -static inline void audit_zero_context(struct audit_context *context, -				      enum audit_state state) -{ -	memset(context, 0, sizeof(*context)); -	context->state      = state; -	context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; -} -  static inline struct audit_context *audit_alloc_context(enum audit_state state)  {  	struct audit_context *context; -	if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) +	context = kzalloc(sizeof(*context), GFP_KERNEL); +	if (!context)  		return NULL; -	audit_zero_context(context, state); +	context->state = state; +	context->prio = state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0;  	INIT_LIST_HEAD(&context->killed_trees);  	INIT_LIST_HEAD(&context->names_list);  	return context; diff --git a/kernel/capability.c b/kernel/capability.c index 493d9725948..f6c2ce5701e 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap)  EXPORT_SYMBOL(ns_capable);  /** + * file_ns_capable - Determine if the file's opener had a capability in effect + * @file:  The file we want to check + * @ns:  The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if task that opened the file had a capability in effect + * when the file was opened. + * + * This does not set PF_SUPERPRIV because the caller may not + * actually be privileged. + */ +bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +{ +	if (WARN_ON_ONCE(!cap_valid(cap))) +		return false; + +	if (security_capable(file->f_cred, ns, cap) == 0) +		return true; + +	return false; +} +EXPORT_SYMBOL(file_ns_capable); + +/**   * capable - Determine if the current task has a superior capability in effect   * @cap: The capability to be tested for   * diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666..dfaf50d4705 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5416,55 +5416,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)  }  EXPORT_SYMBOL_GPL(css_lookup); -/** - * css_get_next - lookup next cgroup under specified hierarchy. - * @ss: pointer to subsystem - * @id: current position of iteration. - * @root: pointer to css. search tree under this. - * @foundid: position of found object. - * - * Search next css under the specified hierarchy of rootid. Calling under - * rcu_read_lock() is necessary. Returns NULL if it reaches the end. - */ -struct cgroup_subsys_state * -css_get_next(struct cgroup_subsys *ss, int id, -	     struct cgroup_subsys_state *root, int *foundid) -{ -	struct cgroup_subsys_state *ret = NULL; -	struct css_id *tmp; -	int tmpid; -	int rootid = css_id(root); -	int depth = css_depth(root); - -	if (!rootid) -		return NULL; - -	BUG_ON(!ss->use_id); -	WARN_ON_ONCE(!rcu_read_lock_held()); - -	/* fill start point for scan */ -	tmpid = id; -	while (1) { -		/* -		 * scan next entry from bitmap(tree), tmpid is updated after -		 * idr_get_next(). -		 */ -		tmp = idr_get_next(&ss->idr, &tmpid); -		if (!tmp) -			break; -		if (tmp->depth >= depth && tmp->stack[depth] == rootid) { -			ret = rcu_dereference(tmp->css); -			if (ret) { -				*foundid = tmpid; -				break; -			} -		} -		/* continue to scan from next id */ -		tmpid = tmpid + 1; -	} -	return ret; -} -  /*   * get corresponding css from file open on cgroupfs directory   */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecb..334d983a36b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2251,7 +2251,6 @@ void cpuset_update_active_cpus(bool cpu_online)  	schedule_work(&cpuset_hotplug_work);  } -#ifdef CONFIG_MEMORY_HOTPLUG  /*   * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].   * Call this routine anytime after node_states[N_MEMORY] changes. @@ -2263,20 +2262,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self,  	schedule_work(&cpuset_hotplug_work);  	return NOTIFY_OK;  } -#endif + +static struct notifier_block cpuset_track_online_nodes_nb = { +	.notifier_call = cpuset_track_online_nodes, +	.priority = 10,		/* ??! 
*/ +};  /**   * cpuset_init_smp - initialize cpus_allowed   *   * Description: Finish top cpuset after cpu, node maps are initialized - **/ - + */  void __init cpuset_init_smp(void)  {  	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);  	top_cpuset.mems_allowed = node_states[N_MEMORY]; -	hotplug_memory_notifier(cpuset_track_online_nodes, 10); +	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);  	cpuset_propagate_hotplug_wq =  		alloc_ordered_workqueue("cpuset_hotplug", 0); diff --git a/kernel/events/core.c b/kernel/events/core.c index b0cd86501c3..9fcb0944f07 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4434,12 +4434,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)  			if (ctxn < 0)  				goto next;  			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); +			if (ctx) +				perf_event_task_ctx(ctx, task_event);  		} -		if (ctx) -			perf_event_task_ctx(ctx, task_event);  next:  		put_cpu_ptr(pmu->pmu_cpu_context);  	} +	if (task_event->task_ctx) +		perf_event_task_ctx(task_event->task_ctx, task_event); +  	rcu_read_unlock();  } @@ -4593,6 +4596,7 @@ void perf_event_comm(struct task_struct *task)  	struct perf_event_context *ctx;  	int ctxn; +	rcu_read_lock();  	for_each_task_context_nr(ctxn) {  		ctx = task->perf_event_ctxp[ctxn];  		if (!ctx) @@ -4600,6 +4604,7 @@ void perf_event_comm(struct task_struct *task)  		perf_event_enable_on_exec(ctx);  	} +	rcu_read_unlock();  	if (!atomic_read(&nr_comm_events))  		return; @@ -4734,7 +4739,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)  	} else {  		if (arch_vma_name(mmap_event->vma)) {  			name = strncpy(tmp, arch_vma_name(mmap_event->vma), -				       sizeof(tmp)); +				       sizeof(tmp) - 1); +			tmp[sizeof(tmp) - 1] = '\0';  			goto got_name;  		} @@ -5327,7 +5333,7 @@ static void sw_perf_event_destroy(struct perf_event *event)  static int perf_swevent_init(struct perf_event *event)  { -	int event_id = event->attr.config; +	u64 event_id = event->attr.config;  	if (event->attr.type != PERF_TYPE_SOFTWARE)  		return -ENOENT; @@ -5647,6 +5653,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)  		event->attr.sample_period = NSEC_PER_SEC / freq;  		hwc->sample_period = event->attr.sample_period;  		local64_set(&hwc->period_left, hwc->sample_period); +		hwc->last_period = hwc->sample_period;  		event->attr.freq = 0;  	}  } @@ -5982,6 +5989,7 @@ skip_type:  	if (pmu->pmu_cpu_context)  		goto got_cpu_context; +	ret = -ENOMEM;  	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);  	if (!pmu->pmu_cpu_context)  		goto free_dev; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d56a64c99a8..eb675c4d59d 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -16,7 +16,7 @@ struct ring_buffer {  	int				page_order;	/* allocation order  */  #endif  	int				nr_pages;	/* nr of data pages  */ -	int				writable;	/* are we writable   */ +	int				overwrite;	/* can overwrite itself */  	atomic_t			poll;		/* POLL_ for wakeups */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 23cb34ff397..97fddb09762 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -18,12 +18,24 @@  static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,  			      unsigned long offset, unsigned long head)  { -	unsigned long mask; +	unsigned long sz = perf_data_size(rb); +	unsigned long mask = sz - 1; -	if (!rb->writable) +	/* +	 * check if user-writable +	 * overwrite : 
over-write its own tail +	 * !overwrite: buffer possibly drops events. +	 */ +	if (rb->overwrite)  		return true; -	mask = perf_data_size(rb) - 1; +	/* +	 * verify that payload is not bigger than buffer +	 * otherwise masking logic may fail to detect +	 * the "not enough space" condition +	 */ +	if ((head - offset) > sz) +		return false;  	offset = (offset - tail) & mask;  	head   = (head   - tail) & mask; @@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)  		rb->watermark = max_size / 2;  	if (flags & RING_BUFFER_WRITABLE) -		rb->writable = 1; +		rb->overwrite = 0; +	else +		rb->overwrite = 1;  	atomic_set(&rb->refcount, 1); diff --git a/kernel/exit.c b/kernel/exit.c index 51e485ca993..60bc027c61c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code)  	/*  	 * Make sure we are holding no locks:  	 */ -	debug_check_no_locks_held(); +	debug_check_no_locks_held(tsk);  	/*  	 * We can do this unlocked here. The futex code uses this flag  	 * just to verify whether the pi state cleanup has been done diff --git a/kernel/fork.c b/kernel/fork.c index 8d932b1c905..1766d324d5e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))  		return ERR_PTR(-EINVAL); +	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) +		return ERR_PTR(-EINVAL); +  	/*  	 * Thread groups must share signals as well, and detached threads  	 * can only be started up within the thread group. @@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)  	 * If unsharing a user namespace must also unshare the thread.  	 */  	if (unshare_flags & CLONE_NEWUSER) -		unshare_flags |= CLONE_THREAD; +		unshare_flags |= CLONE_THREAD | CLONE_FS;  	/*  	 * If unsharing a pid namespace must also unshare the thread.  	 */ diff --git a/kernel/futex.c b/kernel/futex.c index f0090a993da..b26dcfc02c9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -223,7 +223,8 @@ static void drop_futex_key_refs(union futex_key *key)   * @rw:		mapping needs to be read/write (values: VERIFY_READ,   *              VERIFY_WRITE)   * - * Returns a negative error code or 0 + * Return: a negative error code or 0 + *   * The key words are stored in *key on success.   *   * For shared mappings, it's (page->index, file_inode(vma->vm_file), @@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,   *			be "current" except in the case of requeue pi.   * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)   * - * Returns: - *  0 - ready to wait - *  1 - acquired the lock + * Return: + *  0 - ready to wait; + *  1 - acquired the lock;   * <0 - error   *   * The hb->lock and futex_key refs shall be held by the caller. @@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,   * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.   * hb1 and hb2 must be held by the caller.   * - * Returns: - *  0 - failed to acquire the lock atomicly - *  1 - acquired the lock + * Return: + *  0 - failed to acquire the lock atomically; + *  1 - acquired the lock;   * <0 - error   */  static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,   * Requeue waiters on uaddr1 to uaddr2. 
In the requeue_pi case, try to acquire   * uaddr2 atomically on behalf of the top waiter.   * - * Returns: - * >=0 - on success, the number of tasks requeued or woken + * Return: + * >=0 - on success, the number of tasks requeued or woken;   *  <0 - on error   */  static int futex_requeue(u32 __user *uaddr1, unsigned int flags, @@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)   * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must   * be paired with exactly one earlier call to queue_me().   * - * Returns: - *   1 - if the futex_q was still queued (and we removed unqueued it) + * Return: + *   1 - if the futex_q was still queued (and we removed unqueued it);   *   0 - if the futex_q was already removed by the waking thread   */  static int unqueue_me(struct futex_q *q) @@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);   * the pi_state owner as well as handle race conditions that may allow us to   * acquire the lock. Must be called with the hb lock held.   * - * Returns: - *  1 - success, lock taken - *  0 - success, lock not taken + * Return: + *  1 - success, lock taken; + *  0 - success, lock not taken;   * <0 - on error (-EFAULT)   */  static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) @@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,   * Return with the hb lock held and a q.key reference on success, and unlocked   * with no q.key reference on failure.   * - * Returns: - *  0 - uaddr contains val and hb has been locked + * Return: + *  0 - uaddr contains val and hb has been locked;   * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked   */  static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, @@ -2203,9 +2204,9 @@ pi_faulted:   * the wakeup and return the appropriate error code to the caller.  Must be   * called with the hb lock held.   * - * Returns - *  0 - no early wakeup detected - * <0 - -ETIMEDOUT or -ERESTARTNOINTR + * Return: + *  0 = no early wakeup detected; + * <0 = -ETIMEDOUT or -ERESTARTNOINTR   */  static inline  int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,   * @val:	the expected value of uaddr   * @abs_time:	absolute timeout   * @bitset:	32 bit wakeup bitset set by userspace, defaults to all - * @clockrt:	whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)   * @uaddr2:	the pi futex we will take prior to returning to user-space   *   * The caller will wait on uaddr and will be requeued by futex_requeue() to @@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,   * there was a need to.   *   * We call schedule in futex_wait_queue_me() when we enqueue and return there - * via the following: + * via the following--   * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()   * 2) wakeup on uaddr2 after a requeue   * 3) signal @@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,   *   * If 4 or 7, we cleanup and return with -ETIMEDOUT.   
* - * Returns: - *  0 - On success + * Return: + *  0 - On success;   * <0 - On error   */  static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3fe..14be27feda4 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -63,6 +63,7 @@  DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =  { +	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),  	.clock_base =  	{  		{ @@ -1642,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)  	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);  	int i; -	raw_spin_lock_init(&cpu_base->lock); -  	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {  		cpu_base->clock_base[i].cpu_base = cpu_base;  		timerqueue_init_head(&cpu_base->clock_base[i].active); diff --git a/kernel/kexec.c b/kernel/kexec.c index bddd3d7a74b..b574920cbd4 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -55,7 +55,7 @@ struct resource crashk_res = {  	.flags = IORESOURCE_BUSY | IORESOURCE_MEM  };  struct resource crashk_low_res = { -	.name  = "Crash kernel low", +	.name  = "Crash kernel",  	.start = 0,  	.end   = 0,  	.flags = IORESOURCE_BUSY | IORESOURCE_MEM @@ -1118,12 +1118,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,  {  	unsigned long addr; -	for (addr = begin; addr < end; addr += PAGE_SIZE) { -		ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); -		init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); -		free_page((unsigned long)__va(addr)); -		totalram_pages++; -	} +	for (addr = begin; addr < end; addr += PAGE_SIZE) +		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));  }  int crash_shrink_memory(unsigned long new_size) @@ -1368,35 +1364,114 @@ static int __init parse_crashkernel_simple(char 		*cmdline,  	return 0;  } +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW  1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { +	[SUFFIX_HIGH] = ",high", +	[SUFFIX_LOW]  = ",low", +	[SUFFIX_NULL] = NULL, +}; +  /* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. + * That function parses "suffix"  crashkernel command lines like + * + *	crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure.   
*/ +static int __init parse_crashkernel_suffix(char *cmdline, +					   unsigned long long	*crash_size, +					   unsigned long long	*crash_base, +					   const char *suffix) +{ +	char *cur = cmdline; + +	*crash_size = memparse(cmdline, &cur); +	if (cmdline == cur) { +		pr_warn("crashkernel: memory value expected\n"); +		return -EINVAL; +	} + +	/* check with suffix */ +	if (strncmp(cur, suffix, strlen(suffix))) { +		pr_warn("crashkernel: unrecognized char\n"); +		return -EINVAL; +	} +	cur += strlen(suffix); +	if (*cur != ' ' && *cur != '\0') { +		pr_warn("crashkernel: unrecognized char\n"); +		return -EINVAL; +	} + +	return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, +			     const char *name, +			     const char *suffix) +{ +	char *p = cmdline, *ck_cmdline = NULL; + +	/* find crashkernel and use the last one if there are more */ +	p = strstr(p, name); +	while (p) { +		char *end_p = strchr(p, ' '); +		char *q; + +		if (!end_p) +			end_p = p + strlen(p); + +		if (!suffix) { +			int i; + +			/* skip the one with any known suffix */ +			for (i = 0; suffix_tbl[i]; i++) { +				q = end_p - strlen(suffix_tbl[i]); +				if (!strncmp(q, suffix_tbl[i], +					     strlen(suffix_tbl[i]))) +					goto next; +			} +			ck_cmdline = p; +		} else { +			q = end_p - strlen(suffix); +			if (!strncmp(q, suffix, strlen(suffix))) +				ck_cmdline = p; +		} +next: +		p = strstr(p+1, name); +	} + +	if (!ck_cmdline) +		return NULL; + +	return ck_cmdline; +} +  static int __init __parse_crashkernel(char *cmdline,  			     unsigned long long system_ram,  			     unsigned long long *crash_size,  			     unsigned long long *crash_base, -				const char *name) +			     const char *name, +			     const char *suffix)  { -	char 	*p = cmdline, *ck_cmdline = NULL;  	char	*first_colon, *first_space; +	char	*ck_cmdline;  	BUG_ON(!crash_size || !crash_base);  	*crash_size = 0;  	*crash_base = 0; -	/* find crashkernel and use the last one if there are more */ -	p = strstr(p, name); -	while (p) { -		ck_cmdline = p; -		p = strstr(p+1, name); -	} +	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);  	if (!ck_cmdline)  		return -EINVAL;  	ck_cmdline += strlen(name); +	if (suffix) +		return parse_crashkernel_suffix(ck_cmdline, crash_size, +				crash_base, suffix);  	/*  	 * if the commandline contains a ':', then that's the extended  	 * syntax -- if not, it must be the classic syntax @@ -1413,13 +1488,26 @@ static int __init __parse_crashkernel(char *cmdline,  	return 0;  } +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. 
+ */  int __init parse_crashkernel(char *cmdline,  			     unsigned long long system_ram,  			     unsigned long long *crash_size,  			     unsigned long long *crash_base)  {  	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, -					"crashkernel="); +					"crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, +			     unsigned long long system_ram, +			     unsigned long long *crash_size, +			     unsigned long long *crash_base) +{ +	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, +				"crashkernel=", suffix_tbl[SUFFIX_HIGH]);  }  int __init parse_crashkernel_low(char *cmdline, @@ -1428,7 +1516,7 @@ int __init parse_crashkernel_low(char *cmdline,  			     unsigned long long *crash_base)  {  	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, -					"crashkernel_low="); +				"crashkernel=", suffix_tbl[SUFFIX_LOW]);  }  static void update_vmcoreinfo_note(void) @@ -1489,7 +1577,7 @@ static int __init crash_save_vmcoreinfo_init(void)  	VMCOREINFO_SYMBOL(swapper_pg_dir);  #endif  	VMCOREINFO_SYMBOL(_stext); -	VMCOREINFO_SYMBOL(vmlist); +	VMCOREINFO_SYMBOL(vmap_area_list);  #ifndef CONFIG_NEED_MULTIPLE_NODES  	VMCOREINFO_SYMBOL(mem_map); @@ -1527,7 +1615,8 @@ static int __init crash_save_vmcoreinfo_init(void)  	VMCOREINFO_OFFSET(free_area, free_list);  	VMCOREINFO_OFFSET(list_head, next);  	VMCOREINFO_OFFSET(list_head, prev); -	VMCOREINFO_OFFSET(vm_struct, addr); +	VMCOREINFO_OFFSET(vmap_area, va_start); +	VMCOREINFO_OFFSET(vmap_area, list);  	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);  	log_buf_kexec_setup();  	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e35be53f661..3fed7f0cbcd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -794,16 +794,16 @@ out:  }  #ifdef CONFIG_SYSCTL -/* This should be called with kprobe_mutex locked */  static void __kprobes optimize_all_kprobes(void)  {  	struct hlist_head *head;  	struct kprobe *p;  	unsigned int i; +	mutex_lock(&kprobe_mutex);  	/* If optimization is already allowed, just return */  	if (kprobes_allow_optimization) -		return; +		goto out;  	kprobes_allow_optimization = true;  	for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void)  				optimize_kprobe(p);  	}  	printk(KERN_INFO "Kprobes globally optimized\n"); +out: +	mutex_unlock(&kprobe_mutex);  } -/* This should be called with kprobe_mutex locked */  static void __kprobes unoptimize_all_kprobes(void)  {  	struct hlist_head *head;  	struct kprobe *p;  	unsigned int i; +	mutex_lock(&kprobe_mutex);  	/* If optimization is already prohibited, just return */ -	if (!kprobes_allow_optimization) +	if (!kprobes_allow_optimization) { +		mutex_unlock(&kprobe_mutex);  		return; +	}  	kprobes_allow_optimization = false;  	for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void)  				unoptimize_kprobe(p, false);  		}  	} +	mutex_unlock(&kprobe_mutex); +  	/* Wait for unoptimizing completion */  	wait_for_kprobe_optimizer();  	printk(KERN_INFO "Kprobes globally unoptimized\n");  } +static DEFINE_MUTEX(kprobe_sysctl_mutex);  int sysctl_kprobes_optimization;  int proc_kprobes_optimization_handler(struct ctl_table *table, int write,  				      void __user *buffer, size_t *length, @@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,  {  	int ret; -	mutex_lock(&kprobe_mutex); +	
mutex_lock(&kprobe_sysctl_mutex);  	sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,  		optimize_all_kprobes();  	else  		unoptimize_all_kprobes(); -	mutex_unlock(&kprobe_mutex); +	mutex_unlock(&kprobe_sysctl_mutex);  	return ret;  } diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9ba..9b12d65186f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,8 +52,21 @@ enum KTHREAD_BITS {  	KTHREAD_IS_PARKED,  }; -#define to_kthread(tsk)	\ -	container_of((tsk)->vfork_done, struct kthread, exited) +#define __to_kthread(vfork)	\ +	container_of(vfork, struct kthread, exited) + +static inline struct kthread *to_kthread(struct task_struct *k) +{ +	return __to_kthread(k->vfork_done); +} + +static struct kthread *to_live_kthread(struct task_struct *k) +{ +	struct completion *vfork = ACCESS_ONCE(k->vfork_done); +	if (likely(vfork)) +		return __to_kthread(vfork); +	return NULL; +}  /**   * kthread_should_stop - should this kthread return now? @@ -124,12 +137,12 @@ void *kthread_data(struct task_struct *task)  static void __kthread_parkme(struct kthread *self)  { -	__set_current_state(TASK_INTERRUPTIBLE); +	__set_current_state(TASK_PARKED);  	while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {  		if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))  			complete(&self->parked);  		schedule(); -		__set_current_state(TASK_INTERRUPTIBLE); +		__set_current_state(TASK_PARKED);  	}  	clear_bit(KTHREAD_IS_PARKED, &self->flags);  	__set_current_state(TASK_RUNNING); @@ -256,8 +269,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),  }  EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu) +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)  { +	/* Must have done schedule() in kthread() before we set_task_cpu */ +	if (!wait_task_inactive(p, state)) { +		WARN_ON(1); +		return; +	}  	/* It's safe because the task is inactive. */  	do_set_cpus_allowed(p, cpumask_of(cpu));  	p->flags |= PF_THREAD_BOUND; @@ -274,12 +292,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)   */  void kthread_bind(struct task_struct *p, unsigned int cpu)  { -	/* Must have done schedule() in kthread() before we set_task_cpu */ -	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { -		WARN_ON(1); -		return; -	} -	__kthread_bind(p, cpu); +	__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);  }  EXPORT_SYMBOL(kthread_bind); @@ -311,17 +324,20 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),  	return p;  } -static struct kthread *task_get_live_kthread(struct task_struct *k) +static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)  { -	struct kthread *kthread; - -	get_task_struct(k); -	kthread = to_kthread(k); -	/* It might have exited */ -	barrier(); -	if (k->vfork_done != NULL) -		return kthread; -	return NULL; +	clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); +	/* +	 * We clear the IS_PARKED bit here as we don't wait +	 * until the task has left the park code. So if we'd +	 * park before that happens we'd see the IS_PARKED bit +	 * which might be about to be cleared. 
+	 */ +	if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { +		if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) +			__kthread_bind(k, kthread->cpu, TASK_PARKED); +		wake_up_state(k, TASK_PARKED); +	}  }  /** @@ -334,23 +350,10 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)   */  void kthread_unpark(struct task_struct *k)  { -	struct kthread *kthread = task_get_live_kthread(k); +	struct kthread *kthread = to_live_kthread(k); -	if (kthread) { -		clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); -		/* -		 * We clear the IS_PARKED bit here as we don't wait -		 * until the task has left the park code. So if we'd -		 * park before that happens we'd see the IS_PARKED bit -		 * which might be about to be cleared. -		 */ -		if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { -			if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) -				__kthread_bind(k, kthread->cpu); -			wake_up_process(k); -		} -	} -	put_task_struct(k); +	if (kthread) +		__kthread_unpark(k, kthread);  }  /** @@ -367,7 +370,7 @@ void kthread_unpark(struct task_struct *k)   */  int kthread_park(struct task_struct *k)  { -	struct kthread *kthread = task_get_live_kthread(k); +	struct kthread *kthread = to_live_kthread(k);  	int ret = -ENOSYS;  	if (kthread) { @@ -380,7 +383,6 @@ int kthread_park(struct task_struct *k)  		}  		ret = 0;  	} -	put_task_struct(k);  	return ret;  } @@ -401,21 +403,23 @@ int kthread_park(struct task_struct *k)   */  int kthread_stop(struct task_struct *k)  { -	struct kthread *kthread = task_get_live_kthread(k); +	struct kthread *kthread;  	int ret;  	trace_sched_kthread_stop(k); + +	get_task_struct(k); +	kthread = to_live_kthread(k);  	if (kthread) {  		set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); -		clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); +		__kthread_unpark(k, kthread);  		wake_up_process(k);  		wait_for_completion(&kthread->exited);  	}  	ret = k->exit_code; -  	put_task_struct(k); -	trace_sched_kthread_stop_ret(ret); +	trace_sched_kthread_stop_ret(ret);  	return ret;  }  EXPORT_SYMBOL(kthread_stop); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 259db207b5d..6a3bccba7e7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)  unsigned long nr_stack_trace_entries;  static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static void print_lockdep_off(const char *bug_msg) +{ +	printk(KERN_DEBUG "%s\n", bug_msg); +	printk(KERN_DEBUG "turning off the locking correctness validator.\n"); +	printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); +} +  static int save_trace(struct stack_trace *trace)  {  	trace->nr_entries = 0; @@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)  		if (!debug_locks_off_graph_unlock())  			return 0; -		printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); -		printk("turning off the locking correctness validator.\n"); +		print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");  		dump_stack();  		return 0; @@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)  		}  		raw_local_irq_restore(flags); -		printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); -		printk("turning off the locking correctness validator.\n"); +		print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");  		dump_stack();  		return NULL;  	} @@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)  		if (!debug_locks_off_graph_unlock())  			return NULL; -		printk("BUG: MAX_LOCKDEP_ENTRIES too 
low!\n"); -		printk("turning off the locking correctness validator.\n"); +		print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");  		dump_stack();  		return NULL;  	} @@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,  	struct lock_class *class = hlock_class(hlock);  	struct list_head *hash_head = chainhashentry(chain_key);  	struct lock_chain *chain; -	struct held_lock *hlock_curr, *hlock_next; +	struct held_lock *hlock_curr;  	int i, j;  	/* @@ -2048,8 +2052,7 @@ cache_hit:  		if (!debug_locks_off_graph_unlock())  			return 0; -		printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); -		printk("turning off the locking correctness validator.\n"); +		print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");  		dump_stack();  		return 0;  	} @@ -2057,12 +2060,10 @@ cache_hit:  	chain->chain_key = chain_key;  	chain->irq_context = hlock->irq_context;  	/* Find the first held_lock of current chain */ -	hlock_next = hlock;  	for (i = curr->lockdep_depth - 1; i >= 0; i--) {  		hlock_curr = curr->held_locks + i; -		if (hlock_curr->irq_context != hlock_next->irq_context) +		if (hlock_curr->irq_context != hlock->irq_context)  			break; -		hlock_next = hlock;  	}  	i++;  	chain->depth = curr->lockdep_depth + 1 - i; @@ -3190,9 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  #endif  	if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {  		debug_locks_off(); -		printk("BUG: MAX_LOCK_DEPTH too low, depth: %i  max: %lu!\n", +		print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); +		printk(KERN_DEBUG "depth: %i  max: %lu!\n",  		       curr->lockdep_depth, MAX_LOCK_DEPTH); -		printk("turning off the locking correctness validator.\n");  		lockdep_print_held_locks(current);  		debug_show_all_locks(); @@ -4088,7 +4089,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)  }  EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(void) +static void print_held_locks_bug(struct task_struct *curr)  {  	if (!debug_locks_off())  		return; @@ -4097,21 +4098,22 @@ static void print_held_locks_bug(void)  	printk("\n");  	printk("=====================================\n"); -	printk("[ BUG: %s/%d still has locks held! ]\n", -	       current->comm, task_pid_nr(current)); +	printk("[ BUG: lock held at task exit time! ]\n");  	print_kernel_ident();  	printk("-------------------------------------\n"); -	lockdep_print_held_locks(current); +	printk("%s/%d is exiting with locks still held!\n", +		curr->comm, task_pid_nr(curr)); +	lockdep_print_held_locks(curr); +  	printk("\nstack backtrace:\n");  	dump_stack();  } -void debug_check_no_locks_held(void) +void debug_check_no_locks_held(struct task_struct *task)  { -	if (unlikely(current->lockdep_depth > 0)) -		print_held_locks_bug(); +	if (unlikely(task->lockdep_depth > 0)) +		print_held_locks_bug(task);  } -EXPORT_SYMBOL_GPL(debug_check_no_locks_held);  void debug_show_all_locks(void)  { diff --git a/kernel/mutex.c b/kernel/mutex.c index 52f23011b6e..ad53a664f11 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -37,6 +37,12 @@  # include <asm/mutex.h>  #endif +/* + * A negative mutex count indicates that waiters are sleeping waiting for the + * mutex. 
+ */ +#define	MUTEX_SHOW_NO_WAITER(mutex)	(atomic_read(&(mutex)->count) >= 0) +  void  __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)  { @@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)  	spin_lock_init(&lock->wait_lock);  	INIT_LIST_HEAD(&lock->wait_list);  	mutex_clear_owner(lock); +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +	lock->spin_mlock = NULL; +#endif  	debug_mutex_init(lock, name, key);  } @@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)  EXPORT_SYMBOL(mutex_lock);  #endif +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * In order to avoid a stampede of mutex spinners from acquiring the mutex + * more or less simultaneously, the spinners need to acquire a MCS lock + * first before spinning on the owner field. + * + * We don't inline mspin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +struct mspin_node { +	struct mspin_node *next ; +	int		  locked;	/* 1 if lock acquired */ +}; +#define	MLOCK(mutex)	((struct mspin_node **)&((mutex)->spin_mlock)) + +static noinline +void mspin_lock(struct mspin_node **lock, struct mspin_node *node) +{ +	struct mspin_node *prev; + +	/* Init node */ +	node->locked = 0; +	node->next   = NULL; + +	prev = xchg(lock, node); +	if (likely(prev == NULL)) { +		/* Lock acquired */ +		node->locked = 1; +		return; +	} +	ACCESS_ONCE(prev->next) = node; +	smp_wmb(); +	/* Wait until the lock holder passes the lock down */ +	while (!ACCESS_ONCE(node->locked)) +		arch_mutex_cpu_relax(); +} + +static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) +{ +	struct mspin_node *next = ACCESS_ONCE(node->next); + +	if (likely(!next)) { +		/* +		 * Release the lock by setting it to NULL +		 */ +		if (cmpxchg(lock, node, NULL) == node) +			return; +		/* Wait until the next pointer is set */ +		while (!(next = ACCESS_ONCE(node->next))) +			arch_mutex_cpu_relax(); +	} +	ACCESS_ONCE(next->locked) = 1; +	smp_wmb(); +} + +/* + * Mutex spinning code migrated from kernel/sched/core.c + */ + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ +	if (lock->owner != owner) +		return false; + +	/* +	 * Ensure we emit the owner->on_cpu, dereference _after_ checking +	 * lock->owner still matches owner, if that fails, owner might +	 * point to free()d memory, if it still matches, the rcu_read_lock() +	 * ensures the memory stays valid. +	 */ +	barrier(); + +	return owner->on_cpu; +} + +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +static noinline +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ +	rcu_read_lock(); +	while (owner_running(lock, owner)) { +		if (need_resched()) +			break; + +		arch_mutex_cpu_relax(); +	} +	rcu_read_unlock(); + +	/* +	 * We break out the loop above on need_resched() and when the +	 * owner changed, which is a sign for heavy contention. Return +	 * success only when lock->owner is NULL. +	 */ +	return lock->owner == NULL; +} + +/* + * Initial check for entering the mutex spinning loop + */ +static inline int mutex_can_spin_on_owner(struct mutex *lock) +{ +	int retval = 1; + +	rcu_read_lock(); +	if (lock->owner) +		retval = lock->owner->on_cpu; +	rcu_read_unlock(); +	/* +	 * if lock->owner is not set, the mutex owner may have just acquired +	 * it and not set the owner yet or the mutex has been released. 
+	 */ +	return retval; +} +#endif +  static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);  /** @@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  	 *  	 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock  	 * to serialize everything. +	 * +	 * The mutex spinners are queued up using MCS lock so that only one +	 * spinner can compete for the mutex. However, if mutex spinning isn't +	 * going to happen, there is no point in going through the lock/unlock +	 * overhead.  	 */ +	if (!mutex_can_spin_on_owner(lock)) +		goto slowpath;  	for (;;) {  		struct task_struct *owner; +		struct mspin_node  node;  		/*  		 * If there's an owner, wait for it to either  		 * release the lock or go to sleep.  		 */ +		mspin_lock(MLOCK(lock), &node);  		owner = ACCESS_ONCE(lock->owner); -		if (owner && !mutex_spin_on_owner(lock, owner)) +		if (owner && !mutex_spin_on_owner(lock, owner)) { +			mspin_unlock(MLOCK(lock), &node);  			break; +		} -		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { +		if ((atomic_read(&lock->count) == 1) && +		    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {  			lock_acquired(&lock->dep_map, ip);  			mutex_set_owner(lock); +			mspin_unlock(MLOCK(lock), &node);  			preempt_enable();  			return 0;  		} +		mspin_unlock(MLOCK(lock), &node);  		/*  		 * When there's no owner, we might have preempted between the @@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		 */  		arch_mutex_cpu_relax();  	} +slowpath:  #endif  	spin_lock_mutex(&lock->wait_lock, flags); @@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  	list_add_tail(&waiter.list, &lock->wait_list);  	waiter.task = task; -	if (atomic_xchg(&lock->count, -1) == 1) +	if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))  		goto done;  	lock_contended(&lock->dep_map, ip); @@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		 * that when we release the lock, we properly wake up the  		 * other waiters:  		 */ -		if (atomic_xchg(&lock->count, -1) == 1) +		if (MUTEX_SHOW_NO_WAITER(lock) && +		   (atomic_xchg(&lock->count, -1) == 1))  			break;  		/* diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c1c3dc1c602..bea15bdf82b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  	int nr;  	int rc;  	struct task_struct *task, *me = current; +	int init_pids = thread_group_leader(me) ? 1 : 2;  	/* Don't allow any more processes into the pid namespace */  	disable_pid_allocation(pid_ns); @@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  	 */  	for (;;) {  		set_current_state(TASK_UNINTERRUPTIBLE); -		if (pid_ns->nr_hashed == 1) +		if (pid_ns->nr_hashed == init_pids)  			break;  		schedule();  	} diff --git a/kernel/printk.c b/kernel/printk.c index 0b31715f335..abbdd9e2ac8 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -63,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)  
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */  #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ -DECLARE_WAIT_QUEUE_HEAD(log_wait); -  int console_printk[4] = {  	DEFAULT_CONSOLE_LOGLEVEL,	/* console_loglevel */  	DEFAULT_MESSAGE_LOGLEVEL,	/* default_message_loglevel */ @@ -224,6 +222,7 @@ struct log {  static DEFINE_RAW_SPINLOCK(logbuf_lock);  #ifdef CONFIG_PRINTK +DECLARE_WAIT_QUEUE_HEAD(log_wait);  /* the next printk record to read by syslog(READ) or /proc/kmsg */  static u64 syslog_seq;  static u32 syslog_idx; @@ -1957,45 +1956,6 @@ int is_console_locked(void)  	return console_locked;  } -/* - * Delayed printk version, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE		512 - -#define PRINTK_PENDING_WAKEUP	0x01 -#define PRINTK_PENDING_SCHED	0x02 - -static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); - -static void wake_up_klogd_work_func(struct irq_work *irq_work) -{ -	int pending = __this_cpu_xchg(printk_pending, 0); - -	if (pending & PRINTK_PENDING_SCHED) { -		char *buf = __get_cpu_var(printk_sched_buf); -		printk(KERN_WARNING "[sched_delayed] %s", buf); -	} - -	if (pending & PRINTK_PENDING_WAKEUP) -		wake_up_interruptible(&log_wait); -} - -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { -	.func = wake_up_klogd_work_func, -	.flags = IRQ_WORK_LAZY, -}; - -void wake_up_klogd(void) -{ -	preempt_disable(); -	if (waitqueue_active(&log_wait)) { -		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); -		irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); -	} -	preempt_enable(); -} -  static void console_cont_flush(char *text, size_t size)  {  	unsigned long flags; @@ -2458,6 +2418,44 @@ static int __init printk_late_init(void)  late_initcall(printk_late_init);  #if defined CONFIG_PRINTK +/* + * Delayed printk version, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE		512 + +#define PRINTK_PENDING_WAKEUP	0x01 +#define PRINTK_PENDING_SCHED	0x02 + +static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +static void wake_up_klogd_work_func(struct irq_work *irq_work) +{ +	int pending = __this_cpu_xchg(printk_pending, 0); + +	if (pending & PRINTK_PENDING_SCHED) { +		char *buf = __get_cpu_var(printk_sched_buf); +		printk(KERN_WARNING "[sched_delayed] %s", buf); +	} + +	if (pending & PRINTK_PENDING_WAKEUP) +		wake_up_interruptible(&log_wait); +} + +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { +	.func = wake_up_klogd_work_func, +	.flags = IRQ_WORK_LAZY, +}; + +void wake_up_klogd(void) +{ +	preempt_disable(); +	if (waitqueue_active(&log_wait)) { +		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); +		irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); +	} +	preempt_enable(); +}  int printk_sched(const char *fmt, ...)  { diff --git a/kernel/resource.c b/kernel/resource.c index 73f35d4b30b..d7386986e10 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -21,6 +21,7 @@  #include <linux/seq_file.h>  #include <linux/device.h>  #include <linux/pfn.h> +#include <linux/mm.h>  #include <asm/io.h> @@ -50,6 +51,14 @@ struct resource_constraint {  static DEFINE_RWLOCK(resource_lock); +/* + * For memory hotplug, there is no way to free resource entries allocated + * by boot mem after the system is up. So for reusing the resource entry + * we need to remember the resource. 
+ */ +static struct resource *bootmem_resource_free; +static DEFINE_SPINLOCK(bootmem_resource_lock); +  static void *r_next(struct seq_file *m, void *v, loff_t *pos)  {  	struct resource *p = v; @@ -151,6 +160,40 @@ __initcall(ioresources_init);  #endif /* CONFIG_PROC_FS */ +static void free_resource(struct resource *res) +{ +	if (!res) +		return; + +	if (!PageSlab(virt_to_head_page(res))) { +		spin_lock(&bootmem_resource_lock); +		res->sibling = bootmem_resource_free; +		bootmem_resource_free = res; +		spin_unlock(&bootmem_resource_lock); +	} else { +		kfree(res); +	} +} + +static struct resource *alloc_resource(gfp_t flags) +{ +	struct resource *res = NULL; + +	spin_lock(&bootmem_resource_lock); +	if (bootmem_resource_free) { +		res = bootmem_resource_free; +		bootmem_resource_free = res->sibling; +	} +	spin_unlock(&bootmem_resource_lock); + +	if (res) +		memset(res, 0, sizeof(struct resource)); +	else +		res = kzalloc(sizeof(struct resource), flags); + +	return res; +} +  /* Return the conflict entry if you can't request it */  static struct resource * __request_resource(struct resource *root, struct resource *new)  { @@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)  	write_unlock(&resource_lock);  } -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments.  Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. - */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) +static int __adjust_resource(struct resource *res, resource_size_t start, +				resource_size_t size)  {  	struct resource *tmp, *parent = res->parent;  	resource_size_t end = start + size - 1;  	int result = -EBUSY; -	write_lock(&resource_lock); -  	if (!parent)  		goto skip; @@ -751,6 +783,26 @@ skip:  	result = 0;   out: +	return result; +} + +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * + * Given an existing resource, change its start and size to match the + * arguments.  Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. 
+ */ +int adjust_resource(struct resource *res, resource_size_t start, +			resource_size_t size) +{ +	int result; + +	write_lock(&resource_lock); +	result = __adjust_resource(res, start, size);  	write_unlock(&resource_lock);  	return result;  } @@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root,  {  	struct resource *parent = root;  	struct resource *conflict; -	struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); +	struct resource *res = alloc_resource(GFP_ATOMIC);  	struct resource *next_res = NULL;  	if (!res) @@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root,  		/* conflict covered whole area */  		if (conflict->start <= res->start &&  				conflict->end >= res->end) { -			kfree(res); +			free_resource(res);  			WARN_ON(next_res);  			break;  		} @@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root,  			end = res->end;  			res->end = conflict->start - 1;  			if (conflict->end < end) { -				next_res = kzalloc(sizeof(*next_res), -						GFP_ATOMIC); +				next_res = alloc_resource(GFP_ATOMIC);  				if (!next_res) { -					kfree(res); +					free_resource(res);  					break;  				}  				next_res->name = name; @@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent,  				   const char *name, int flags)  {  	DECLARE_WAITQUEUE(wait, current); -	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); +	struct resource *res = alloc_resource(GFP_KERNEL);  	if (!res)  		return NULL; @@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent,  			continue;  		}  		/* Uhhuh, that didn't work out.. */ -		kfree(res); +		free_resource(res);  		res = NULL;  		break;  	} @@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start,  		return -EBUSY;  	release_resource(res); -	kfree(res); +	free_resource(res);  	return 0;  }  EXPORT_SYMBOL(__check_region); @@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start,  			write_unlock(&resource_lock);  			if (res->flags & IORESOURCE_MUXED)  				wake_up(&muxed_resource_wait); -			kfree(res); +			free_resource(res);  			return;  		}  		p = &res->sibling; @@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start,  }  EXPORT_SYMBOL(__release_region); +#ifdef CONFIG_MEMORY_HOTREMOVE +/** + * release_mem_region_adjustable - release a previously reserved memory region + * @parent: parent resource descriptor + * @start: resource start address + * @size: resource region size + * + * This interface is intended for memory hot-delete.  The requested region + * is released from a currently busy memory resource.  The requested region + * must either match exactly or fit into a single busy resource entry.  In + * the latter case, the remaining resource is adjusted accordingly. + * Existing children of the busy memory resource must be immutable in the + * request. + * + * Note: + * - Additional release conditions, such as overlapping region, can be + *   supported after they are confirmed as valid cases. + * - When a busy memory resource gets split into two entries, the code + *   assumes that all children remain in the lower address entry for + *   simplicity.  Enhance this logic when necessary. 
+ */ +int release_mem_region_adjustable(struct resource *parent, +			resource_size_t start, resource_size_t size) +{ +	struct resource **p; +	struct resource *res; +	struct resource *new_res; +	resource_size_t end; +	int ret = -EINVAL; + +	end = start + size - 1; +	if ((start < parent->start) || (end > parent->end)) +		return ret; + +	/* The alloc_resource() result gets checked later */ +	new_res = alloc_resource(GFP_KERNEL); + +	p = &parent->child; +	write_lock(&resource_lock); + +	while ((res = *p)) { +		if (res->start >= end) +			break; + +		/* look for the next resource if it does not fit into */ +		if (res->start > start || res->end < end) { +			p = &res->sibling; +			continue; +		} + +		if (!(res->flags & IORESOURCE_MEM)) +			break; + +		if (!(res->flags & IORESOURCE_BUSY)) { +			p = &res->child; +			continue; +		} + +		/* found the target resource; let's adjust accordingly */ +		if (res->start == start && res->end == end) { +			/* free the whole entry */ +			*p = res->sibling; +			free_resource(res); +			ret = 0; +		} else if (res->start == start && res->end != end) { +			/* adjust the start */ +			ret = __adjust_resource(res, end + 1, +						res->end - end); +		} else if (res->start != start && res->end == end) { +			/* adjust the end */ +			ret = __adjust_resource(res, res->start, +						start - res->start); +		} else { +			/* split into two entries */ +			if (!new_res) { +				ret = -ENOMEM; +				break; +			} +			new_res->name = res->name; +			new_res->start = end + 1; +			new_res->end = res->end; +			new_res->flags = res->flags; +			new_res->parent = res->parent; +			new_res->sibling = res->sibling; +			new_res->child = NULL; + +			ret = __adjust_resource(res, res->start, +						start - res->start); +			if (ret) +				break; +			res->sibling = new_res; +			new_res = NULL; +		} + +		break; +	} + +	write_unlock(&resource_lock); +	free_resource(new_res); +	return ret; +} +#endif	/* CONFIG_MEMORY_HOTREMOVE */ +  /*   * Managed region resource   */ diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 7890b10084a..1d96dd0d93c 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -14,6 +14,7 @@  #include <linux/spinlock.h>  #include <linux/timer.h>  #include <linux/freezer.h> +#include <linux/stat.h>  #include "rtmutex.h" @@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at  	return curr - buf;  } -static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); -static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); +static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); +static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);  static struct bus_type rttest_subsys = {  	.name = "rttest", diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c685e31492d..c3ae1446461 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)  	u64 this_clock, remote_clock;  	u64 *ptr, old_val, val; +#if BITS_PER_LONG != 64 +again: +	/* +	 * Careful here: The local and the remote clock values need to +	 * be read out atomic as we need to compare the values and +	 * then update either the local or the remote side. So the +	 * cmpxchg64 below only protects one readout. +	 * +	 * We must reread via sched_clock_local() in the retry case on +	 * 32bit as an NMI could use sched_clock_local() via the +	 * tracer and hit between the readout of +	 * the low32bit and the high 32bit portion. 
+	 */ +	this_clock = sched_clock_local(my_scd); +	/* +	 * We must enforce atomic readout on 32bit, otherwise the +	 * update on the remote cpu can hit inbetween the readout of +	 * the low32bit and the high 32bit portion. +	 */ +	remote_clock = cmpxchg64(&scd->clock, 0, 0); +#else +	/* +	 * On 64bit the read of [my]scd->clock is atomic versus the +	 * update, so we can avoid the above 32bit dance. +	 */  	sched_clock_local(my_scd);  again:  	this_clock = my_scd->clock;  	remote_clock = scd->clock; +#endif  	/*  	 * Use the opportunity that we have both locks diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393..42053547e0f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1498,8 +1498,10 @@ static void try_to_wake_up_local(struct task_struct *p)  {  	struct rq *rq = task_rq(p); -	BUG_ON(rq != this_rq()); -	BUG_ON(p == current); +	if (WARN_ON_ONCE(rq != this_rq()) || +	    WARN_ON_ONCE(p == current)) +		return; +  	lockdep_assert_held(&rq->lock);  	if (!raw_spin_trylock(&p->pi_lock)) { @@ -2997,51 +2999,6 @@ void __sched schedule_preempt_disabled(void)  	preempt_disable();  } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ -	if (lock->owner != owner) -		return false; - -	/* -	 * Ensure we emit the owner->on_cpu, dereference _after_ checking -	 * lock->owner still matches owner, if that fails, owner might -	 * point to free()d memory, if it still matches, the rcu_read_lock() -	 * ensures the memory stays valid. -	 */ -	barrier(); - -	return owner->on_cpu; -} - -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ -	if (!sched_feat(OWNER_SPIN)) -		return 0; - -	rcu_read_lock(); -	while (owner_running(lock, owner)) { -		if (need_resched()) -			break; - -		arch_mutex_cpu_relax(); -	} -	rcu_read_unlock(); - -	/* -	 * We break out the loop above on need_resched() and when the -	 * owner changed, which is a sign for heavy contention. Return -	 * success only when lock->owner is NULL. -	 */ -	return lock->owner == NULL; -} -#endif -  #ifdef CONFIG_PREEMPT  /*   * this is the entry point to schedule() from in-kernel preemption @@ -4999,7 +4956,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)  }  static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1;  static void  set_table_entry(struct ctl_table *entry, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ed12cbb135f..e93cca92f38 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)  	t = tsk;  	do { -		task_cputime(tsk, &utime, &stime); +		task_cputime(t, &utime, &stime);  		times->utime += utime;  		times->stime += stime;  		times->sum_exec_runtime += task_sched_runtime(t); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b5395..99399f8e479 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)  SCHED_FEAT(LB_BIAS, true)  /* - * Spin-wait on mutex acquisition when the mutex owner is running on - * another cpu -- assumes that when the owner is running, it will soon - * release the lock. Decreases scheduling overhead. 
- */ -SCHED_FEAT(OWNER_SPIN, true) - -/*   * Decrement CPU power based on time not spent running tasks   */  SCHED_FEAT(NONTASK_POWER, true) diff --git a/kernel/signal.c b/kernel/signal.c index 2ec870a4c3c..598dc06be42 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)  		if (force_default || ka->sa.sa_handler != SIG_IGN)  			ka->sa.sa_handler = SIG_DFL;  		ka->sa.sa_flags = 0; +#ifdef __ARCH_HAS_SA_RESTORER +		ka->sa.sa_restorer = NULL; +#endif  		sigemptyset(&ka->sa.sa_mask);  		ka++;  	} @@ -2682,7 +2685,7 @@ static int do_sigpending(void *set, unsigned long sigsetsize)  /**   *  sys_rt_sigpending - examine a pending signal that has been raised   *			while blocked - *  @set: stores pending signals + *  @uset: stores pending signals   *  @sigsetsize: size of sigset_t type or larger   */  SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) @@ -2945,7 +2948,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)  static int do_tkill(pid_t tgid, pid_t pid, int sig)  { -	struct siginfo info; +	struct siginfo info = {};  	info.si_signo = sig;  	info.si_errno = 0; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index b9bde572782..02fc5c93367 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -131,7 +131,7 @@ static int smpboot_thread_fn(void *data)  			continue;  		} -		//BUG_ON(td->cpu != smp_processor_id()); +		BUG_ON(td->cpu != smp_processor_id());  		/* Check for state change setup */  		switch (td->status) { @@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)  	}  	get_task_struct(tsk);  	*per_cpu_ptr(ht->store, cpu) = tsk; -	if (ht->create) -		ht->create(cpu); +	if (ht->create) { +		/* +		 * Make sure that the task has actually scheduled out +		 * into park position, before calling the create +		 * callback. At least the migration thread callback +		 * requires that the task is off the runqueue. 
+		 */ +		if (!wait_task_inactive(tsk, TASK_PARKED)) +			WARN_ON(1); +		else +			ht->create(cpu); +	}  	return 0;  } @@ -209,6 +219,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp  {  	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); +	if (ht->pre_unpark) +		ht->pre_unpark(cpu);  	kthread_unpark(tsk);  } diff --git a/kernel/softirq.c b/kernel/softirq.c index b4d252fd195..14d7758074a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -323,18 +323,10 @@ void irq_enter(void)  static inline void invoke_softirq(void)  { -	if (!force_irqthreads) { -#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED +	if (!force_irqthreads)  		__do_softirq(); -#else -		do_softirq(); -#endif -	} else { -		__local_bh_disable((unsigned long)__builtin_return_address(0), -				SOFTIRQ_OFFSET); +	else  		wakeup_softirqd(); -		__local_bh_enable(SOFTIRQ_OFFSET); -	}  }  /* @@ -342,9 +334,15 @@ static inline void invoke_softirq(void)   */  void irq_exit(void)  { +#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED +	local_irq_disable(); +#else +	WARN_ON_ONCE(!irqs_disabled()); +#endif +  	account_irq_exit_time(current);  	trace_hardirq_exit(); -	sub_preempt_count(IRQ_EXIT_OFFSET); +	sub_preempt_count(HARDIRQ_OFFSET);  	if (!in_interrupt() && local_softirq_pending())  		invoke_softirq(); @@ -354,7 +352,6 @@ void irq_exit(void)  		tick_nohz_irq_exit();  #endif  	rcu_irq_exit(); -	sched_preempt_enable_no_resched();  }  /* diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 95d178c62d5..c09f2955ae3 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -336,7 +336,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {  	.create			= cpu_stop_create,  	.setup			= cpu_stop_unpark,  	.park			= cpu_stop_park, -	.unpark			= cpu_stop_unpark, +	.pre_unpark		= cpu_stop_unpark,  	.selfparking		= true,  }; diff --git a/kernel/sys.c b/kernel/sys.c index 81f56445fba..0da73cf73e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -324,7 +324,6 @@ void kernel_restart_prepare(char *cmd)  	system_state = SYSTEM_RESTART;  	usermodehelper_disable();  	device_shutdown(); -	syscore_shutdown();  }  /** @@ -370,6 +369,7 @@ void kernel_restart(char *cmd)  {  	kernel_restart_prepare(cmd);  	disable_nonboot_cpus(); +	syscore_shutdown();  	if (!cmd)  		printk(KERN_EMERG "Restarting system.\n");  	else @@ -395,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state)  void kernel_halt(void)  {  	kernel_shutdown_prepare(SYSTEM_HALT); +	disable_nonboot_cpus();  	syscore_shutdown();  	printk(KERN_EMERG "System halted.\n");  	kmsg_dump(KMSG_DUMP_HALT); @@ -2185,9 +2186,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,  char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static int __orderly_poweroff(void) +static int __orderly_poweroff(bool force)  { -	int argc;  	char **argv;  	static char *envp[] = {  		"HOME=/", @@ -2196,20 +2196,40 @@ static int __orderly_poweroff(void)  	};  	int ret; -	argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); -	if (argv == NULL) { +	argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); +	if (argv) { +		ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); +		argv_free(argv); +	} else {  		printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", -		       __func__, poweroff_cmd); -		return -ENOMEM; +					 __func__, poweroff_cmd); +		ret = -ENOMEM;  	} -	ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, -				      NULL, NULL, NULL); -	argv_free(argv); +	if (ret && force) { +		printk(KERN_WARNING 
"Failed to start orderly shutdown: " +					"forcing the issue\n"); +		/* +		 * I guess this should try to kick off some daemon to sync and +		 * poweroff asap.  Or not even bother syncing if we're doing an +		 * emergency shutdown? +		 */ +		emergency_sync(); +		kernel_power_off(); +	}  	return ret;  } +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ +	__orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); +  /**   * orderly_poweroff - Trigger an orderly system poweroff   * @force: force poweroff if command execution fails @@ -2219,21 +2239,9 @@ static int __orderly_poweroff(void)   */  int orderly_poweroff(bool force)  { -	int ret = __orderly_poweroff(); - -	if (ret && force) { -		printk(KERN_WARNING "Failed to start orderly shutdown: " -		       "forcing the issue\n"); - -		/* -		 * I guess this should try to kick off some daemon to sync and -		 * poweroff asap.  Or not even bother syncing if we're doing an -		 * emergency shutdown? -		 */ -		emergency_sync(); -		kernel_power_off(); -	} - -	return ret; +	if (force) /* do not override the pending "true" */ +		poweroff_force = true; +	schedule_work(&poweroff_work); +	return 0;  }  EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index afc1dc60f3f..9edcf456e0f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit;  #endif  extern int pid_max;  extern int pid_max_min, pid_max_max; -extern int sysctl_drop_caches;  extern int percpu_pagelist_fraction;  extern int compat_log;  extern int latencytop_enabled; @@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = {  		.extra2		= &one,  	},  #endif +	{ +		.procname	= "user_reserve_kbytes", +		.data		= &sysctl_user_reserve_kbytes, +		.maxlen		= sizeof(sysctl_user_reserve_kbytes), +		.mode		= 0644, +		.proc_handler	= proc_doulongvec_minmax, +	}, +	{ +		.procname	= "admin_reserve_kbytes", +		.data		= &sysctl_admin_reserve_kbytes, +		.maxlen		= sizeof(sysctl_admin_reserve_kbytes), +		.mode		= 0644, +		.proc_handler	= proc_doulongvec_minmax, +	},  	{ }  }; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2fb8cb88df8..7f32fe0e52c 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)   */  int tick_check_broadcast_device(struct clock_event_device *dev)  { -	if ((tick_broadcast_device.evtdev && +	if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || +	    (tick_broadcast_device.evtdev &&  	     tick_broadcast_device.evtdev->rating >= dev->rating) ||  	     (dev->features & CLOCK_EVT_FEAT_C3STOP))  		return 0; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 192473b2279..5e9efd4b83a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -176,6 +176,8 @@ config IRQSOFF_TRACER  	select GENERIC_TRACER  	select TRACER_MAX_TRACE  	select RING_BUFFER_ALLOW_SWAP +	select TRACER_SNAPSHOT +	select TRACER_SNAPSHOT_PER_CPU_SWAP  	help  	  This option measures the time spent in irqs-off critical  	  sections, with microsecond accuracy. @@ -198,6 +200,8 @@ config PREEMPT_TRACER  	select GENERIC_TRACER  	select TRACER_MAX_TRACE  	select RING_BUFFER_ALLOW_SWAP +	select TRACER_SNAPSHOT +	select TRACER_SNAPSHOT_PER_CPU_SWAP  	help  	  This option measures the time spent in preemption-off critical  	  sections, with microsecond accuracy. 
@@ -217,6 +221,7 @@ config SCHED_TRACER
 	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select TRACER_MAX_TRACE
+	select TRACER_SNAPSHOT
 	help
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
@@ -248,6 +253,27 @@ config TRACER_SNAPSHOT
 	      echo 1 > /sys/kernel/debug/tracing/snapshot
 	      cat snapshot
+config TRACER_SNAPSHOT_PER_CPU_SWAP
+        bool "Allow snapshot to swap per CPU"
+	depends on TRACER_SNAPSHOT
+	select RING_BUFFER_ALLOW_SWAP
+	help
+	  Allow doing a snapshot of a single CPU buffer instead of a
+	  full swap (all buffers). If this is set, then the following is
+	  allowed:
+
+	      echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
+
+	  After which, only the tracing buffer for CPU 2 will be swapped with
+	  the main tracing buffer, and the other CPU buffers remain the same.
+
+	  When this is enabled, it adds a little more overhead to the
+	  trace recording, as it needs to add some checks to synchronize
+	  recording with swaps. But this does not affect the performance
+	  of the overall system. This is enabled by default when the preempt
+	  or irq latency tracers are enabled, as those need to swap as well
+	  and already add the overhead (plus a lot more).
+
 config TRACE_BRANCH_PROFILING
 	bool
 	select GENERIC_TRACER
@@ -414,24 +440,28 @@ config PROBE_EVENTS
 	def_bool n
 config DYNAMIC_FTRACE
-	bool "enable/disable ftrace tracepoints dynamically"
+	bool "enable/disable function tracing dynamically"
 	depends on FUNCTION_TRACER
 	depends on HAVE_DYNAMIC_FTRACE
 	default y
 	help
-          This option will modify all the calls to ftrace dynamically
-	  (will patch them out of the binary image and replace them
-	  with a No-Op instruction) as they are called. A table is
-	  created to dynamically enable them again.
+	  This option will modify all the calls to function tracing
+	  dynamically (will patch them out of the binary image and
+	  replace them with a No-Op instruction) on boot up. During
+	  compile time, a table is made of all the locations that ftrace
+	  can function trace, and this table is linked into the kernel
+	  image. When this is enabled, functions can be individually
+	  enabled, and the functions not enabled will not affect
+	  performance of the system.
+
+	  See the files in /sys/kernel/debug/tracing:
+	    available_filter_functions
+	    set_ftrace_filter
+	    set_ftrace_notrace
 	  This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
 	  otherwise has native performance as long as no tracing is active.
-	  The changes to the code are done by a kernel thread that
-	  wakes up once a second and checks to see if any ftrace calls
-	  were made. If so, it runs stop_machine (stops all CPUS)
-	  and modifies the code to jump over the call to ftrace.
-
 config DYNAMIC_FTRACE_WITH_REGS
 	def_bool y
 	depends on DYNAMIC_FTRACE
@@ -520,6 +550,29 @@ config RING_BUFFER_BENCHMARK
 	  If unsure, say N.
+config RING_BUFFER_STARTUP_TEST
+       bool "Ring buffer startup self test"
+       depends on RING_BUFFER
+       help
+         Run a simple self test on the ring buffer on boot up. Late in the
+	 kernel boot sequence, the test will start and kick off
+	 a thread per cpu. Each thread will write various size events
+	 into the ring buffer. Another thread is created to send IPIs
+	 to each of the threads, where the IPI handler will also write
+	 to the ring buffer, to test/stress the nesting ability. 
+	 If any anomalies are discovered, a warning will be displayed +	 and all ring buffers will be disabled. + +	 The test runs for 10 seconds. This will slow your boot time +	 by at least 10 more seconds. + +	 At the end of the test, statics and more checks are done. +	 It will output the stats of each per cpu buffer. What +	 was written, the sizes, what was read, what was lost, and +	 other similar details. + +	 If unsure, say N +  endif # FTRACE  endif # TRACING_SUPPORT diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9e5b8c272ee..ed58a3216a6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,  	bool blk_tracer = blk_tracer_enabled;  	if (blk_tracer) { -		buffer = blk_tr->buffer; +		buffer = blk_tr->trace_buffer.buffer;  		pc = preempt_count();  		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,  						  sizeof(*t) + len, @@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,  	if (blk_tracer) {  		tracing_record_cmdline(current); -		buffer = blk_tr->buffer; +		buffer = blk_tr->trace_buffer.buffer;  		pc = preempt_count();  		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,  						  sizeof(*t) + pdu_len, @@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,  				      struct request_queue *q,  				      struct request *rq)  { -	struct blk_trace *bt = q->blk_trace; - -	/* if control ever passes through here, it's a request based driver */ -	if (unlikely(bt && !bt->rq_based)) -		bt->rq_based = true; -  	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);  } @@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,  	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);  } -static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) +static void blk_add_trace_bio_complete(void *ignore, +				       struct request_queue *q, struct bio *bio, +				       int error)  { -	struct request_queue *q; -	struct blk_trace *bt; - -	if (!bio->bi_bdev) -		return; - -	q = bdev_get_queue(bio->bi_bdev); -	bt = q->blk_trace; - -	/* -	 * Request based drivers will generate both rq and bio completions. -	 * Ignore bio ones. 
-	 */ -	if (likely(!bt) || bt->rq_based) -		return; -  	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);  } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ab25b88aae5..8a5c017bb50 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -66,7 +66,7 @@  static struct ftrace_ops ftrace_list_end __read_mostly = {  	.func		= ftrace_stub, -	.flags		= FTRACE_OPS_FL_RECURSION_SAFE, +	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,  };  /* ftrace_enabled is a method to turn ftrace on or off */ @@ -486,7 +486,6 @@ struct ftrace_profile_stat {  #define PROFILES_PER_PAGE					\  	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits __read_mostly;  static int ftrace_profile_enabled __read_mostly;  /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ @@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);  static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); -#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ +#define FTRACE_PROFILE_HASH_BITS 10 +#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)  static void *  function_stat_next(void *v, int idx) @@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)  	pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); -	for (i = 0; i < pages; i++) { +	for (i = 1; i < pages; i++) {  		pg->next = (void *)get_zeroed_page(GFP_KERNEL);  		if (!pg->next)  			goto out_free; @@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)  		free_page(tmp);  	} -	free_page((unsigned long)stat->pages);  	stat->pages = NULL;  	stat->start = NULL; @@ -725,13 +724,6 @@ static int ftrace_profile_init_cpu(int cpu)  	if (!stat->hash)  		return -ENOMEM; -	if (!ftrace_profile_bits) { -		size--; - -		for (; size; size >>= 1) -			ftrace_profile_bits++; -	} -  	/* Preallocate the function profiling pages */  	if (ftrace_profile_pages_init(stat) < 0) {  		kfree(stat->hash); @@ -764,7 +756,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)  	struct hlist_head *hhd;  	unsigned long key; -	key = hash_long(ip, ftrace_profile_bits); +	key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);  	hhd = &stat->hash[key];  	if (hlist_empty(hhd)) @@ -783,7 +775,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,  {  	unsigned long key; -	key = hash_long(rec->ip, ftrace_profile_bits); +	key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);  	hlist_add_head_rcu(&rec->node, &stat->hash[key]);  } @@ -1053,6 +1045,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)  static struct pid * const ftrace_swapper_pid = &init_struct_pid; +loff_t +ftrace_filter_lseek(struct file *file, loff_t offset, int whence) +{ +	loff_t ret; + +	if (file->f_mode & FMODE_READ) +		ret = seq_lseek(file, offset, whence); +	else +		file->f_pos = ret = 1; + +	return ret; +} +  #ifdef CONFIG_DYNAMIC_FTRACE  #ifndef CONFIG_FTRACE_MCOUNT_RECORD @@ -1067,7 +1072,7 @@ struct ftrace_func_probe {  	unsigned long		flags;  	unsigned long		ip;  	void			*data; -	struct rcu_head		rcu; +	struct list_head	free_list;  };  struct ftrace_func_entry { @@ -1317,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,  	struct hlist_head *hhd;  	struct ftrace_hash *old_hash;  	struct ftrace_hash *new_hash; -	unsigned long key;  	int size = src->count;  	int bits = 0;  	int ret; @@ -1360,10 +1364,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,  	for (i = 0; i < size; i++) {  		
hhd = &src->buckets[i];  		hlist_for_each_entry_safe(entry, tn, hhd, hlist) { -			if (bits > 0) -				key = hash_long(entry->ip, bits); -			else -				key = 0;  			remove_hash_entry(src, entry);  			__add_hash_entry(new_hash, entry);  		} @@ -2613,7 +2613,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)   * routine, you can use ftrace_filter_write() for the write   * routine if @flag has FTRACE_ITER_FILTER set, or   * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_regex_lseek() should be used as the lseek routine, and + * ftrace_filter_lseek() should be used as the lseek routine, and   * release must call ftrace_regex_release().   */  int @@ -2697,19 +2697,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)  				 inode, file);  } -loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int whence) -{ -	loff_t ret; - -	if (file->f_mode & FMODE_READ) -		ret = seq_lseek(file, offset, whence); -	else -		file->f_pos = ret = 1; - -	return ret; -} -  static int ftrace_match(char *str, char *regex, int len, int type)  {  	int matched = 0; @@ -2974,28 +2961,27 @@ static void __disable_ftrace_function_probe(void)  } -static void ftrace_free_entry_rcu(struct rcu_head *rhp) +static void ftrace_free_entry(struct ftrace_func_probe *entry)  { -	struct ftrace_func_probe *entry = -		container_of(rhp, struct ftrace_func_probe, rcu); -  	if (entry->ops->free) -		entry->ops->free(&entry->data); +		entry->ops->free(entry->ops, entry->ip, &entry->data);  	kfree(entry);  } -  int  register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  			      void *data)  {  	struct ftrace_func_probe *entry; +	struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; +	struct ftrace_hash *hash;  	struct ftrace_page *pg;  	struct dyn_ftrace *rec;  	int type, len, not;  	unsigned long key;  	int count = 0;  	char *search; +	int ret;  	type = filter_parse_regex(glob, strlen(glob), &search, ¬);  	len = strlen(search); @@ -3006,8 +2992,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  	mutex_lock(&ftrace_lock); -	if (unlikely(ftrace_disabled)) +	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); +	if (!hash) { +		count = -ENOMEM; +		goto out_unlock; +	} + +	if (unlikely(ftrace_disabled)) { +		count = -ENODEV;  		goto out_unlock; +	}  	do_for_each_ftrace_rec(pg, rec) { @@ -3031,14 +3025,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  		 * for each function we find. We call the callback  		 * to give the caller an opportunity to do so.  		 
*/ -		if (ops->callback) { -			if (ops->callback(rec->ip, &entry->data) < 0) { +		if (ops->init) { +			if (ops->init(ops, rec->ip, &entry->data) < 0) {  				/* caller does not like this func */  				kfree(entry);  				continue;  			}  		} +		ret = enter_record(hash, rec, 0); +		if (ret < 0) { +			kfree(entry); +			count = ret; +			goto out_unlock; +		} +  		entry->ops = ops;  		entry->ip = rec->ip; @@ -3046,10 +3047,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  		hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);  	} while_for_each_ftrace_rec(); + +	ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); +	if (ret < 0) +		count = ret; +  	__enable_ftrace_function_probe();   out_unlock:  	mutex_unlock(&ftrace_lock); +	free_ftrace_hash(hash);  	return count;  } @@ -3063,7 +3070,12 @@ static void  __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  				  void *data, int flags)  { +	struct ftrace_func_entry *rec_entry;  	struct ftrace_func_probe *entry; +	struct ftrace_func_probe *p; +	struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; +	struct list_head free_list; +	struct ftrace_hash *hash;  	struct hlist_node *tmp;  	char str[KSYM_SYMBOL_LEN];  	int type = MATCH_FULL; @@ -3084,6 +3096,14 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  	}  	mutex_lock(&ftrace_lock); + +	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); +	if (!hash) +		/* Hmm, should report this somehow */ +		goto out_unlock; + +	INIT_LIST_HEAD(&free_list); +  	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {  		struct hlist_head *hhd = &ftrace_func_hash[i]; @@ -3104,12 +3124,30 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  					continue;  			} -			hlist_del(&entry->node); -			call_rcu(&entry->rcu, ftrace_free_entry_rcu); +			rec_entry = ftrace_lookup_ip(hash, entry->ip); +			/* It is possible more than one entry had this ip */ +			if (rec_entry) +				free_hash_entry(hash, rec_entry); + +			hlist_del_rcu(&entry->node); +			list_add(&entry->free_list, &free_list);  		}  	}  	__disable_ftrace_function_probe(); +	/* +	 * Remove after the disable is called. Otherwise, if the last +	 * probe is removed, a null hash means *all enabled*. 
+	 */ +	ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); +	synchronize_sched(); +	list_for_each_entry_safe(entry, p, &free_list, free_list) { +		list_del(&entry->free_list); +		ftrace_free_entry(entry); +	} +		 + out_unlock:  	mutex_unlock(&ftrace_lock); +	free_ftrace_hash(hash);  }  void @@ -3441,14 +3479,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;  static int __init set_ftrace_notrace(char *str)  { -	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); +	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);  	return 1;  }  __setup("ftrace_notrace=", set_ftrace_notrace);  static int __init set_ftrace_filter(char *str)  { -	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); +	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);  	return 1;  }  __setup("ftrace_filter=", set_ftrace_filter); @@ -3571,7 +3609,7 @@ static const struct file_operations ftrace_filter_fops = {  	.open = ftrace_filter_open,  	.read = seq_read,  	.write = ftrace_filter_write, -	.llseek = ftrace_regex_lseek, +	.llseek = ftrace_filter_lseek,  	.release = ftrace_regex_release,  }; @@ -3579,7 +3617,7 @@ static const struct file_operations ftrace_notrace_fops = {  	.open = ftrace_notrace_open,  	.read = seq_read,  	.write = ftrace_notrace_write, -	.llseek = ftrace_regex_lseek, +	.llseek = ftrace_filter_lseek,  	.release = ftrace_regex_release,  }; @@ -3737,7 +3775,8 @@ out:  	if (fail)  		return -EINVAL; -	ftrace_graph_filter_enabled = 1; +	ftrace_graph_filter_enabled = !!(*idx); +  	return 0;  } @@ -3784,8 +3823,8 @@ static const struct file_operations ftrace_graph_fops = {  	.open		= ftrace_graph_open,  	.read		= seq_read,  	.write		= ftrace_graph_write, +	.llseek		= ftrace_filter_lseek,  	.release	= ftrace_graph_release, -	.llseek		= seq_lseek,  };  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -4131,7 +4170,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,  	preempt_disable_notrace();  	trace_recursion_set(TRACE_CONTROL_BIT);  	do_for_each_ftrace_op(op, ftrace_control_list) { -		if (!ftrace_function_local_disabled(op) && +		if (!(op->flags & FTRACE_OPS_FL_STUB) && +		    !ftrace_function_local_disabled(op) &&  		    ftrace_ops_test(op, ip))  			op->func(ip, parent_ip, op, regs);  	} while_for_each_ftrace_op(op); @@ -4439,7 +4479,7 @@ static const struct file_operations ftrace_pid_fops = {  	.open		= ftrace_pid_open,  	.write		= ftrace_pid_write,  	.read		= seq_read, -	.llseek		= seq_lseek, +	.llseek		= ftrace_filter_lseek,  	.release	= ftrace_pid_release,  }; @@ -4555,12 +4595,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,  		ftrace_startup_sysctl();  		/* we are starting ftrace again */ -		if (ftrace_ops_list != &ftrace_list_end) { -			if (ftrace_ops_list->next == &ftrace_list_end) -				ftrace_trace_function = ftrace_ops_list->func; -			else -				ftrace_trace_function = ftrace_ops_list_func; -		} +		if (ftrace_ops_list != &ftrace_list_end) +			update_ftrace_function();  	} else {  		/* stopping ftrace calls (just send to ftrace_stub) */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6989df2ba19..b59aea2c48c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,13 +8,16 @@  #include <linux/trace_clock.h>  #include <linux/trace_seq.h>  #include <linux/spinlock.h> +#include <linux/irq_work.h>  #include <linux/debugfs.h>  #include <linux/uaccess.h>  #include <linux/hardirq.h> +#include <linux/kthread.h>	/* for self test */  #include <linux/kmemcheck.h>  #include <linux/module.h>  #include 
<linux/percpu.h>  #include <linux/mutex.h> +#include <linux/delay.h>  #include <linux/slab.h>  #include <linux/init.h>  #include <linux/hash.h> @@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)  	return ret;  } +struct rb_irq_work { +	struct irq_work			work; +	wait_queue_head_t		waiters; +	bool				waiters_pending; +}; +  /*   * head_page == tail_page && head == tail then buffer is empty.   */ @@ -478,6 +487,8 @@ struct ring_buffer_per_cpu {  	struct list_head		new_pages; /* new pages to add */  	struct work_struct		update_pages_work;  	struct completion		update_done; + +	struct rb_irq_work		irq_work;  };  struct ring_buffer { @@ -497,6 +508,8 @@ struct ring_buffer {  	struct notifier_block		cpu_notify;  #endif  	u64				(*clock)(void); + +	struct rb_irq_work		irq_work;  };  struct ring_buffer_iter { @@ -508,6 +521,118 @@ struct ring_buffer_iter {  	u64				read_stamp;  }; +/* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * + * Schedules a delayed work to wake up any task that is blocked on the + * ring buffer waiters queue. + */ +static void rb_wake_up_waiters(struct irq_work *work) +{ +	struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + +	wake_up_all(&rbwork->waiters); +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +void ring_buffer_wait(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	DEFINE_WAIT(wait); +	struct rb_irq_work *work; + +	/* +	 * Depending on what the caller is waiting for, either any +	 * data in any cpu buffer, or a specific buffer, put the +	 * caller on the appropriate wait queue. +	 */ +	if (cpu == RING_BUFFER_ALL_CPUS) +		work = &buffer->irq_work; +	else { +		cpu_buffer = buffer->buffers[cpu]; +		work = &cpu_buffer->irq_work; +	} + + +	prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + +	/* +	 * The events can happen in critical sections where +	 * checking a work queue can cause deadlocks. +	 * After adding a task to the queue, this flag is set +	 * only to notify events to try to wake up the queue +	 * using irq_work. +	 * +	 * We don't clear it even if the buffer is no longer +	 * empty. The flag only causes the next event to run +	 * irq_work to do the work queue wake up. The worse +	 * that can happen if we race with !trace_empty() is that +	 * an event will cause an irq_work to try to wake up +	 * an empty queue. +	 * +	 * There's no reason to protect this flag either, as +	 * the work queue and irq_work logic will do the necessary +	 * synchronization for the wake ups. The only thing +	 * that is necessary is that the wake up happens after +	 * a task has been queued. It's OK for spurious wake ups. 
+	 */ +	work->waiters_pending = true; + +	if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || +	    (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) +		schedule(); + +	finish_wait(&work->waiters, &wait); +} + +/** + * ring_buffer_poll_wait - poll on buffer input + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + * + * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * zero otherwise. + */ +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +			  struct file *filp, poll_table *poll_table) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	struct rb_irq_work *work; + +	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || +	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) +		return POLLIN | POLLRDNORM; + +	if (cpu == RING_BUFFER_ALL_CPUS) +		work = &buffer->irq_work; +	else { +		cpu_buffer = buffer->buffers[cpu]; +		work = &cpu_buffer->irq_work; +	} + +	work->waiters_pending = true; +	poll_wait(filp, &work->waiters, poll_table); + +	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || +	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) +		return POLLIN | POLLRDNORM; +	return 0; +} +  /* buffer may be either ring_buffer or ring_buffer_per_cpu */  #define RB_WARN_ON(b, cond)						\  	({								\ @@ -1063,6 +1188,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)  	cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;  	INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);  	init_completion(&cpu_buffer->update_done); +	init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); +	init_waitqueue_head(&cpu_buffer->irq_work.waiters);  	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),  			    GFP_KERNEL, cpu_to_node(cpu)); @@ -1158,6 +1285,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,  	buffer->clock = trace_clock_local;  	buffer->reader_lock_key = key; +	init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); +	init_waitqueue_head(&buffer->irq_work.waiters); +  	/* need at least two pages */  	if (nr_pages < 2)  		nr_pages = 2; @@ -1553,11 +1683,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,  			if (!cpu_buffer->nr_pages_to_update)  				continue; -			if (cpu_online(cpu)) +			/* The update must run on the CPU that is being updated. */ +			preempt_disable(); +			if (cpu == smp_processor_id() || !cpu_online(cpu)) { +				rb_update_pages(cpu_buffer); +				cpu_buffer->nr_pages_to_update = 0; +			} else { +				/* +				 * Can not disable preemption for schedule_work_on() +				 * on PREEMPT_RT. +				 */ +				preempt_enable();  				schedule_work_on(cpu,  						&cpu_buffer->update_pages_work); -			else -				rb_update_pages(cpu_buffer); +				preempt_disable(); +			} +			preempt_enable();  		}  		/* wait for all the updates to complete */ @@ -1595,12 +1736,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,  		get_online_cpus(); -		if (cpu_online(cpu_id)) { +		preempt_disable(); +		/* The update must run on the CPU that is being updated. 
 */
+		if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
+			rb_update_pages(cpu_buffer);
+		else {
+			/*
+			 * Can not disable preemption for schedule_work_on()
+			 * on PREEMPT_RT.
+			 */
+			preempt_enable();
 			schedule_work_on(cpu_id,
 					 &cpu_buffer->update_pages_work);
 			wait_for_completion(&cpu_buffer->update_done);
-		} else
-			rb_update_pages(cpu_buffer);
+			preempt_disable();
+		}
+		preempt_enable();
 
 		cpu_buffer->nr_pages_to_update = 0;
 		put_online_cpus();
@@ -2612,6 +2763,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 	rb_end_commit(cpu_buffer);
 }
+static __always_inline void
+rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+{
+	if (buffer->irq_work.waiters_pending) {
+		buffer->irq_work.waiters_pending = false;
+		/* irq_work_queue() supplies its own memory barriers */
+		irq_work_queue(&buffer->irq_work.work);
+	}
+
+	if (cpu_buffer->irq_work.waiters_pending) {
+		cpu_buffer->irq_work.waiters_pending = false;
+		/* irq_work_queue() supplies its own memory barriers */
+		irq_work_queue(&cpu_buffer->irq_work.work);
+	}
+}
+
 /**
  * ring_buffer_unlock_commit - commit a reserved
  * @buffer: The buffer to commit to
@@ -2631,6 +2798,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
 	rb_commit(cpu_buffer, event);
 
+	rb_wakeups(buffer, cpu_buffer);
+
 	trace_recursive_unlock();
 
 	preempt_enable_notrace();
@@ -2803,6 +2972,8 @@ int ring_buffer_write(struct ring_buffer *buffer,
 
 	rb_commit(cpu_buffer, event);
 
+	rb_wakeups(buffer, cpu_buffer);
+
 	ret = 0;
  out:
 	preempt_enable_notrace();
@@ -4467,3 +4638,320 @@ static int rb_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 #endif
+
+#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
+/*
+ * This is a basic integrity check of the ring buffer.
+ * Late in the boot cycle this test will run when configured in.
+ * It will kick off a thread per CPU that will go into a loop
+ * writing to the per cpu ring buffer various sizes of data.
+ * Some of the data will be large items, some small.
+ *
+ * Another thread is created that goes into a spin, sending out
+ * IPIs to the other CPUs to also write into the ring buffer.
+ * This is to test the nesting ability of the buffer.
+ *
+ * Basic stats are recorded and reported. If something in the
+ * ring buffer should happen that's not expected, a big warning
+ * is displayed and all ring buffers are disabled. 
+ */
+static struct task_struct *rb_threads[NR_CPUS] __initdata;
+
+struct rb_test_data {
+	struct ring_buffer	*buffer;
+	unsigned long		events;
+	unsigned long		bytes_written;
+	unsigned long		bytes_alloc;
+	unsigned long		bytes_dropped;
+	unsigned long		events_nested;
+	unsigned long		bytes_written_nested;
+	unsigned long		bytes_alloc_nested;
+	unsigned long		bytes_dropped_nested;
+	int			min_size_nested;
+	int			max_size_nested;
+	int			max_size;
+	int			min_size;
+	int			cpu;
+	int			cnt;
+};
+
+static struct rb_test_data rb_data[NR_CPUS] __initdata;
+
+/* 1 meg per cpu */
+#define RB_TEST_BUFFER_SIZE	1048576
+
+static char rb_string[] __initdata =
+	"abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
+	"?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
+	"!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
+
+static bool rb_test_started __initdata;
+
+struct rb_item {
+	int size;
+	char str[];
+};
+
+static __init int rb_write_something(struct rb_test_data *data, bool nested)
+{
+	struct ring_buffer_event *event;
+	struct rb_item *item;
+	bool started;
+	int event_len;
+	int size;
+	int len;
+	int cnt;
+
+	/* Have nested writes different than what is written */
+	cnt = data->cnt + (nested ? 27 : 0);
+
+	/* Multiply cnt by ~e, to make some unique increment */
+	size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
+
+	len = size + sizeof(struct rb_item);
+
+	started = rb_test_started;
+	/* read rb_test_started before checking buffer enabled */
+	smp_rmb();
+
+	event = ring_buffer_lock_reserve(data->buffer, len);
+	if (!event) {
+		/* Ignore dropped events before test starts. */
+		if (started) {
+			if (nested)
+				data->bytes_dropped_nested += len;
+			else
+				data->bytes_dropped += len;
+		}
+		return len;
+	}
+
+	event_len = ring_buffer_event_length(event);
+
+	if (RB_WARN_ON(data->buffer, event_len < len))
+		goto out;
+
+	item = ring_buffer_event_data(event);
+	item->size = size;
+	memcpy(item->str, rb_string, size);
+
+	if (nested) {
+		data->bytes_alloc_nested += event_len;
+		data->bytes_written_nested += len;
+		data->events_nested++;
+		if (!data->min_size_nested || len < data->min_size_nested)
+			data->min_size_nested = len;
+		if (len > data->max_size_nested)
+			data->max_size_nested = len;
+	} else {
+		data->bytes_alloc += event_len;
+		data->bytes_written += len;
+		data->events++;
+		if (!data->min_size || len < data->min_size)
+			data->min_size = len;
+		if (len > data->max_size)
+			data->max_size = len;
+	}
+
+ out:
+	ring_buffer_unlock_commit(data->buffer, event);
+
+	return 0;
+}
+
+static __init int rb_test(void *arg)
+{
+	struct rb_test_data *data = arg;
+
+	while (!kthread_should_stop()) {
+		rb_write_something(data, false);
+		data->cnt++;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		/* Now sleep between a min of 100-300us and a max of 1ms */
+		usleep_range(((data->cnt % 3) + 1) * 100, 1000);
+	}
+
+	return 0;
+}
+
+static __init void rb_ipi(void *ignore)
+{
+	struct rb_test_data *data;
+	int cpu = smp_processor_id();
+
+	data = &rb_data[cpu];
+	rb_write_something(data, true);
+}
+
+static __init int rb_hammer_test(void *arg)
+{
+	while (!kthread_should_stop()) {
+
+		/* Send an IPI to all cpus to write data! 
*/ +		smp_call_function(rb_ipi, NULL, 1); +		/* No sleep, but for non preempt, let others run */ +		schedule(); +	} + +	return 0; +} + +static __init int test_ringbuffer(void) +{ +	struct task_struct *rb_hammer; +	struct ring_buffer *buffer; +	int cpu; +	int ret = 0; + +	pr_info("Running ring buffer tests...\n"); + +	buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); +	if (WARN_ON(!buffer)) +		return 0; + +	/* Disable buffer so that threads can't write to it yet */ +	ring_buffer_record_off(buffer); + +	for_each_online_cpu(cpu) { +		rb_data[cpu].buffer = buffer; +		rb_data[cpu].cpu = cpu; +		rb_data[cpu].cnt = cpu; +		rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], +						 "rbtester/%d", cpu); +		if (WARN_ON(!rb_threads[cpu])) { +			pr_cont("FAILED\n"); +			ret = -1; +			goto out_free; +		} + +		kthread_bind(rb_threads[cpu], cpu); + 		wake_up_process(rb_threads[cpu]); +	} + +	/* Now create the rb hammer! */ +	rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); +	if (WARN_ON(!rb_hammer)) { +		pr_cont("FAILED\n"); +		ret = -1; +		goto out_free; +	} + +	ring_buffer_record_on(buffer); +	/* +	 * Show buffer is enabled before setting rb_test_started. +	 * Yes there's a small race window where events could be +	 * dropped and the thread wont catch it. But when a ring +	 * buffer gets enabled, there will always be some kind of +	 * delay before other CPUs see it. Thus, we don't care about +	 * those dropped events. We care about events dropped after +	 * the threads see that the buffer is active. +	 */ +	smp_wmb(); +	rb_test_started = true; + +	set_current_state(TASK_INTERRUPTIBLE); +	/* Just run for 10 seconds */; +	schedule_timeout(10 * HZ); + +	kthread_stop(rb_hammer); + + out_free: +	for_each_online_cpu(cpu) { +		if (!rb_threads[cpu]) +			break; +		kthread_stop(rb_threads[cpu]); +	} +	if (ret) { +		ring_buffer_free(buffer); +		return ret; +	} + +	/* Report! 
*/ +	pr_info("finished\n"); +	for_each_online_cpu(cpu) { +		struct ring_buffer_event *event; +		struct rb_test_data *data = &rb_data[cpu]; +		struct rb_item *item; +		unsigned long total_events; +		unsigned long total_dropped; +		unsigned long total_written; +		unsigned long total_alloc; +		unsigned long total_read = 0; +		unsigned long total_size = 0; +		unsigned long total_len = 0; +		unsigned long total_lost = 0; +		unsigned long lost; +		int big_event_size; +		int small_event_size; + +		ret = -1; + +		total_events = data->events + data->events_nested; +		total_written = data->bytes_written + data->bytes_written_nested; +		total_alloc = data->bytes_alloc + data->bytes_alloc_nested; +		total_dropped = data->bytes_dropped + data->bytes_dropped_nested; + +		big_event_size = data->max_size + data->max_size_nested; +		small_event_size = data->min_size + data->min_size_nested; + +		pr_info("CPU %d:\n", cpu); +		pr_info("              events:    %ld\n", total_events); +		pr_info("       dropped bytes:    %ld\n", total_dropped); +		pr_info("       alloced bytes:    %ld\n", total_alloc); +		pr_info("       written bytes:    %ld\n", total_written); +		pr_info("       biggest event:    %d\n", big_event_size); +		pr_info("      smallest event:    %d\n", small_event_size); + +		if (RB_WARN_ON(buffer, total_dropped)) +			break; + +		ret = 0; + +		while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { +			total_lost += lost; +			item = ring_buffer_event_data(event); +			total_len += ring_buffer_event_length(event); +			total_size += item->size + sizeof(struct rb_item); +			if (memcmp(&item->str[0], rb_string, item->size) != 0) { +				pr_info("FAILED!\n"); +				pr_info("buffer had: %.*s\n", item->size, item->str); +				pr_info("expected:   %.*s\n", item->size, rb_string); +				RB_WARN_ON(buffer, 1); +				ret = -1; +				break; +			} +			total_read++; +		} +		if (ret) +			break; + +		ret = -1; + +		pr_info("         read events:   %ld\n", total_read); +		pr_info("         lost events:   %ld\n", total_lost); +		pr_info("        total events:   %ld\n", total_lost + total_read); +		pr_info("  recorded len bytes:   %ld\n", total_len); +		pr_info(" recorded size bytes:   %ld\n", total_size); +		if (total_lost) +			pr_info(" With dropped events, record len and size may not match\n" +				" alloced and written from above\n"); +		if (!total_lost) { +			if (RB_WARN_ON(buffer, total_len != total_alloc || +				       total_size != total_written)) +				break; +		} +		if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) +			break; + +		ret = 0; +	} +	if (!ret) +		pr_info("Ring buffer PASSED!\n"); + +	ring_buffer_free(buffer); +	return 0; +} + +late_initcall(test_ringbuffer); +#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c2e2c231037..581630a6387 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,7 +1,7 @@  /*   * ring buffer based function tracer   * - * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>   * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>   *   * Originally taken from the RT patch by: @@ -19,7 +19,6 @@  #include <linux/seq_file.h>  #include <linux/notifier.h>  #include <linux/irqflags.h> -#include <linux/irq_work.h>  #include <linux/debugfs.h>  #include <linux/pagemap.h>  #include <linux/hardirq.h> @@ -48,7 +47,7 @@   * On boot up, the ring buffer is set to the minimum size, so that   * we do not waste memory on systems that 
are not using tracing.   */ -int ring_buffer_expanded; +bool ring_buffer_expanded;  /*   * We need to change this state when a selftest is running. @@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)  static DEFINE_PER_CPU(bool, trace_cmdline_save);  /* - * When a reader is waiting for data, then this variable is - * set to true. - */ -static bool trace_wakeup_needed; - -static struct irq_work trace_work_wakeup; - -/*   * Kill all tracing for good (never come back).   * It is initialized to 1 but will turn to zero if the initialization   * of the tracer is successful. But that is the only place that sets @@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf);  static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;  static char *default_bootup_tracer; +static bool allocate_snapshot; +  static int __init set_cmdline_ftrace(char *str)  { -	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); +	strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);  	default_bootup_tracer = bootup_tracer_buf;  	/* We are using ftrace early, expand it */ -	ring_buffer_expanded = 1; +	ring_buffer_expanded = true;  	return 1;  }  __setup("ftrace=", set_cmdline_ftrace); @@ -156,13 +149,22 @@ static int __init set_ftrace_dump_on_oops(char *str)  }  __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init boot_alloc_snapshot(char *str) +{ +	allocate_snapshot = true; +	/* We also need the main ring buffer expanded */ +	ring_buffer_expanded = true; +	return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); +  static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;  static char *trace_boot_options __initdata;  static int __init set_trace_boot_options(char *str)  { -	strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); +	strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);  	trace_boot_options = trace_boot_options_buf;  	return 0;  } @@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec)   */  static struct trace_array	global_trace; -static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +LIST_HEAD(ftrace_trace_arrays);  int filter_current_check_discard(struct ring_buffer *buffer,  				 struct ftrace_event_call *call, void *rec, @@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu)  	u64 ts;  	/* Early boot up does not have a buffer yet */ -	if (!global_trace.buffer) +	if (!global_trace.trace_buffer.buffer)  		return trace_clock_local(); -	ts = ring_buffer_time_stamp(global_trace.buffer, cpu); -	ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); +	ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); +	ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);  	return ts;  } -/* - * The max_tr is used to snapshot the global_trace when a maximum - * latency is reached. Some tracers will use this to store a maximum - * trace while it continues examining live traces. - * - * The buffers for the max_tr are set up the same as the global_trace. - * When a snapshot is taken, the link list of the max_tr is swapped - * with the link list of the global_trace and the buffers are reset for - * the global_trace so the tracing can continue. - */ -static struct trace_array	max_tr; - -static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); -  int tracing_is_enabled(void)  {  	return tracing_is_on(); @@ -249,9 +237,6 @@ static unsigned long		trace_buf_size = TRACE_BUF_SIZE_DEFAULT;  /* trace_types holds a link list of available tracers. 
*/  static struct tracer		*trace_types __read_mostly; -/* current_trace points to the tracer that is currently active */ -static struct tracer		*current_trace __read_mostly = &nop_trace; -  /*   * trace_types_lock is used to protect the trace_types list.   */ @@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);  static inline void trace_access_lock(int cpu)  { -	if (cpu == TRACE_PIPE_ALL_CPU) { +	if (cpu == RING_BUFFER_ALL_CPUS) {  		/* gain it for accessing the whole ring buffer. */  		down_write(&all_cpu_access_lock);  	} else {  		/* gain it for accessing a cpu ring buffer. */ -		/* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ +		/* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */  		down_read(&all_cpu_access_lock);  		/* Secondly block other access to this @cpu ring buffer. */ @@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu)  static inline void trace_access_unlock(int cpu)  { -	if (cpu == TRACE_PIPE_ALL_CPU) { +	if (cpu == RING_BUFFER_ALL_CPUS) {  		up_write(&all_cpu_access_lock);  	} else {  		mutex_unlock(&per_cpu(cpu_access_lock, cpu)); @@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void)  #endif -/* trace_wait is a waitqueue for tasks blocked on trace_poll */ -static DECLARE_WAIT_QUEUE_HEAD(trace_wait); -  /* trace_flags holds trace_options default values */  unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |  	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |  	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | -	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; - -static int trace_stop_count; -static DEFINE_RAW_SPINLOCK(tracing_start_lock); - -/** - * trace_wake_up - wake up tasks waiting for trace input - * - * Schedules a delayed work to wake up any task that is blocked on the - * trace_wait queue. These is used with trace_poll for tasks polling the - * trace. - */ -static void trace_wake_up(struct irq_work *work) -{ -	wake_up_all(&trace_wait); - -} +	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;  /**   * tracing_on - enable tracing buffers @@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work)   */  void tracing_on(void)  { -	if (global_trace.buffer) -		ring_buffer_record_on(global_trace.buffer); +	if (global_trace.trace_buffer.buffer) +		ring_buffer_record_on(global_trace.trace_buffer.buffer);  	/*  	 * This flag is only looked at when buffers haven't been  	 * allocated yet. We don't really care about the race @@ -385,6 +351,196 @@ void tracing_on(void)  EXPORT_SYMBOL_GPL(tracing_on);  /** + * __trace_puts - write a constant string into the trace buffer. + * @ip:	   The address of the caller + * @str:   The constant string to write + * @size:  The size of the string. 
+ */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ +	struct ring_buffer_event *event; +	struct ring_buffer *buffer; +	struct print_entry *entry; +	unsigned long irq_flags; +	int alloc; + +	alloc = sizeof(*entry) + size + 2; /* possible \n added */ + +	local_save_flags(irq_flags); +	buffer = global_trace.trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,  +					  irq_flags, preempt_count()); +	if (!event) +		return 0; + +	entry = ring_buffer_event_data(event); +	entry->ip = ip; + +	memcpy(&entry->buf, str, size); + +	/* Add a newline if necessary */ +	if (entry->buf[size - 1] != '\n') { +		entry->buf[size] = '\n'; +		entry->buf[size + 1] = '\0'; +	} else +		entry->buf[size] = '\0'; + +	__buffer_unlock_commit(buffer, event); + +	return size; +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip:	   The address of the caller + * @str:   The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ +	struct ring_buffer_event *event; +	struct ring_buffer *buffer; +	struct bputs_entry *entry; +	unsigned long irq_flags; +	int size = sizeof(struct bputs_entry); + +	local_save_flags(irq_flags); +	buffer = global_trace.trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, +					  irq_flags, preempt_count()); +	if (!event) +		return 0; + +	entry = ring_buffer_event_data(event); +	entry->ip			= ip; +	entry->str			= str; + +	__buffer_unlock_commit(buffer, event); + +	return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + +#ifdef CONFIG_TRACER_SNAPSHOT +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ +	struct trace_array *tr = &global_trace; +	struct tracer *tracer = tr->current_trace; +	unsigned long flags; + +	if (in_nmi()) { +		internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); +		internal_trace_puts("*** snapshot is being ignored        ***\n"); +		return; +	} + +	if (!tr->allocated_snapshot) { +		internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); +		internal_trace_puts("*** stopping trace here!   
***\n"); +		tracing_off(); +		return; +	} + +	/* Note, snapshot can not be used when the tracer uses it */ +	if (tracer->use_max_tr) { +		internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); +		internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); +		return; +	} + +	local_irq_save(flags); +	update_max_tr(tr, current, smp_processor_id()); +	local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); + +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, +					struct trace_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); + +static int alloc_snapshot(struct trace_array *tr) +{ +	int ret; + +	if (!tr->allocated_snapshot) { + +		/* allocate spare buffer */ +		ret = resize_buffer_duplicate_size(&tr->max_buffer, +				   &tr->trace_buffer, RING_BUFFER_ALL_CPUS); +		if (ret < 0) +			return ret; + +		tr->allocated_snapshot = true; +	} + +	return 0; +} + +void free_snapshot(struct trace_array *tr) +{ +	/* +	 * We don't free the ring buffer. instead, resize it because +	 * The max_tr ring buffer has some state (e.g. ring->clock) and +	 * we want preserve it. +	 */ +	ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); +	set_buffer_entries(&tr->max_buffer, 1); +	tracing_reset_online_cpus(&tr->max_buffer); +	tr->allocated_snapshot = false; +} + +/** + * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to trace_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + */ +void tracing_snapshot_alloc(void) +{ +	struct trace_array *tr = &global_trace; +	int ret; + +	ret = alloc_snapshot(tr); +	if (WARN_ON(ret < 0)) +		return; + +	tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#else +void tracing_snapshot(void) +{ +	WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); +void tracing_snapshot_alloc(void) +{ +	/* Give warning */ +	tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#endif /* CONFIG_TRACER_SNAPSHOT */ + +/**   * tracing_off - turn off tracing buffers   *   * This function stops the tracing buffers from recording data. @@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on);   */  void tracing_off(void)  { -	if (global_trace.buffer) -		ring_buffer_record_off(global_trace.buffer); +	if (global_trace.trace_buffer.buffer) +		ring_buffer_record_off(global_trace.trace_buffer.buffer);  	/*  	 * This flag is only looked at when buffers haven't been  	 * allocated yet. 
We don't really care about the race @@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off);   */  int tracing_is_on(void)  { -	if (global_trace.buffer) -		return ring_buffer_record_is_on(global_trace.buffer); +	if (global_trace.trace_buffer.buffer) +		return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);  	return !global_trace.buffer_disabled;  }  EXPORT_SYMBOL_GPL(tracing_is_on); @@ -479,6 +635,7 @@ static const char *trace_options[] = {  	"disable_on_free",  	"irq-info",  	"markers", +	"function-trace",  	NULL  }; @@ -490,6 +647,8 @@ static struct {  	{ trace_clock_local,	"local",	1 },  	{ trace_clock_global,	"global",	1 },  	{ trace_clock_counter,	"counter",	0 }, +	{ trace_clock_jiffies,	"uptime",	1 }, +	{ trace_clock,		"perf",		1 },  	ARCH_TRACE_CLOCKS  }; @@ -670,13 +829,14 @@ unsigned long __read_mostly	tracing_max_latency;  static void  __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  { -	struct trace_array_cpu *data = tr->data[cpu]; -	struct trace_array_cpu *max_data; +	struct trace_buffer *trace_buf = &tr->trace_buffer; +	struct trace_buffer *max_buf = &tr->max_buffer; +	struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); +	struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); -	max_tr.cpu = cpu; -	max_tr.time_start = data->preempt_timestamp; +	max_buf->cpu = cpu; +	max_buf->time_start = data->preempt_timestamp; -	max_data = max_tr.data[cpu];  	max_data->saved_latency = tracing_max_latency;  	max_data->critical_start = data->critical_start;  	max_data->critical_end = data->critical_end; @@ -704,23 +864,24 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  void  update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  { -	struct ring_buffer *buf = tr->buffer; +	struct ring_buffer *buf; -	if (trace_stop_count) +	if (tr->stop_count)  		return;  	WARN_ON_ONCE(!irqs_disabled()); -	if (!current_trace->allocated_snapshot) { +	if (!tr->allocated_snapshot) {  		/* Only the nop tracer should hit this when disabling */ -		WARN_ON_ONCE(current_trace != &nop_trace); +		WARN_ON_ONCE(tr->current_trace != &nop_trace);  		return;  	}  	arch_spin_lock(&ftrace_max_lock); -	tr->buffer = max_tr.buffer; -	max_tr.buffer = buf; +	buf = tr->trace_buffer.buffer; +	tr->trace_buffer.buffer = tr->max_buffer.buffer; +	tr->max_buffer.buffer = buf;  	__update_max_tr(tr, tsk, cpu);  	arch_spin_unlock(&ftrace_max_lock); @@ -739,16 +900,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  {  	int ret; -	if (trace_stop_count) +	if (tr->stop_count)  		return;  	WARN_ON_ONCE(!irqs_disabled()); -	if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) +	if (tr->allocated_snapshot) { +		/* Only the nop tracer should hit this when disabling */ +		WARN_ON_ONCE(tr->current_trace != &nop_trace);  		return; +	}  	arch_spin_lock(&ftrace_max_lock); -	ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); +	ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);  	if (ret == -EBUSY) {  		/* @@ -757,7 +921,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  		 * the max trace buffer (no one writes directly to it)  		 * and flag that it failed.  		 
*/ -		trace_array_printk(&max_tr, _THIS_IP_, +		trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,  			"Failed to swap buffers due to commit in progress\n");  	} @@ -770,37 +934,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  static void default_wait_pipe(struct trace_iterator *iter)  { -	DEFINE_WAIT(wait); +	/* Iterators are static, they should be filled or empty */ +	if (trace_buffer_iter(iter, iter->cpu_file)) +		return; -	prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); +	ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); +} + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int run_tracer_selftest(struct tracer *type) +{ +	struct trace_array *tr = &global_trace; +	struct tracer *saved_tracer = tr->current_trace; +	int ret; + +	if (!type->selftest || tracing_selftest_disabled) +		return 0;  	/* -	 * The events can happen in critical sections where -	 * checking a work queue can cause deadlocks. -	 * After adding a task to the queue, this flag is set -	 * only to notify events to try to wake up the queue -	 * using irq_work. -	 * -	 * We don't clear it even if the buffer is no longer -	 * empty. The flag only causes the next event to run -	 * irq_work to do the work queue wake up. The worse -	 * that can happen if we race with !trace_empty() is that -	 * an event will cause an irq_work to try to wake up -	 * an empty queue. -	 * -	 * There's no reason to protect this flag either, as -	 * the work queue and irq_work logic will do the necessary -	 * synchronization for the wake ups. The only thing -	 * that is necessary is that the wake up happens after -	 * a task has been queued. It's OK for spurious wake ups. +	 * Run a selftest on this tracer. +	 * Here we reset the trace buffer, and set the current +	 * tracer to be this tracer. The tracer can then run some +	 * internal tracing to verify that everything is in order. +	 * If we fail, we do not register this tracer.  	 */ -	trace_wakeup_needed = true; +	tracing_reset_online_cpus(&tr->trace_buffer); -	if (trace_empty(iter)) -		schedule(); +	tr->current_trace = type; + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (type->use_max_tr) { +		/* If we expanded the buffers, make sure the max is expanded too */ +		if (ring_buffer_expanded) +			ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, +					   RING_BUFFER_ALL_CPUS); +		tr->allocated_snapshot = true; +	} +#endif -	finish_wait(&trace_wait, &wait); +	/* the test is responsible for initializing and enabling */ +	pr_info("Testing tracer %s: ", type->name); +	ret = type->selftest(type, tr); +	/* the test is responsible for resetting too */ +	tr->current_trace = saved_tracer; +	if (ret) { +		printk(KERN_CONT "FAILED!\n"); +		/* Add the warning after printing 'FAILED' */ +		WARN_ON(1); +		return -1; +	} +	/* Only reset on passing, to avoid touching corrupted buffers */ +	tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (type->use_max_tr) { +		tr->allocated_snapshot = false; + +		/* Shrink the max buffer again */ +		if (ring_buffer_expanded) +			ring_buffer_resize(tr->max_buffer.buffer, 1, +					   RING_BUFFER_ALL_CPUS); +	} +#endif + +	printk(KERN_CONT "PASSED\n"); +	return 0;  } +#else +static inline int run_tracer_selftest(struct tracer *type) +{ +	return 0; +} +#endif /* CONFIG_FTRACE_STARTUP_TEST */  /**   * register_tracer - register a tracer with the ftrace system. 
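As context for the new snapshot API introduced earlier in this file: tracing_snapshot() swaps the live buffer with the spare (max) buffer, and tracing_snapshot_alloc() sizes that spare buffer first. Below is a minimal, hypothetical sketch of a module using the two exported helpers; it assumes CONFIG_TRACER_SNAPSHOT=y and that the prototypes are visible via <linux/kernel.h>, next to tracing_on()/tracing_off().

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>

static int __init snap_demo_init(void)
{
	/* May sleep: sizes the spare (max) buffer to match the live buffer. */
	tracing_snapshot_alloc();
	return 0;
}

static void __maybe_unused snap_demo_on_error(void)
{
	trace_printk("demo: error path hit, preserving trace\n");
	/* Swap live and snapshot buffers; tracing keeps running. */
	tracing_snapshot();
}

module_init(snap_demo_init);
MODULE_LICENSE("GPL");

The captured data would then be read back through the snapshot debugfs file documented further down in this patch.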
@@ -847,57 +1052,9 @@ int register_tracer(struct tracer *type)  	if (!type->wait_pipe)  		type->wait_pipe = default_wait_pipe; - -#ifdef CONFIG_FTRACE_STARTUP_TEST -	if (type->selftest && !tracing_selftest_disabled) { -		struct tracer *saved_tracer = current_trace; -		struct trace_array *tr = &global_trace; - -		/* -		 * Run a selftest on this tracer. -		 * Here we reset the trace buffer, and set the current -		 * tracer to be this tracer. The tracer can then run some -		 * internal tracing to verify that everything is in order. -		 * If we fail, we do not register this tracer. -		 */ -		tracing_reset_online_cpus(tr); - -		current_trace = type; - -		if (type->use_max_tr) { -			/* If we expanded the buffers, make sure the max is expanded too */ -			if (ring_buffer_expanded) -				ring_buffer_resize(max_tr.buffer, trace_buf_size, -						   RING_BUFFER_ALL_CPUS); -			type->allocated_snapshot = true; -		} - -		/* the test is responsible for initializing and enabling */ -		pr_info("Testing tracer %s: ", type->name); -		ret = type->selftest(type, tr); -		/* the test is responsible for resetting too */ -		current_trace = saved_tracer; -		if (ret) { -			printk(KERN_CONT "FAILED!\n"); -			/* Add the warning after printing 'FAILED' */ -			WARN_ON(1); -			goto out; -		} -		/* Only reset on passing, to avoid touching corrupted buffers */ -		tracing_reset_online_cpus(tr); - -		if (type->use_max_tr) { -			type->allocated_snapshot = false; - -			/* Shrink the max buffer again */ -			if (ring_buffer_expanded) -				ring_buffer_resize(max_tr.buffer, 1, -						   RING_BUFFER_ALL_CPUS); -		} - -		printk(KERN_CONT "PASSED\n"); -	} -#endif +	ret = run_tracer_selftest(type); +	if (ret < 0) +		goto out;  	type->next = trace_types;  	trace_types = type; @@ -917,7 +1074,7 @@ int register_tracer(struct tracer *type)  	tracing_set_tracer(type->name);  	default_bootup_tracer = NULL;  	/* disable other selftests, since this will break it. 
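With the selftest moved into run_tracer_selftest() above, register_tracer() itself is mostly bookkeeping. The following is a rough, hypothetical sketch of the in-tree registration pattern it serves; the struct tracer fields are heavily abbreviated and the file name is invented.

/* hypothetical kernel/trace/trace_demo.c */
#include "trace.h"

static int demo_tracer_init(struct trace_array *tr)
{
	tracing_reset_online_cpus(&tr->trace_buffer);
	return 0;
}

static void demo_tracer_reset(struct trace_array *tr)
{
	tracing_reset_online_cpus(&tr->trace_buffer);
}

static struct tracer demo_tracer __read_mostly = {
	.name	= "demo",
	.init	= demo_tracer_init,
	.reset	= demo_tracer_reset,
	/* a .selftest hook would be set here under CONFIG_FTRACE_STARTUP_TEST */
};

static __init int init_demo_tracer(void)
{
	/* register_tracer() runs run_tracer_selftest() before listing the tracer */
	return register_tracer(&demo_tracer);
}
device_initcall(init_demo_tracer);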
*/ -	tracing_selftest_disabled = 1; +	tracing_selftest_disabled = true;  #ifdef CONFIG_FTRACE_STARTUP_TEST  	printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",  	       type->name); @@ -927,9 +1084,9 @@ int register_tracer(struct tracer *type)  	return ret;  } -void tracing_reset(struct trace_array *tr, int cpu) +void tracing_reset(struct trace_buffer *buf, int cpu)  { -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = buf->buffer;  	if (!buffer)  		return; @@ -943,9 +1100,9 @@ void tracing_reset(struct trace_array *tr, int cpu)  	ring_buffer_record_enable(buffer);  } -void tracing_reset_online_cpus(struct trace_array *tr) +void tracing_reset_online_cpus(struct trace_buffer *buf)  { -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = buf->buffer;  	int cpu;  	if (!buffer) @@ -956,7 +1113,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)  	/* Make sure all commits have finished */  	synchronize_sched(); -	tr->time_start = ftrace_now(tr->cpu); +	buf->time_start = ftrace_now(buf->cpu);  	for_each_online_cpu(cpu)  		ring_buffer_reset_cpu(buffer, cpu); @@ -966,12 +1123,21 @@ void tracing_reset_online_cpus(struct trace_array *tr)  void tracing_reset_current(int cpu)  { -	tracing_reset(&global_trace, cpu); +	tracing_reset(&global_trace.trace_buffer, cpu);  } -void tracing_reset_current_online_cpus(void) +void tracing_reset_all_online_cpus(void)  { -	tracing_reset_online_cpus(&global_trace); +	struct trace_array *tr; + +	mutex_lock(&trace_types_lock); +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE +		tracing_reset_online_cpus(&tr->max_buffer); +#endif +	} +	mutex_unlock(&trace_types_lock);  }  #define SAVED_CMDLINES 128 @@ -994,7 +1160,7 @@ static void trace_init_cmdlines(void)  int is_tracing_stopped(void)  { -	return trace_stop_count; +	return global_trace.stop_count;  }  /** @@ -1026,12 +1192,12 @@ void tracing_start(void)  	if (tracing_disabled)  		return; -	raw_spin_lock_irqsave(&tracing_start_lock, flags); -	if (--trace_stop_count) { -		if (trace_stop_count < 0) { +	raw_spin_lock_irqsave(&global_trace.start_lock, flags); +	if (--global_trace.stop_count) { +		if (global_trace.stop_count < 0) {  			/* Someone screwed up their debugging */  			WARN_ON_ONCE(1); -			trace_stop_count = 0; +			global_trace.stop_count = 0;  		}  		goto out;  	} @@ -1039,19 +1205,52 @@ void tracing_start(void)  	/* Prevent the buffers from switching */  	arch_spin_lock(&ftrace_max_lock); -	buffer = global_trace.buffer; +	buffer = global_trace.trace_buffer.buffer;  	if (buffer)  		ring_buffer_record_enable(buffer); -	buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	buffer = global_trace.max_buffer.buffer;  	if (buffer)  		ring_buffer_record_enable(buffer); +#endif  	arch_spin_unlock(&ftrace_max_lock);  	ftrace_start();   out: -	raw_spin_unlock_irqrestore(&tracing_start_lock, flags); +	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_start_tr(struct trace_array *tr) +{ +	struct ring_buffer *buffer; +	unsigned long flags; + +	if (tracing_disabled) +		return; + +	/* If global, we need to also start the max tracer */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		return tracing_start(); + +	raw_spin_lock_irqsave(&tr->start_lock, flags); + +	if (--tr->stop_count) { +		if (tr->stop_count < 0) { +			/* Someone screwed up their debugging */ +			WARN_ON_ONCE(1); +			tr->stop_count = 0; +		} +		goto out; +	} + +	
buffer = tr->trace_buffer.buffer; +	if (buffer) +		ring_buffer_record_enable(buffer); + + out: +	raw_spin_unlock_irqrestore(&tr->start_lock, flags);  }  /** @@ -1066,25 +1265,48 @@ void tracing_stop(void)  	unsigned long flags;  	ftrace_stop(); -	raw_spin_lock_irqsave(&tracing_start_lock, flags); -	if (trace_stop_count++) +	raw_spin_lock_irqsave(&global_trace.start_lock, flags); +	if (global_trace.stop_count++)  		goto out;  	/* Prevent the buffers from switching */  	arch_spin_lock(&ftrace_max_lock); -	buffer = global_trace.buffer; +	buffer = global_trace.trace_buffer.buffer;  	if (buffer)  		ring_buffer_record_disable(buffer); -	buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	buffer = global_trace.max_buffer.buffer;  	if (buffer)  		ring_buffer_record_disable(buffer); +#endif  	arch_spin_unlock(&ftrace_max_lock);   out: -	raw_spin_unlock_irqrestore(&tracing_start_lock, flags); +	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_stop_tr(struct trace_array *tr) +{ +	struct ring_buffer *buffer; +	unsigned long flags; + +	/* If global, we need to also stop the max tracer */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		return tracing_stop(); + +	raw_spin_lock_irqsave(&tr->start_lock, flags); +	if (tr->stop_count++) +		goto out; + +	buffer = tr->trace_buffer.buffer; +	if (buffer) +		ring_buffer_record_disable(buffer); + + out: +	raw_spin_unlock_irqrestore(&tr->start_lock, flags);  }  void trace_stop_cmdline_recording(void); @@ -1217,11 +1439,6 @@ void  __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)  {  	__this_cpu_write(trace_cmdline_save, true); -	if (trace_wakeup_needed) { -		trace_wakeup_needed = false; -		/* irq_work_queue() supplies it's own memory barriers */ -		irq_work_queue(&trace_work_wakeup); -	}  	ring_buffer_unlock_commit(buffer, event);  } @@ -1245,11 +1462,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,  EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);  struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, +			  struct ftrace_event_file *ftrace_file, +			  int type, unsigned long len, +			  unsigned long flags, int pc) +{ +	*current_rb = ftrace_file->tr->trace_buffer.buffer; +	return trace_buffer_lock_reserve(*current_rb, +					 type, len, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); + +struct ring_buffer_event *  trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,  				  int type, unsigned long len,  				  unsigned long flags, int pc)  { -	*current_rb = global_trace.buffer; +	*current_rb = global_trace.trace_buffer.buffer;  	return trace_buffer_lock_reserve(*current_rb,  					 type, len, flags, pc);  } @@ -1288,7 +1517,7 @@ trace_function(struct trace_array *tr,  	       int pc)  {  	struct ftrace_event_call *call = &event_function; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct ftrace_entry *entry; @@ -1429,13 +1658,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,  void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,  		   int pc)  { -	__ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); +	__ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);  }  /**   * trace_dump_stack - record a stack back trace in the trace buffer + * @skip: Number of functions to skip (helper handlers)   */ -void trace_dump_stack(void) +void trace_dump_stack(int skip)  
{  	unsigned long flags; @@ -1444,8 +1674,13 @@ void trace_dump_stack(void)  	local_save_flags(flags); -	/* skipping 3 traces, seems to get us at the caller of this function */ -	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); +	/* +	 * Skip 3 more, seems to get us at the caller of +	 * this function. +	 */ +	skip += 3; +	__ftrace_trace_stack(global_trace.trace_buffer.buffer, +			     flags, skip, preempt_count(), NULL);  }  static DEFINE_PER_CPU(int, user_stack_count); @@ -1615,7 +1850,7 @@ void trace_printk_init_buffers(void)  	 * directly here. If the global_trace.buffer is already  	 * allocated here, then this was called by module code.  	 */ -	if (global_trace.buffer) +	if (global_trace.trace_buffer.buffer)  		tracing_start_cmdline_record();  } @@ -1675,7 +1910,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  	local_save_flags(flags);  	size = sizeof(*entry) + sizeof(u32) * len; -	buffer = tr->buffer; +	buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,  					  flags, pc);  	if (!event) @@ -1698,27 +1933,12 @@ out:  }  EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_array_printk(struct trace_array *tr, -		       unsigned long ip, const char *fmt, ...) -{ -	int ret; -	va_list ap; - -	if (!(trace_flags & TRACE_ITER_PRINTK)) -		return 0; - -	va_start(ap, fmt); -	ret = trace_array_vprintk(tr, ip, fmt, ap); -	va_end(ap); -	return ret; -} - -int trace_array_vprintk(struct trace_array *tr, -			unsigned long ip, const char *fmt, va_list args) +static int +__trace_array_vprintk(struct ring_buffer *buffer, +		      unsigned long ip, const char *fmt, va_list args)  {  	struct ftrace_event_call *call = &event_print;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer;  	int len = 0, size, pc;  	struct print_entry *entry;  	unsigned long flags; @@ -1746,7 +1966,6 @@ int trace_array_vprintk(struct trace_array *tr,  	local_save_flags(flags);  	size = sizeof(*entry) + len + 1; -	buffer = tr->buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,  					  flags, pc);  	if (!event) @@ -1767,6 +1986,42 @@ int trace_array_vprintk(struct trace_array *tr,  	return len;  } +int trace_array_vprintk(struct trace_array *tr, +			unsigned long ip, const char *fmt, va_list args) +{ +	return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); +} + +int trace_array_printk(struct trace_array *tr, +		       unsigned long ip, const char *fmt, ...) +{ +	int ret; +	va_list ap; + +	if (!(trace_flags & TRACE_ITER_PRINTK)) +		return 0; + +	va_start(ap, fmt); +	ret = trace_array_vprintk(tr, ip, fmt, ap); +	va_end(ap); +	return ret; +} + +int trace_array_printk_buf(struct ring_buffer *buffer, +			   unsigned long ip, const char *fmt, ...) 
+{ +	int ret; +	va_list ap; + +	if (!(trace_flags & TRACE_ITER_PRINTK)) +		return 0; + +	va_start(ap, fmt); +	ret = __trace_array_vprintk(buffer, ip, fmt, ap); +	va_end(ap); +	return ret; +} +  int trace_vprintk(unsigned long ip, const char *fmt, va_list args)  {  	return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -1792,7 +2047,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,  	if (buf_iter)  		event = ring_buffer_iter_peek(buf_iter, ts);  	else -		event = ring_buffer_peek(iter->tr->buffer, cpu, ts, +		event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,  					 lost_events);  	if (event) { @@ -1807,7 +2062,7 @@ static struct trace_entry *  __find_next_entry(struct trace_iterator *iter, int *ent_cpu,  		  unsigned long *missing_events, u64 *ent_ts)  { -	struct ring_buffer *buffer = iter->tr->buffer; +	struct ring_buffer *buffer = iter->trace_buffer->buffer;  	struct trace_entry *ent, *next = NULL;  	unsigned long lost_events = 0, next_lost = 0;  	int cpu_file = iter->cpu_file; @@ -1820,7 +2075,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,  	 * If we are in a per_cpu trace file, don't bother by iterating over  	 * all cpu and peek directly.  	 */ -	if (cpu_file > TRACE_PIPE_ALL_CPU) { +	if (cpu_file > RING_BUFFER_ALL_CPUS) {  		if (ring_buffer_empty_cpu(buffer, cpu_file))  			return NULL;  		ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); @@ -1884,7 +2139,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)  static void trace_consume(struct trace_iterator *iter)  { -	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, +	ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,  			    &iter->lost_events);  } @@ -1917,13 +2172,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)  void tracing_iter_reset(struct trace_iterator *iter, int cpu)  { -	struct trace_array *tr = iter->tr;  	struct ring_buffer_event *event;  	struct ring_buffer_iter *buf_iter;  	unsigned long entries = 0;  	u64 ts; -	tr->data[cpu]->skipped_entries = 0; +	per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;  	buf_iter = trace_buffer_iter(iter, cpu);  	if (!buf_iter) @@ -1937,13 +2191,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)  	 * by the timestamp being before the start of the buffer.  	 */  	while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { -		if (ts >= iter->tr->time_start) +		if (ts >= iter->trace_buffer->time_start)  			break;  		entries++;  		ring_buffer_read(buf_iter, NULL);  	} -	tr->data[cpu]->skipped_entries = entries; +	per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;  }  /* @@ -1953,6 +2207,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)  static void *s_start(struct seq_file *m, loff_t *pos)  {  	struct trace_iterator *iter = m->private; +	struct trace_array *tr = iter->tr;  	int cpu_file = iter->cpu_file;  	void *p = NULL;  	loff_t l = 0; @@ -1965,12 +2220,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)  	 * will point to the same string as current_trace->name.  	 
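One small API change worth calling out in the hunks above: trace_dump_stack() now takes a skip count, so a caller can drop its own helper frames from the recorded stack. A short sketch of a helper as it might appear on a hypothetical error path:

#include <linux/kernel.h>

static noinline void demo_report_bad_state(int state)
{
	trace_printk("unexpected state %d\n", state);
	/* Skip this helper's frame so the dump starts at the real caller. */
	trace_dump_stack(1);
}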
*/  	mutex_lock(&trace_types_lock); -	if (unlikely(current_trace && iter->trace->name != current_trace->name)) -		*iter->trace = *current_trace; +	if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock); +#ifdef CONFIG_TRACER_MAX_TRACE  	if (iter->snapshot && iter->trace->use_max_tr)  		return ERR_PTR(-EBUSY); +#endif  	if (!iter->snapshot)  		atomic_inc(&trace_record_cmdline_disabled); @@ -1980,7 +2237,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)  		iter->cpu = 0;  		iter->idx = -1; -		if (cpu_file == TRACE_PIPE_ALL_CPU) { +		if (cpu_file == RING_BUFFER_ALL_CPUS) {  			for_each_tracing_cpu(cpu)  				tracing_iter_reset(iter, cpu);  		} else @@ -2012,17 +2269,21 @@ static void s_stop(struct seq_file *m, void *p)  {  	struct trace_iterator *iter = m->private; +#ifdef CONFIG_TRACER_MAX_TRACE  	if (iter->snapshot && iter->trace->use_max_tr)  		return; +#endif  	if (!iter->snapshot)  		atomic_dec(&trace_record_cmdline_disabled); +  	trace_access_unlock(iter->cpu_file);  	trace_event_read_unlock();  }  static void -get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +get_total_entries(struct trace_buffer *buf, +		  unsigned long *total, unsigned long *entries)  {  	unsigned long count;  	int cpu; @@ -2031,19 +2292,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e  	*entries = 0;  	for_each_tracing_cpu(cpu) { -		count = ring_buffer_entries_cpu(tr->buffer, cpu); +		count = ring_buffer_entries_cpu(buf->buffer, cpu);  		/*  		 * If this buffer has skipped entries, then we hold all  		 * entries for the trace and we need to ignore the  		 * ones before the time stamp.  		 */ -		if (tr->data[cpu]->skipped_entries) { -			count -= tr->data[cpu]->skipped_entries; +		if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { +			count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;  			/* total is the same as the entries */  			*total += count;  		} else  			*total += count + -				ring_buffer_overrun_cpu(tr->buffer, cpu); +				ring_buffer_overrun_cpu(buf->buffer, cpu);  		*entries += count;  	}  } @@ -2060,27 +2321,27 @@ static void print_lat_help_header(struct seq_file *m)  	seq_puts(m, "#     \\   /      |||||  \\    |   /           \n");  } -static void print_event_info(struct trace_array *tr, struct seq_file *m) +static void print_event_info(struct trace_buffer *buf, struct seq_file *m)  {  	unsigned long total;  	unsigned long entries; -	get_total_entries(tr, &total, &entries); +	get_total_entries(buf, &total, &entries);  	seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu   #P:%d\n",  		   entries, total, num_online_cpus());  	seq_puts(m, "#\n");  } -static void print_func_help_header(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)  { -	print_event_info(tr, m); +	print_event_info(buf, m);  	seq_puts(m, "#           TASK-PID   CPU#      TIMESTAMP  FUNCTION\n");  	seq_puts(m, "#              | |       |          |         |\n");  } -static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)  { -	print_event_info(tr, m); +	print_event_info(buf, m);  	seq_puts(m, "#                              _-----=> irqs-off\n");  	seq_puts(m, "#                             / _----=> need-resched\n");  	seq_puts(m, "#                            | / 
_---=> hardirq/softirq\n"); @@ -2094,16 +2355,16 @@ void  print_trace_header(struct seq_file *m, struct trace_iterator *iter)  {  	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); -	struct trace_array *tr = iter->tr; -	struct trace_array_cpu *data = tr->data[tr->cpu]; -	struct tracer *type = current_trace; +	struct trace_buffer *buf = iter->trace_buffer; +	struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); +	struct tracer *type = iter->trace;  	unsigned long entries;  	unsigned long total;  	const char *name = "preemption";  	name = type->name; -	get_total_entries(tr, &total, &entries); +	get_total_entries(buf, &total, &entries);  	seq_printf(m, "# %s latency trace v1.1.5 on %s\n",  		   name, UTS_RELEASE); @@ -2114,7 +2375,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)  		   nsecs_to_usecs(data->saved_latency),  		   entries,  		   total, -		   tr->cpu, +		   buf->cpu,  #if defined(CONFIG_PREEMPT_NONE)  		   "server",  #elif defined(CONFIG_PREEMPT_VOLUNTARY) @@ -2165,7 +2426,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)  	if (cpumask_test_cpu(iter->cpu, iter->started))  		return; -	if (iter->tr->data[iter->cpu]->skipped_entries) +	if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)  		return;  	cpumask_set_cpu(iter->cpu, iter->started); @@ -2288,14 +2549,14 @@ int trace_empty(struct trace_iterator *iter)  	int cpu;  	/* If we are looking at one CPU buffer, only check that one */ -	if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { +	if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {  		cpu = iter->cpu_file;  		buf_iter = trace_buffer_iter(iter, cpu);  		if (buf_iter) {  			if (!ring_buffer_iter_empty(buf_iter))  				return 0;  		} else { -			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) +			if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))  				return 0;  		}  		return 1; @@ -2307,7 +2568,7 @@ int trace_empty(struct trace_iterator *iter)  			if (!ring_buffer_iter_empty(buf_iter))  				return 0;  		} else { -			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) +			if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))  				return 0;  		}  	} @@ -2331,6 +2592,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)  			return ret;  	} +	if (iter->ent->type == TRACE_BPUTS && +			trace_flags & TRACE_ITER_PRINTK && +			trace_flags & TRACE_ITER_PRINTK_MSGONLY) +		return trace_print_bputs_msg_only(iter); +  	if (iter->ent->type == TRACE_BPRINT &&  			trace_flags & TRACE_ITER_PRINTK &&  			trace_flags & TRACE_ITER_PRINTK_MSGONLY) @@ -2385,9 +2651,9 @@ void trace_default_header(struct seq_file *m)  	} else {  		if (!(trace_flags & TRACE_ITER_VERBOSE)) {  			if (trace_flags & TRACE_ITER_IRQ_INFO) -				print_func_help_header_irq(iter->tr, m); +				print_func_help_header_irq(iter->trace_buffer, m);  			else -				print_func_help_header(iter->tr, m); +				print_func_help_header(iter->trace_buffer, m);  		}  	}  } @@ -2400,6 +2666,50 @@ static void test_ftrace_alive(struct seq_file *m)  	seq_printf(m, "#          MAY BE MISSING FUNCTION EVENTS\n");  } +#ifdef CONFIG_TRACER_MAX_TRACE +static void show_snapshot_main_help(struct seq_file *m) +{ +	seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); +	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); +	seq_printf(m, "#                      Takes a snapshot of the main buffer.\n"); +	seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); +	
seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n"); +	seq_printf(m, "#                       is not a '0' or '1')\n"); +} + +static void show_snapshot_percpu_help(struct seq_file *m) +{ +	seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP +	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); +	seq_printf(m, "#                      Takes a snapshot of the main buffer for this cpu.\n"); +#else +	seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); +	seq_printf(m, "#                     Must use main snapshot file to allocate.\n"); +#endif +	seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); +	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n"); +	seq_printf(m, "#                       is not a '0' or '1')\n"); +} + +static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ +	if (iter->tr->allocated_snapshot) +		seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); +	else +		seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + +	seq_printf(m, "# Snapshot commands:\n"); +	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) +		show_snapshot_main_help(m); +	else +		show_snapshot_percpu_help(m); +} +#else +/* Should never be called */ +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } +#endif +  static int s_show(struct seq_file *m, void *v)  {  	struct trace_iterator *iter = v; @@ -2411,7 +2721,9 @@ static int s_show(struct seq_file *m, void *v)  			seq_puts(m, "#\n");  			test_ftrace_alive(m);  		} -		if (iter->trace && iter->trace->print_header) +		if (iter->snapshot && trace_empty(iter)) +			print_snapshot_help(m, iter); +		else if (iter->trace && iter->trace->print_header)  			iter->trace->print_header(m);  		else  			trace_default_header(m); @@ -2452,7 +2764,8 @@ static const struct seq_operations tracer_seq_ops = {  static struct trace_iterator *  __tracing_open(struct inode *inode, struct file *file, bool snapshot)  { -	long cpu_file = (long) inode->i_private; +	struct trace_cpu *tc = inode->i_private; +	struct trace_array *tr = tc->tr;  	struct trace_iterator *iter;  	int cpu; @@ -2477,26 +2790,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)  	if (!iter->trace)  		goto fail; -	*iter->trace = *current_trace; +	*iter->trace = *tr->current_trace;  	if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))  		goto fail; -	if (current_trace->print_max || snapshot) -		iter->tr = &max_tr; +	iter->tr = tr; + +#ifdef CONFIG_TRACER_MAX_TRACE +	/* Currently only the top directory has a snapshot */ +	if (tr->current_trace->print_max || snapshot) +		iter->trace_buffer = &tr->max_buffer;  	else -		iter->tr = &global_trace; +#endif +		iter->trace_buffer = &tr->trace_buffer;  	iter->snapshot = snapshot;  	iter->pos = -1;  	mutex_init(&iter->mutex); -	iter->cpu_file = cpu_file; +	iter->cpu_file = tc->cpu;  	/* Notify the tracer early; before we stop tracing. */  	if (iter->trace && iter->trace->open)  		iter->trace->open(iter);  	/* Annotate start of buffers if we had overruns */ -	if (ring_buffer_overruns(iter->tr->buffer)) +	if (ring_buffer_overruns(iter->trace_buffer->buffer))  		iter->iter_flags |= TRACE_FILE_ANNOTATE;  	/* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ @@ -2505,12 +2823,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)  	/* stop the trace while dumping if we are not opening "snapshot" */  	if (!iter->snapshot) -		tracing_stop(); +		tracing_stop_tr(tr); -	if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { +	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) {  			iter->buffer_iter[cpu] = -				ring_buffer_read_prepare(iter->tr->buffer, cpu); +				ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);  		}  		ring_buffer_read_prepare_sync();  		for_each_tracing_cpu(cpu) { @@ -2520,12 +2838,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)  	} else {  		cpu = iter->cpu_file;  		iter->buffer_iter[cpu] = -			ring_buffer_read_prepare(iter->tr->buffer, cpu); +			ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);  		ring_buffer_read_prepare_sync();  		ring_buffer_read_start(iter->buffer_iter[cpu]);  		tracing_iter_reset(iter, cpu);  	} +	tr->ref++; +  	mutex_unlock(&trace_types_lock);  	return iter; @@ -2552,14 +2872,20 @@ static int tracing_release(struct inode *inode, struct file *file)  {  	struct seq_file *m = file->private_data;  	struct trace_iterator *iter; +	struct trace_array *tr;  	int cpu;  	if (!(file->f_mode & FMODE_READ))  		return 0;  	iter = m->private; +	tr = iter->tr;  	mutex_lock(&trace_types_lock); + +	WARN_ON(!tr->ref); +	tr->ref--; +  	for_each_tracing_cpu(cpu) {  		if (iter->buffer_iter[cpu])  			ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -2570,7 +2896,7 @@ static int tracing_release(struct inode *inode, struct file *file)  	if (!iter->snapshot)  		/* reenable tracing if it was previously enabled */ -		tracing_start(); +		tracing_start_tr(tr);  	mutex_unlock(&trace_types_lock);  	mutex_destroy(&iter->mutex); @@ -2589,12 +2915,13 @@ static int tracing_open(struct inode *inode, struct file *file)  	/* If this file was open for write, then erase contents */  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC)) { -		long cpu = (long) inode->i_private; +		struct trace_cpu *tc = inode->i_private; +		struct trace_array *tr = tc->tr; -		if (cpu == TRACE_PIPE_ALL_CPU) -			tracing_reset_online_cpus(&global_trace); +		if (tc->cpu == RING_BUFFER_ALL_CPUS) +			tracing_reset_online_cpus(&tr->trace_buffer);  		else -			tracing_reset(&global_trace, cpu); +			tracing_reset(&tr->trace_buffer, tc->cpu);  	}  	if (file->f_mode & FMODE_READ) { @@ -2741,8 +3068,9 @@ static ssize_t  tracing_cpumask_write(struct file *filp, const char __user *ubuf,  		      size_t count, loff_t *ppos)  { -	int err, cpu; +	struct trace_array *tr = filp->private_data;  	cpumask_var_t tracing_cpumask_new; +	int err, cpu;  	if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))  		return -ENOMEM; @@ -2762,13 +3090,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,  		 */  		if (cpumask_test_cpu(cpu, tracing_cpumask) &&  				!cpumask_test_cpu(cpu, tracing_cpumask_new)) { -			atomic_inc(&global_trace.data[cpu]->disabled); -			ring_buffer_record_disable_cpu(global_trace.buffer, cpu); +			atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); +			ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);  		}  		if (!cpumask_test_cpu(cpu, tracing_cpumask) &&  				cpumask_test_cpu(cpu, tracing_cpumask_new)) { -			atomic_dec(&global_trace.data[cpu]->disabled); -			ring_buffer_record_enable_cpu(global_trace.buffer, cpu); +			atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); +			
ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);  		}  	}  	arch_spin_unlock(&ftrace_max_lock); @@ -2797,12 +3125,13 @@ static const struct file_operations tracing_cpumask_fops = {  static int tracing_trace_options_show(struct seq_file *m, void *v)  {  	struct tracer_opt *trace_opts; +	struct trace_array *tr = m->private;  	u32 tracer_flags;  	int i;  	mutex_lock(&trace_types_lock); -	tracer_flags = current_trace->flags->val; -	trace_opts = current_trace->flags->opts; +	tracer_flags = tr->current_trace->flags->val; +	trace_opts = tr->current_trace->flags->opts;  	for (i = 0; trace_options[i]; i++) {  		if (trace_flags & (1 << i)) @@ -2857,11 +3186,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)  	return -EINVAL;  } -static void set_tracer_flags(unsigned int mask, int enabled) +/* Some tracers require overwrite to stay enabled */ +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +{ +	if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) +		return -1; + +	return 0; +} + +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)  {  	/* do nothing if flag is already set */  	if (!!(trace_flags & mask) == !!enabled) -		return; +		return 0; + +	/* Give the tracer a chance to approve the change */ +	if (tr->current_trace->flag_changed) +		if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) +			return -EINVAL;  	if (enabled)  		trace_flags |= mask; @@ -2871,18 +3214,24 @@ static void set_tracer_flags(unsigned int mask, int enabled)  	if (mask == TRACE_ITER_RECORD_CMD)  		trace_event_enable_cmd_record(enabled); -	if (mask == TRACE_ITER_OVERWRITE) -		ring_buffer_change_overwrite(global_trace.buffer, enabled); +	if (mask == TRACE_ITER_OVERWRITE) { +		ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); +#ifdef CONFIG_TRACER_MAX_TRACE +		ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); +#endif +	}  	if (mask == TRACE_ITER_PRINTK)  		trace_printk_start_stop_comm(enabled); + +	return 0;  } -static int trace_set_options(char *option) +static int trace_set_options(struct trace_array *tr, char *option)  {  	char *cmp;  	int neg = 0; -	int ret = 0; +	int ret = -ENODEV;  	int i;  	cmp = strstrip(option); @@ -2892,19 +3241,20 @@ static int trace_set_options(char *option)  		cmp += 2;  	} +	mutex_lock(&trace_types_lock); +  	for (i = 0; trace_options[i]; i++) {  		if (strcmp(cmp, trace_options[i]) == 0) { -			set_tracer_flags(1 << i, !neg); +			ret = set_tracer_flag(tr, 1 << i, !neg);  			break;  		}  	}  	/* If no option could be set, test the specific tracer options */ -	if (!trace_options[i]) { -		mutex_lock(&trace_types_lock); -		ret = set_tracer_option(current_trace, cmp, neg); -		mutex_unlock(&trace_types_lock); -	} +	if (!trace_options[i]) +		ret = set_tracer_option(tr->current_trace, cmp, neg); + +	mutex_unlock(&trace_types_lock);  	return ret;  } @@ -2913,7 +3263,10 @@ static ssize_t  tracing_trace_options_write(struct file *filp, const char __user *ubuf,  			size_t cnt, loff_t *ppos)  { +	struct seq_file *m = filp->private_data; +	struct trace_array *tr = m->private;  	char buf[64]; +	int ret;  	if (cnt >= sizeof(buf))  		return -EINVAL; @@ -2923,7 +3276,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,  	buf[cnt] = 0; -	trace_set_options(buf); +	ret = trace_set_options(tr, buf); +	if (ret < 0) +		return ret;  	*ppos += cnt; @@ -2934,7 +3289,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)  {  	if 
(tracing_disabled)  		return -ENODEV; -	return single_open(file, tracing_trace_options_show, NULL); + +	return single_open(file, tracing_trace_options_show, inode->i_private);  }  static const struct file_operations tracing_iter_fops = { @@ -2947,20 +3303,84 @@ static const struct file_operations tracing_iter_fops = {  static const char readme_msg[] =  	"tracing mini-HOWTO:\n\n" -	"# mount -t debugfs nodev /sys/kernel/debug\n\n" -	"# cat /sys/kernel/debug/tracing/available_tracers\n" -	"wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" -	"# cat /sys/kernel/debug/tracing/current_tracer\n" -	"nop\n" -	"# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" -	"# cat /sys/kernel/debug/tracing/current_tracer\n" -	"wakeup\n" -	"# cat /sys/kernel/debug/tracing/trace_options\n" -	"noprint-parent nosym-offset nosym-addr noverbose\n" -	"# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" -	"# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" -	"# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" -	"# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" +	"# echo 0 > tracing_on : quick way to disable tracing\n" +	"# echo 1 > tracing_on : quick way to re-enable tracing\n\n" +	" Important files:\n" +	"  trace\t\t\t- The static contents of the buffer\n" +	"\t\t\t  To clear the buffer write into this file: echo > trace\n" +	"  trace_pipe\t\t- A consuming read to see the contents of the buffer\n" +	"  current_tracer\t- function and latency tracers\n" +	"  available_tracers\t- list of configured tracers for current_tracer\n" +	"  buffer_size_kb\t- view and modify size of per cpu buffer\n" +	"  buffer_total_size_kb  - view total size of all cpu buffers\n\n" +	"  trace_clock\t\t-change the clock used to order events\n" +	"       local:   Per cpu clock but may not be synced across CPUs\n" +	"      global:   Synced across CPUs but slows tracing down.\n" +	"     counter:   Not a clock, but just an increment\n" +	"      uptime:   Jiffy counter from time of boot\n" +	"        perf:   Same clock that perf events use\n" +#ifdef CONFIG_X86_64 +	"     x86-tsc:   TSC cycle counter\n" +#endif +	"\n  trace_marker\t\t- Writes into this file writes into the kernel buffer\n" +	"  tracing_cpumask\t- Limit which CPUs to trace\n" +	"  instances\t\t- Make sub-buffers with: mkdir instances/foo\n" +	"\t\t\t  Remove sub-buffer with rmdir\n" +	"  trace_options\t\t- Set format or modify how tracing happens\n" +	"\t\t\t  Disable an option by adding a suffix 'no' to the option name\n" +#ifdef CONFIG_DYNAMIC_FTRACE +	"\n  available_filter_functions - list of functions that can be filtered on\n" +	"  set_ftrace_filter\t- echo function name in here to only trace these functions\n" +	"            accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" +	"            modules: Can select a group via module\n" +	"             Format: :mod:<module-name>\n" +	"             example: echo :mod:ext3 > set_ftrace_filter\n" +	"            triggers: a command to perform when function is hit\n" +	"              Format: <function>:<trigger>[:count]\n" +	"             trigger: traceon, traceoff\n" +	"                      enable_event:<system>:<event>\n" +	"                      disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE +	"                      stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT +	"                      snapshot\n" +#endif +	"             example: echo do_fault:traceoff > set_ftrace_filter\n" +	"                      echo do_trap:traceoff:3 > set_ftrace_filter\n" +	"   
          The first one will disable tracing every time do_fault is hit\n" +	"             The second will disable tracing at most 3 times when do_trap is hit\n" +	"               The first time do trap is hit and it disables tracing, the counter\n" +	"               will decrement to 2. If tracing is already disabled, the counter\n" +	"               will not decrement. It only decrements when the trigger did work\n" +	"             To remove trigger without count:\n" +	"               echo '!<function>:<trigger> > set_ftrace_filter\n" +	"             To remove trigger with a count:\n" +	"               echo '!<function>:<trigger>:0 > set_ftrace_filter\n" +	"  set_ftrace_notrace\t- echo function name in here to never trace.\n" +	"            accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" +	"            modules: Can select a group via module command :mod:\n" +	"            Does not accept triggers\n" +#endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_TRACER +	"  set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n" +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	"  set_graph_function\t- Trace the nested calls of a function (function_graph)\n" +	"  max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT +	"\n  snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" +	"\t\t\t  Read the contents for more information\n" +#endif +#ifdef CONFIG_STACKTRACE +	"  stack_trace\t\t- Shows the max stack trace when active\n" +	"  stack_max_size\t- Shows current max stack size that was traced\n" +	"\t\t\t  Write into this file to reset the max size (trigger a new trace)\n" +#ifdef CONFIG_DYNAMIC_FTRACE +	"  stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" +#endif +#endif /* CONFIG_STACKTRACE */  ;  static ssize_t @@ -3032,11 +3452,12 @@ static ssize_t  tracing_set_trace_read(struct file *filp, char __user *ubuf,  		       size_t cnt, loff_t *ppos)  { +	struct trace_array *tr = filp->private_data;  	char buf[MAX_TRACER_SIZE+2];  	int r;  	mutex_lock(&trace_types_lock); -	r = sprintf(buf, "%s\n", current_trace->name); +	r = sprintf(buf, "%s\n", tr->current_trace->name);  	mutex_unlock(&trace_types_lock);  	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3044,43 +3465,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,  int tracer_init(struct tracer *t, struct trace_array *tr)  { -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	return t->init(tr);  } -static void set_buffer_entries(struct trace_array *tr, unsigned long val) +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)  {  	int cpu; +  	for_each_tracing_cpu(cpu) -		tr->data[cpu]->entries = val; +		per_cpu_ptr(buf->data, cpu)->entries = val;  } +#ifdef CONFIG_TRACER_MAX_TRACE  /* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct trace_array *tr, -					struct trace_array *size_tr, int cpu_id) +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, +					struct trace_buffer *size_buf, int cpu_id)  {  	int cpu, ret = 0;  	if (cpu_id == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) { -			ret = ring_buffer_resize(tr->buffer, -					size_tr->data[cpu]->entries, cpu); +			ret = ring_buffer_resize(trace_buf->buffer, +				 per_cpu_ptr(size_buf->data, cpu)->entries, cpu);  			if (ret < 0)  				break; -			tr->data[cpu]->entries = 
size_tr->data[cpu]->entries; +			per_cpu_ptr(trace_buf->data, cpu)->entries = +				per_cpu_ptr(size_buf->data, cpu)->entries;  		}  	} else { -		ret = ring_buffer_resize(tr->buffer, -					size_tr->data[cpu_id]->entries, cpu_id); +		ret = ring_buffer_resize(trace_buf->buffer, +				 per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);  		if (ret == 0) -			tr->data[cpu_id]->entries = -				size_tr->data[cpu_id]->entries; +			per_cpu_ptr(trace_buf->data, cpu_id)->entries = +				per_cpu_ptr(size_buf->data, cpu_id)->entries;  	}  	return ret;  } +#endif /* CONFIG_TRACER_MAX_TRACE */ -static int __tracing_resize_ring_buffer(unsigned long size, int cpu) +static int __tracing_resize_ring_buffer(struct trace_array *tr, +					unsigned long size, int cpu)  {  	int ret; @@ -3089,23 +3515,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)  	 * we use the size that was given, and we can forget about  	 * expanding it later.  	 */ -	ring_buffer_expanded = 1; +	ring_buffer_expanded = true;  	/* May be called before buffers are initialized */ -	if (!global_trace.buffer) +	if (!tr->trace_buffer.buffer)  		return 0; -	ret = ring_buffer_resize(global_trace.buffer, size, cpu); +	ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);  	if (ret < 0)  		return ret; -	if (!current_trace->use_max_tr) +#ifdef CONFIG_TRACER_MAX_TRACE +	if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || +	    !tr->current_trace->use_max_tr)  		goto out; -	ret = ring_buffer_resize(max_tr.buffer, size, cpu); +	ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);  	if (ret < 0) { -		int r = resize_buffer_duplicate_size(&global_trace, -						     &global_trace, cpu); +		int r = resize_buffer_duplicate_size(&tr->trace_buffer, +						     &tr->trace_buffer, cpu);  		if (r < 0) {  			/*  			 * AARGH! 
We are left with different @@ -3128,20 +3556,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)  	}  	if (cpu == RING_BUFFER_ALL_CPUS) -		set_buffer_entries(&max_tr, size); +		set_buffer_entries(&tr->max_buffer, size);  	else -		max_tr.data[cpu]->entries = size; +		per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;   out: +#endif /* CONFIG_TRACER_MAX_TRACE */ +  	if (cpu == RING_BUFFER_ALL_CPUS) -		set_buffer_entries(&global_trace, size); +		set_buffer_entries(&tr->trace_buffer, size);  	else -		global_trace.data[cpu]->entries = size; +		per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;  	return ret;  } -static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) +static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, +					  unsigned long size, int cpu_id)  {  	int ret = size; @@ -3155,7 +3586,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)  		}  	} -	ret = __tracing_resize_ring_buffer(size, cpu_id); +	ret = __tracing_resize_ring_buffer(tr, size, cpu_id);  	if (ret < 0)  		ret = -ENOMEM; @@ -3182,7 +3613,7 @@ int tracing_update_buffers(void)  	mutex_lock(&trace_types_lock);  	if (!ring_buffer_expanded) -		ret = __tracing_resize_ring_buffer(trace_buf_size, +		ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,  						RING_BUFFER_ALL_CPUS);  	mutex_unlock(&trace_types_lock); @@ -3192,7 +3623,7 @@ int tracing_update_buffers(void)  struct trace_option_dentry;  static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer); +create_trace_option_files(struct trace_array *tr, struct tracer *tracer);  static void  destroy_trace_option_files(struct trace_option_dentry *topts); @@ -3202,13 +3633,15 @@ static int tracing_set_tracer(const char *buf)  	static struct trace_option_dentry *topts;  	struct trace_array *tr = &global_trace;  	struct tracer *t; +#ifdef CONFIG_TRACER_MAX_TRACE  	bool had_max_tr; +#endif  	int ret = 0;  	mutex_lock(&trace_types_lock);  	if (!ring_buffer_expanded) { -		ret = __tracing_resize_ring_buffer(trace_buf_size, +		ret = __tracing_resize_ring_buffer(tr, trace_buf_size,  						RING_BUFFER_ALL_CPUS);  		if (ret < 0)  			goto out; @@ -3223,15 +3656,21 @@ static int tracing_set_tracer(const char *buf)  		ret = -EINVAL;  		goto out;  	} -	if (t == current_trace) +	if (t == tr->current_trace)  		goto out;  	trace_branch_disable(); -	if (current_trace->reset) -		current_trace->reset(tr); -	had_max_tr = current_trace->allocated_snapshot; -	current_trace = &nop_trace; +	tr->current_trace->enabled = false; + +	if (tr->current_trace->reset) +		tr->current_trace->reset(tr); + +	/* Current trace needs to be nop_trace before synchronize_sched */ +	tr->current_trace = &nop_trace; + +#ifdef CONFIG_TRACER_MAX_TRACE +	had_max_tr = tr->allocated_snapshot;  	if (had_max_tr && !t->use_max_tr) {  		/* @@ -3242,27 +3681,20 @@ static int tracing_set_tracer(const char *buf)  		 * so a synchronized_sched() is sufficient.  		 */  		synchronize_sched(); -		/* -		 * We don't free the ring buffer. instead, resize it because -		 * The max_tr ring buffer has some state (e.g. ring->clock) and -		 * we want preserve it. 
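Also visible above: set_tracer_flag() now consults a per-tracer flag_changed callback before applying an option, and trace_keep_overwrite() is the stock helper for latency tracers that cannot run with overwrite cleared. A sketch of the wiring, with the rest of the tracer definition elided as hypothetical:

static struct tracer demo_latency_tracer __read_mostly = {
	.name		= "demo_latency",
	/* .init, .reset, etc. elided in this sketch */
	.use_max_tr	= true,
	/* reject 'echo nooverwrite > trace_options' while this tracer is active */
	.flag_changed	= trace_keep_overwrite,
};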
-		 */ -		ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); -		set_buffer_entries(&max_tr, 1); -		tracing_reset_online_cpus(&max_tr); -		current_trace->allocated_snapshot = false; +		free_snapshot(tr);  	} +#endif  	destroy_trace_option_files(topts); -	topts = create_trace_option_files(t); +	topts = create_trace_option_files(tr, t); + +#ifdef CONFIG_TRACER_MAX_TRACE  	if (t->use_max_tr && !had_max_tr) { -		/* we need to make per cpu buffer sizes equivalent */ -		ret = resize_buffer_duplicate_size(&max_tr, &global_trace, -						   RING_BUFFER_ALL_CPUS); +		ret = alloc_snapshot(tr);  		if (ret < 0)  			goto out; -		t->allocated_snapshot = true;  	} +#endif  	if (t->init) {  		ret = tracer_init(t, tr); @@ -3270,7 +3702,8 @@ static int tracing_set_tracer(const char *buf)  			goto out;  	} -	current_trace = t; +	tr->current_trace = t; +	tr->current_trace->enabled = true;  	trace_branch_enable(tr);   out:  	mutex_unlock(&trace_types_lock); @@ -3344,7 +3777,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,  static int tracing_open_pipe(struct inode *inode, struct file *filp)  { -	long cpu_file = (long) inode->i_private; +	struct trace_cpu *tc = inode->i_private; +	struct trace_array *tr = tc->tr;  	struct trace_iterator *iter;  	int ret = 0; @@ -3369,7 +3803,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)  		ret = -ENOMEM;  		goto fail;  	} -	*iter->trace = *current_trace; +	*iter->trace = *tr->current_trace;  	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {  		ret = -ENOMEM; @@ -3386,8 +3820,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)  	if (trace_clocks[trace_clock_id].in_ns)  		iter->iter_flags |= TRACE_FILE_TIME_IN_NS; -	iter->cpu_file = cpu_file; -	iter->tr = &global_trace; +	iter->cpu_file = tc->cpu; +	iter->tr = tc->tr; +	iter->trace_buffer = &tc->tr->trace_buffer;  	mutex_init(&iter->mutex);  	filp->private_data = iter; @@ -3426,24 +3861,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)  }  static unsigned int -tracing_poll_pipe(struct file *filp, poll_table *poll_table) +trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)  { -	struct trace_iterator *iter = filp->private_data; +	/* Iterators are static, they should be filled or empty */ +	if (trace_buffer_iter(iter, iter->cpu_file)) +		return POLLIN | POLLRDNORM; -	if (trace_flags & TRACE_ITER_BLOCK) { +	if (trace_flags & TRACE_ITER_BLOCK)  		/*  		 * Always select as readable when in blocking mode  		 */  		return POLLIN | POLLRDNORM; -	} else { -		if (!trace_empty(iter)) -			return POLLIN | POLLRDNORM; -		poll_wait(filp, &trace_wait, poll_table); -		if (!trace_empty(iter)) -			return POLLIN | POLLRDNORM; +	else +		return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, +					     filp, poll_table); +} -		return 0; -	} +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ +	struct trace_iterator *iter = filp->private_data; + +	return trace_poll(iter, filp, poll_table);  }  /* @@ -3509,6 +3948,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  		  size_t cnt, loff_t *ppos)  {  	struct trace_iterator *iter = filp->private_data; +	struct trace_array *tr = iter->tr;  	ssize_t sret;  	/* return any leftover data */ @@ -3520,8 +3960,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  	/* copy the tracer to avoid using a global lock all around */  	mutex_lock(&trace_types_lock); -	if (unlikely(iter->trace->name != 
current_trace->name)) -		*iter->trace = *current_trace; +	if (unlikely(iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock);  	/* @@ -3677,6 +4117,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  		.ops		= &tracing_pipe_buf_ops,  		.spd_release	= tracing_spd_release_pipe,  	}; +	struct trace_array *tr = iter->tr;  	ssize_t ret;  	size_t rem;  	unsigned int i; @@ -3686,8 +4127,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  	/* copy the tracer to avoid using a global lock all around */  	mutex_lock(&trace_types_lock); -	if (unlikely(iter->trace->name != current_trace->name)) -		*iter->trace = *current_trace; +	if (unlikely(iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock);  	mutex_lock(&iter->mutex); @@ -3749,43 +4190,19 @@ out_err:  	goto out;  } -struct ftrace_entries_info { -	struct trace_array	*tr; -	int			cpu; -}; - -static int tracing_entries_open(struct inode *inode, struct file *filp) -{ -	struct ftrace_entries_info *info; - -	if (tracing_disabled) -		return -ENODEV; - -	info = kzalloc(sizeof(*info), GFP_KERNEL); -	if (!info) -		return -ENOMEM; - -	info->tr = &global_trace; -	info->cpu = (unsigned long)inode->i_private; - -	filp->private_data = info; - -	return 0; -} -  static ssize_t  tracing_entries_read(struct file *filp, char __user *ubuf,  		     size_t cnt, loff_t *ppos)  { -	struct ftrace_entries_info *info = filp->private_data; -	struct trace_array *tr = info->tr; +	struct trace_cpu *tc = filp->private_data; +	struct trace_array *tr = tc->tr;  	char buf[64];  	int r = 0;  	ssize_t ret;  	mutex_lock(&trace_types_lock); -	if (info->cpu == RING_BUFFER_ALL_CPUS) { +	if (tc->cpu == RING_BUFFER_ALL_CPUS) {  		int cpu, buf_size_same;  		unsigned long size; @@ -3795,8 +4212,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,  		for_each_tracing_cpu(cpu) {  			/* fill in the size from first enabled cpu */  			if (size == 0) -				size = tr->data[cpu]->entries; -			if (size != tr->data[cpu]->entries) { +				size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; +			if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {  				buf_size_same = 0;  				break;  			} @@ -3812,7 +4229,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,  		} else  			r = sprintf(buf, "X\n");  	} else -		r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); +		r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10);  	mutex_unlock(&trace_types_lock); @@ -3824,7 +4241,7 @@ static ssize_t  tracing_entries_write(struct file *filp, const char __user *ubuf,  		      size_t cnt, loff_t *ppos)  { -	struct ftrace_entries_info *info = filp->private_data; +	struct trace_cpu *tc = filp->private_data;  	unsigned long val;  	int ret; @@ -3839,7 +4256,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  	/* value is in KB */  	val <<= 10; -	ret = tracing_resize_ring_buffer(val, info->cpu); +	ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);  	if (ret < 0)  		return ret; @@ -3848,16 +4265,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  	return cnt;  } -static int -tracing_entries_release(struct inode *inode, struct file *filp) -{ -	struct ftrace_entries_info *info = filp->private_data; - -	kfree(info); - -	return 0; -} -  static ssize_t  tracing_total_entries_read(struct file *filp, char __user *ubuf,  				size_t cnt, loff_t *ppos) @@ 
-3869,7 +4276,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,  	mutex_lock(&trace_types_lock);  	for_each_tracing_cpu(cpu) { -		size += tr->data[cpu]->entries >> 10; +		size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;  		if (!ring_buffer_expanded)  			expanded_size += trace_buf_size >> 10;  	} @@ -3899,11 +4306,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf,  static int  tracing_free_buffer_release(struct inode *inode, struct file *filp)  { +	struct trace_array *tr = inode->i_private; +  	/* disable tracing ? */  	if (trace_flags & TRACE_ITER_STOP_ON_FREE)  		tracing_off();  	/* resize the ring buffer to 0 */ -	tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); +	tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);  	return 0;  } @@ -3972,7 +4381,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  	local_save_flags(irq_flags);  	size = sizeof(*entry) + cnt + 2; /* possible \n added */ -	buffer = global_trace.buffer; +	buffer = global_trace.trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,  					  irq_flags, preempt_count());  	if (!event) { @@ -4014,13 +4423,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  static int tracing_clock_show(struct seq_file *m, void *v)  { +	struct trace_array *tr = m->private;  	int i;  	for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)  		seq_printf(m,  			"%s%s%s%s", i ? " " : "", -			i == trace_clock_id ? "[" : "", trace_clocks[i].name, -			i == trace_clock_id ? "]" : ""); +			i == tr->clock_id ? "[" : "", trace_clocks[i].name, +			i == tr->clock_id ? "]" : "");  	seq_putc(m, '\n');  	return 0; @@ -4029,6 +4439,8 @@ static int tracing_clock_show(struct seq_file *m, void *v)  static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,  				   size_t cnt, loff_t *fpos)  { +	struct seq_file *m = filp->private_data; +	struct trace_array *tr = m->private;  	char buf[64];  	const char *clockstr;  	int i; @@ -4050,20 +4462,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,  	if (i == ARRAY_SIZE(trace_clocks))  		return -EINVAL; -	trace_clock_id = i; -  	mutex_lock(&trace_types_lock); -	ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); -	if (max_tr.buffer) -		ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); +	tr->clock_id = i; + +	ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);  	/*  	 * New clock may not be consistent with the previous clock.  	 * Reset the buffer so that it doesn't have incomparable timestamps.  	 
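tracing_mark_write() above implements the trace_marker file that the new README text advertises; userspace can drop annotations into the ring buffer simply by writing to it. A small userspace sketch (the path assumes debugfs is mounted at the usual /sys/kernel/debug):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *msg = "demo: phase 1 complete\n";
	int fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);

	if (fd < 0) {
		perror("open trace_marker");
		return 1;
	}
	/* Appears in the trace output as a print entry from this task. */
	if (write(fd, msg, strlen(msg)) < 0)
		perror("write trace_marker");
	close(fd);
	return 0;
}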
*/ -	tracing_reset_online_cpus(&global_trace); -	tracing_reset_online_cpus(&max_tr); +	tracing_reset_online_cpus(&global_trace.trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) +		ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); +	tracing_reset_online_cpus(&global_trace.max_buffer); +#endif  	mutex_unlock(&trace_types_lock); @@ -4076,20 +4491,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file)  {  	if (tracing_disabled)  		return -ENODEV; -	return single_open(file, tracing_clock_show, NULL); + +	return single_open(file, tracing_clock_show, inode->i_private);  } +struct ftrace_buffer_info { +	struct trace_iterator	iter; +	void			*spare; +	unsigned int		read; +}; +  #ifdef CONFIG_TRACER_SNAPSHOT  static int tracing_snapshot_open(struct inode *inode, struct file *file)  { +	struct trace_cpu *tc = inode->i_private;  	struct trace_iterator *iter; +	struct seq_file *m;  	int ret = 0;  	if (file->f_mode & FMODE_READ) {  		iter = __tracing_open(inode, file, true);  		if (IS_ERR(iter))  			ret = PTR_ERR(iter); +	} else { +		/* Writes still need the seq_file to hold the private data */ +		m = kzalloc(sizeof(*m), GFP_KERNEL); +		if (!m) +			return -ENOMEM; +		iter = kzalloc(sizeof(*iter), GFP_KERNEL); +		if (!iter) { +			kfree(m); +			return -ENOMEM; +		} +		iter->tr = tc->tr; +		iter->trace_buffer = &tc->tr->max_buffer; +		iter->cpu_file = tc->cpu; +		m->private = iter; +		file->private_data = m;  	} +  	return ret;  } @@ -4097,6 +4537,9 @@ static ssize_t  tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,  		       loff_t *ppos)  { +	struct seq_file *m = filp->private_data; +	struct trace_iterator *iter = m->private; +	struct trace_array *tr = iter->tr;  	unsigned long val;  	int ret; @@ -4110,42 +4553,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,  	mutex_lock(&trace_types_lock); -	if (current_trace->use_max_tr) { +	if (tr->current_trace->use_max_tr) {  		ret = -EBUSY;  		goto out;  	}  	switch (val) {  	case 0: -		if (current_trace->allocated_snapshot) { -			/* free spare buffer */ -			ring_buffer_resize(max_tr.buffer, 1, -					   RING_BUFFER_ALL_CPUS); -			set_buffer_entries(&max_tr, 1); -			tracing_reset_online_cpus(&max_tr); -			current_trace->allocated_snapshot = false; +		if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { +			ret = -EINVAL; +			break;  		} +		if (tr->allocated_snapshot) +			free_snapshot(tr);  		break;  	case 1: -		if (!current_trace->allocated_snapshot) { -			/* allocate spare buffer */ -			ret = resize_buffer_duplicate_size(&max_tr, -					&global_trace, RING_BUFFER_ALL_CPUS); +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP +		if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { +			ret = -EINVAL; +			break; +		} +#endif +		if (!tr->allocated_snapshot) { +			ret = alloc_snapshot(tr);  			if (ret < 0)  				break; -			current_trace->allocated_snapshot = true;  		} -  		local_irq_disable();  		/* Now, we're going to swap */ -		update_max_tr(&global_trace, current, smp_processor_id()); +		if (iter->cpu_file == RING_BUFFER_ALL_CPUS) +			update_max_tr(tr, current, smp_processor_id()); +		else +			update_max_tr_single(tr, current, iter->cpu_file);  		local_irq_enable();  		break;  	default: -		if (current_trace->allocated_snapshot) -			tracing_reset_online_cpus(&max_tr); -		else -			ret = -EINVAL; +		if (tr->allocated_snapshot) { +			if (iter->cpu_file == 
RING_BUFFER_ALL_CPUS) +				tracing_reset_online_cpus(&tr->max_buffer); +			else +				tracing_reset(&tr->max_buffer, iter->cpu_file); +		}  		break;  	} @@ -4157,6 +4606,51 @@ out:  	mutex_unlock(&trace_types_lock);  	return ret;  } + +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ +	struct seq_file *m = file->private_data; + +	if (file->f_mode & FMODE_READ) +		return tracing_release(inode, file); + +	/* If write only, the seq_file is just a stub */ +	if (m) +		kfree(m->private); +	kfree(m); + +	return 0; +} + +static int tracing_buffers_open(struct inode *inode, struct file *filp); +static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, +				    size_t count, loff_t *ppos); +static int tracing_buffers_release(struct inode *inode, struct file *file); +static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, +		   struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +static int snapshot_raw_open(struct inode *inode, struct file *filp) +{ +	struct ftrace_buffer_info *info; +	int ret; + +	ret = tracing_buffers_open(inode, filp); +	if (ret < 0) +		return ret; + +	info = filp->private_data; + +	if (info->iter.trace->use_max_tr) { +		tracing_buffers_release(inode, filp); +		return -EBUSY; +	} + +	info->iter.snapshot = true; +	info->iter.trace_buffer = &info->iter.tr->max_buffer; + +	return ret; +} +  #endif /* CONFIG_TRACER_SNAPSHOT */ @@ -4184,10 +4678,9 @@ static const struct file_operations tracing_pipe_fops = {  };  static const struct file_operations tracing_entries_fops = { -	.open		= tracing_entries_open, +	.open		= tracing_open_generic,  	.read		= tracing_entries_read,  	.write		= tracing_entries_write, -	.release	= tracing_entries_release,  	.llseek		= generic_file_llseek,  }; @@ -4222,20 +4715,23 @@ static const struct file_operations snapshot_fops = {  	.read		= seq_read,  	.write		= tracing_snapshot_write,  	.llseek		= tracing_seek, -	.release	= tracing_release, +	.release	= tracing_snapshot_release,  }; -#endif /* CONFIG_TRACER_SNAPSHOT */ -struct ftrace_buffer_info { -	struct trace_array	*tr; -	void			*spare; -	int			cpu; -	unsigned int		read; +static const struct file_operations snapshot_raw_fops = { +	.open		= snapshot_raw_open, +	.read		= tracing_buffers_read, +	.release	= tracing_buffers_release, +	.splice_read	= tracing_buffers_splice_read, +	.llseek		= no_llseek,  }; +#endif /* CONFIG_TRACER_SNAPSHOT */ +  static int tracing_buffers_open(struct inode *inode, struct file *filp)  { -	int cpu = (int)(long)inode->i_private; +	struct trace_cpu *tc = inode->i_private; +	struct trace_array *tr = tc->tr;  	struct ftrace_buffer_info *info;  	if (tracing_disabled) @@ -4245,72 +4741,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)  	if (!info)  		return -ENOMEM; -	info->tr	= &global_trace; -	info->cpu	= cpu; -	info->spare	= NULL; +	mutex_lock(&trace_types_lock); + +	tr->ref++; + +	info->iter.tr		= tr; +	info->iter.cpu_file	= tc->cpu; +	info->iter.trace	= tr->current_trace; +	info->iter.trace_buffer = &tr->trace_buffer; +	info->spare		= NULL;  	/* Force reading ring buffer for first read */ -	info->read	= (unsigned int)-1; +	info->read		= (unsigned int)-1;  	filp->private_data = info; +	mutex_unlock(&trace_types_lock); +  	return nonseekable_open(inode, filp);  } +static unsigned int +tracing_buffers_poll(struct file *filp, poll_table *poll_table) +{ +	struct ftrace_buffer_info *info = filp->private_data; +	struct trace_iterator *iter = &info->iter; + +	return trace_poll(iter, 
filp, poll_table); +} +  static ssize_t  tracing_buffers_read(struct file *filp, char __user *ubuf,  		     size_t count, loff_t *ppos)  {  	struct ftrace_buffer_info *info = filp->private_data; +	struct trace_iterator *iter = &info->iter;  	ssize_t ret; -	size_t size; +	ssize_t size;  	if (!count)  		return 0; +	mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (iter->snapshot && iter->tr->current_trace->use_max_tr) { +		size = -EBUSY; +		goto out_unlock; +	} +#endif +  	if (!info->spare) -		info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); +		info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, +							  iter->cpu_file); +	size = -ENOMEM;  	if (!info->spare) -		return -ENOMEM; +		goto out_unlock;  	/* Do we have previous read data to read? */  	if (info->read < PAGE_SIZE)  		goto read; -	trace_access_lock(info->cpu); -	ret = ring_buffer_read_page(info->tr->buffer, + again: +	trace_access_lock(iter->cpu_file); +	ret = ring_buffer_read_page(iter->trace_buffer->buffer,  				    &info->spare,  				    count, -				    info->cpu, 0); -	trace_access_unlock(info->cpu); -	if (ret < 0) -		return 0; +				    iter->cpu_file, 0); +	trace_access_unlock(iter->cpu_file); -	info->read = 0; +	if (ret < 0) { +		if (trace_empty(iter)) { +			if ((filp->f_flags & O_NONBLOCK)) { +				size = -EAGAIN; +				goto out_unlock; +			} +			mutex_unlock(&trace_types_lock); +			iter->trace->wait_pipe(iter); +			mutex_lock(&trace_types_lock); +			if (signal_pending(current)) { +				size = -EINTR; +				goto out_unlock; +			} +			goto again; +		} +		size = 0; +		goto out_unlock; +	} -read: +	info->read = 0; + read:  	size = PAGE_SIZE - info->read;  	if (size > count)  		size = count;  	ret = copy_to_user(ubuf, info->spare + info->read, size); -	if (ret == size) -		return -EFAULT; +	if (ret == size) { +		size = -EFAULT; +		goto out_unlock; +	}  	size -= ret;  	*ppos += size;  	info->read += size; + out_unlock: +	mutex_unlock(&trace_types_lock); +  	return size;  }  static int tracing_buffers_release(struct inode *inode, struct file *file)  {  	struct ftrace_buffer_info *info = file->private_data; +	struct trace_iterator *iter = &info->iter; + +	mutex_lock(&trace_types_lock); + +	WARN_ON(!iter->tr->ref); +	iter->tr->ref--;  	if (info->spare) -		ring_buffer_free_read_page(info->tr->buffer, info->spare); +		ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);  	kfree(info); +	mutex_unlock(&trace_types_lock); +  	return 0;  } @@ -4375,6 +4930,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  			    unsigned int flags)  {  	struct ftrace_buffer_info *info = file->private_data; +	struct trace_iterator *iter = &info->iter;  	struct partial_page partial_def[PIPE_DEF_BUFFERS];  	struct page *pages_def[PIPE_DEF_BUFFERS];  	struct splice_pipe_desc spd = { @@ -4387,10 +4943,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  	};  	struct buffer_ref *ref;  	int entries, size, i; -	size_t ret; +	ssize_t ret; -	if (splice_grow_spd(pipe, &spd)) -		return -ENOMEM; +	mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (iter->snapshot && iter->tr->current_trace->use_max_tr) { +		ret = -EBUSY; +		goto out; +	} +#endif + +	if (splice_grow_spd(pipe, &spd)) { +		ret = -ENOMEM; +		goto out; +	}  	if (*ppos & (PAGE_SIZE - 1)) {  		ret = -EINVAL; @@ -4405,8 +4972,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		len &= PAGE_MASK;  	} -	trace_access_lock(info->cpu); -	entries = 
ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + again: +	trace_access_lock(iter->cpu_file); +	entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);  	for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {  		struct page *page; @@ -4417,15 +4985,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  			break;  		ref->ref = 1; -		ref->buffer = info->tr->buffer; -		ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); +		ref->buffer = iter->trace_buffer->buffer; +		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);  		if (!ref->page) {  			kfree(ref);  			break;  		}  		r = ring_buffer_read_page(ref->buffer, &ref->page, -					  len, info->cpu, 1); +					  len, iter->cpu_file, 1);  		if (r < 0) {  			ring_buffer_free_read_page(ref->buffer, ref->page);  			kfree(ref); @@ -4449,31 +5017,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		spd.nr_pages++;  		*ppos += PAGE_SIZE; -		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); +		entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);  	} -	trace_access_unlock(info->cpu); +	trace_access_unlock(iter->cpu_file);  	spd.nr_pages = i;  	/* did we read anything? */  	if (!spd.nr_pages) { -		if (flags & SPLICE_F_NONBLOCK) +		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {  			ret = -EAGAIN; -		else -			ret = 0; -		/* TODO: block */ -		goto out; +			goto out; +		} +		mutex_unlock(&trace_types_lock); +		iter->trace->wait_pipe(iter); +		mutex_lock(&trace_types_lock); +		if (signal_pending(current)) { +			ret = -EINTR; +			goto out; +		} +		goto again;  	}  	ret = splice_to_pipe(pipe, &spd);  	splice_shrink_spd(&spd);  out: +	mutex_unlock(&trace_types_lock); +  	return ret;  }  static const struct file_operations tracing_buffers_fops = {  	.open		= tracing_buffers_open,  	.read		= tracing_buffers_read, +	.poll		= tracing_buffers_poll,  	.release	= tracing_buffers_release,  	.splice_read	= tracing_buffers_splice_read,  	.llseek		= no_llseek, @@ -4483,12 +5060,14 @@ static ssize_t  tracing_stats_read(struct file *filp, char __user *ubuf,  		   size_t count, loff_t *ppos)  { -	unsigned long cpu = (unsigned long)filp->private_data; -	struct trace_array *tr = &global_trace; +	struct trace_cpu *tc = filp->private_data; +	struct trace_array *tr = tc->tr; +	struct trace_buffer *trace_buf = &tr->trace_buffer;  	struct trace_seq *s;  	unsigned long cnt;  	unsigned long long t;  	unsigned long usec_rem; +	int cpu = tc->cpu;  	s = kmalloc(sizeof(*s), GFP_KERNEL);  	if (!s) @@ -4496,41 +5075,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf,  	trace_seq_init(s); -	cnt = ring_buffer_entries_cpu(tr->buffer, cpu); +	cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "entries: %ld\n", cnt); -	cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); +	cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "overrun: %ld\n", cnt); -	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); +	cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "commit overrun: %ld\n", cnt); -	cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); +	cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "bytes: %ld\n", cnt);  	if (trace_clocks[trace_clock_id].in_ns) {  		/* local or global for trace_clock */ -		t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); +		t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); 
 		usec_rem = do_div(t, USEC_PER_SEC);  		trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",  								t, usec_rem); -		t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); +		t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));  		usec_rem = do_div(t, USEC_PER_SEC);  		trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);  	} else {  		/* counter or tsc mode for trace_clock */  		trace_seq_printf(s, "oldest event ts: %llu\n", -				ring_buffer_oldest_event_ts(tr->buffer, cpu)); +				ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));  		trace_seq_printf(s, "now ts: %llu\n", -				ring_buffer_time_stamp(tr->buffer, cpu)); +				ring_buffer_time_stamp(trace_buf->buffer, cpu));  	} -	cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); +	cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "dropped events: %ld\n", cnt); -	cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); +	cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "read events: %ld\n", cnt);  	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); @@ -4582,60 +5161,161 @@ static const struct file_operations tracing_dyn_info_fops = {  	.read		= tracing_read_dyn_info,  	.llseek		= generic_file_llseek,  }; -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ -static struct dentry *d_tracer; +#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ +	tracing_snapshot(); +} -struct dentry *tracing_init_dentry(void) +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ +	unsigned long *count = (long *)data; + +	if (!*count) +		return; + +	if (*count != -1) +		(*count)--; + +	tracing_snapshot(); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, +		      struct ftrace_probe_ops *ops, void *data) +{ +	long count = (long)data; + +	seq_printf(m, "%ps:", (void *)ip); + +	seq_printf(m, "snapshot"); + +	if (count == -1) +		seq_printf(m, ":unlimited\n"); +	else +		seq_printf(m, ":count=%ld\n", count); + +	return 0; +} + +static struct ftrace_probe_ops snapshot_probe_ops = { +	.func			= ftrace_snapshot, +	.print			= ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { +	.func			= ftrace_count_snapshot, +	.print			= ftrace_snapshot_print, +}; + +static int +ftrace_trace_snapshot_callback(struct ftrace_hash *hash, +			       char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; +	void *count = (void *)-1; +	char *number; +	int ret; + +	/* hash funcs only work with set_ftrace_filter */ +	if (!enable) +		return -EINVAL; + +	ops = param ? &snapshot_count_probe_ops :  &snapshot_probe_ops; + +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		return 0; +	} + +	if (!param) +		goto out_reg; + +	number = strsep(¶m, ":"); + +	if (!strlen(number)) +		goto out_reg; + +	/* +	 * We use the callback data field (which is a pointer) +	 * as our counter. +	 */ +	ret = kstrtoul(number, 0, (unsigned long *)&count); +	if (ret) +		return ret; + + out_reg: +	ret = register_ftrace_function_probe(glob, ops, count); + +	if (ret >= 0) +		alloc_snapshot(&global_trace); + +	return ret < 0 ? 
ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { +	.name			= "snapshot", +	.func			= ftrace_trace_snapshot_callback, +}; + +static int register_snapshot_cmd(void)  { -	static int once; +	return register_ftrace_command(&ftrace_snapshot_cmd); +} +#else +static inline int register_snapshot_cmd(void) { return 0; } +#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ -	if (d_tracer) -		return d_tracer; +struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +{ +	if (tr->dir) +		return tr->dir;  	if (!debugfs_initialized())  		return NULL; -	d_tracer = debugfs_create_dir("tracing", NULL); +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		tr->dir = debugfs_create_dir("tracing", NULL); -	if (!d_tracer && !once) { -		once = 1; -		pr_warning("Could not create debugfs directory 'tracing'\n"); -		return NULL; -	} +	if (!tr->dir) +		pr_warn_once("Could not create debugfs directory 'tracing'\n"); -	return d_tracer; +	return tr->dir;  } -static struct dentry *d_percpu; +struct dentry *tracing_init_dentry(void) +{ +	return tracing_init_dentry_tr(&global_trace); +} -static struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)  { -	static int once;  	struct dentry *d_tracer; -	if (d_percpu) -		return d_percpu; - -	d_tracer = tracing_init_dentry(); +	if (tr->percpu_dir) +		return tr->percpu_dir; +	d_tracer = tracing_init_dentry_tr(tr);  	if (!d_tracer)  		return NULL; -	d_percpu = debugfs_create_dir("per_cpu", d_tracer); +	tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); -	if (!d_percpu && !once) { -		once = 1; -		pr_warning("Could not create debugfs directory 'per_cpu'\n"); -		return NULL; -	} +	WARN_ONCE(!tr->percpu_dir, +		  "Could not create debugfs directory 'per_cpu/%d'\n", cpu); -	return d_percpu; +	return tr->percpu_dir;  } -static void tracing_init_debugfs_percpu(long cpu) +static void +tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)  { -	struct dentry *d_percpu = tracing_dentry_percpu(); +	struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); +	struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);  	struct dentry *d_cpu;  	char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -4651,20 +5331,28 @@ static void tracing_init_debugfs_percpu(long cpu)  	/* per cpu trace_pipe */  	trace_create_file("trace_pipe", 0444, d_cpu, -			(void *) cpu, &tracing_pipe_fops); +			(void *)&data->trace_cpu, &tracing_pipe_fops);  	/* per cpu trace */  	trace_create_file("trace", 0644, d_cpu, -			(void *) cpu, &tracing_fops); +			(void *)&data->trace_cpu, &tracing_fops);  	trace_create_file("trace_pipe_raw", 0444, d_cpu, -			(void *) cpu, &tracing_buffers_fops); +			(void *)&data->trace_cpu, &tracing_buffers_fops);  	trace_create_file("stats", 0444, d_cpu, -			(void *) cpu, &tracing_stats_fops); +			(void *)&data->trace_cpu, &tracing_stats_fops);  	trace_create_file("buffer_size_kb", 0444, d_cpu, -			(void *) cpu, &tracing_entries_fops); +			(void *)&data->trace_cpu, &tracing_entries_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT +	trace_create_file("snapshot", 0644, d_cpu, +			  (void *)&data->trace_cpu, &snapshot_fops); + +	trace_create_file("snapshot_raw", 0444, d_cpu, +			(void *)&data->trace_cpu, &snapshot_raw_fops); +#endif  }  #ifdef CONFIG_FTRACE_SELFTEST @@ -4675,6 +5363,7 @@ static void tracing_init_debugfs_percpu(long cpu)  struct trace_option_dentry {  	struct tracer_opt		*opt;  	struct tracer_flags		*flags; +	struct trace_array		*tr;  	
struct dentry			*entry;  }; @@ -4710,7 +5399,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,  	if (!!(topt->flags->val & topt->opt->bit) != val) {  		mutex_lock(&trace_types_lock); -		ret = __set_tracer_option(current_trace, topt->flags, +		ret = __set_tracer_option(topt->tr->current_trace, topt->flags,  					  topt->opt, !val);  		mutex_unlock(&trace_types_lock);  		if (ret) @@ -4749,6 +5438,7 @@ static ssize_t  trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,  			 loff_t *ppos)  { +	struct trace_array *tr = &global_trace;  	long index = (long)filp->private_data;  	unsigned long val;  	int ret; @@ -4759,7 +5449,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,  	if (val != 0 && val != 1)  		return -EINVAL; -	set_tracer_flags(1 << index, val); + +	mutex_lock(&trace_types_lock); +	ret = set_tracer_flag(tr, 1 << index, val); +	mutex_unlock(&trace_types_lock); + +	if (ret < 0) +		return ret;  	*ppos += cnt; @@ -4789,40 +5485,41 @@ struct dentry *trace_create_file(const char *name,  } -static struct dentry *trace_options_init_dentry(void) +static struct dentry *trace_options_init_dentry(struct trace_array *tr)  {  	struct dentry *d_tracer; -	static struct dentry *t_options; -	if (t_options) -		return t_options; +	if (tr->options) +		return tr->options; -	d_tracer = tracing_init_dentry(); +	d_tracer = tracing_init_dentry_tr(tr);  	if (!d_tracer)  		return NULL; -	t_options = debugfs_create_dir("options", d_tracer); -	if (!t_options) { +	tr->options = debugfs_create_dir("options", d_tracer); +	if (!tr->options) {  		pr_warning("Could not create debugfs directory 'options'\n");  		return NULL;  	} -	return t_options; +	return tr->options;  }  static void -create_trace_option_file(struct trace_option_dentry *topt, +create_trace_option_file(struct trace_array *tr, +			 struct trace_option_dentry *topt,  			 struct tracer_flags *flags,  			 struct tracer_opt *opt)  {  	struct dentry *t_options; -	t_options = trace_options_init_dentry(); +	t_options = trace_options_init_dentry(tr);  	if (!t_options)  		return;  	topt->flags = flags;  	topt->opt = opt; +	topt->tr = tr;  	topt->entry = trace_create_file(opt->name, 0644, t_options, topt,  				    &trace_options_fops); @@ -4830,7 +5527,7 @@ create_trace_option_file(struct trace_option_dentry *topt,  }  static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer) +create_trace_option_files(struct trace_array *tr, struct tracer *tracer)  {  	struct trace_option_dentry *topts;  	struct tracer_flags *flags; @@ -4855,7 +5552,7 @@ create_trace_option_files(struct tracer *tracer)  		return NULL;  	for (cnt = 0; opts[cnt].name; cnt++) -		create_trace_option_file(&topts[cnt], flags, +		create_trace_option_file(tr, &topts[cnt], flags,  					 &opts[cnt]);  	return topts; @@ -4878,11 +5575,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)  }  static struct dentry * -create_trace_option_core_file(const char *option, long index) +create_trace_option_core_file(struct trace_array *tr, +			      const char *option, long index)  {  	struct dentry *t_options; -	t_options = trace_options_init_dentry(); +	t_options = trace_options_init_dentry(tr);  	if (!t_options)  		return NULL; @@ -4890,17 +5588,17 @@ create_trace_option_core_file(const char *option, long index)  				    &trace_options_core_fops);  } -static __init void create_trace_options_dir(void) +static __init void create_trace_options_dir(struct trace_array *tr)  {  	struct 
dentry *t_options;  	int i; -	t_options = trace_options_init_dentry(); +	t_options = trace_options_init_dentry(tr);  	if (!t_options)  		return;  	for (i = 0; trace_options[i]; i++) -		create_trace_option_core_file(trace_options[i], i); +		create_trace_option_core_file(tr, trace_options[i], i);  }  static ssize_t @@ -4908,7 +5606,7 @@ rb_simple_read(struct file *filp, char __user *ubuf,  	       size_t cnt, loff_t *ppos)  {  	struct trace_array *tr = filp->private_data; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	char buf[64];  	int r; @@ -4927,7 +5625,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,  		size_t cnt, loff_t *ppos)  {  	struct trace_array *tr = filp->private_data; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	unsigned long val;  	int ret; @@ -4939,12 +5637,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf,  		mutex_lock(&trace_types_lock);  		if (val) {  			ring_buffer_record_on(buffer); -			if (current_trace->start) -				current_trace->start(tr); +			if (tr->current_trace->start) +				tr->current_trace->start(tr);  		} else {  			ring_buffer_record_off(buffer); -			if (current_trace->stop) -				current_trace->stop(tr); +			if (tr->current_trace->stop) +				tr->current_trace->stop(tr);  		}  		mutex_unlock(&trace_types_lock);  	} @@ -4961,23 +5659,310 @@ static const struct file_operations rb_simple_fops = {  	.llseek		= default_llseek,  }; +struct dentry *trace_instance_dir; + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); + +static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) +{ +	int cpu; + +	for_each_tracing_cpu(cpu) { +		memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); +		per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; +		per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; +	} +} + +static int +allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) +{ +	enum ring_buffer_flags rb_flags; + +	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + +	buf->buffer = ring_buffer_alloc(size, rb_flags); +	if (!buf->buffer) +		return -ENOMEM; + +	buf->data = alloc_percpu(struct trace_array_cpu); +	if (!buf->data) { +		ring_buffer_free(buf->buffer); +		return -ENOMEM; +	} + +	init_trace_buffers(tr, buf); + +	/* Allocate the first page for all buffers */ +	set_buffer_entries(&tr->trace_buffer, +			   ring_buffer_size(tr->trace_buffer.buffer, 0)); + +	return 0; +} + +static int allocate_trace_buffers(struct trace_array *tr, int size) +{ +	int ret; + +	ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); +	if (ret) +		return ret; + +#ifdef CONFIG_TRACER_MAX_TRACE +	ret = allocate_trace_buffer(tr, &tr->max_buffer, +				    allocate_snapshot ? size : 1); +	if (WARN_ON(ret)) { +		ring_buffer_free(tr->trace_buffer.buffer); +		free_percpu(tr->trace_buffer.data); +		return -ENOMEM; +	} +	tr->allocated_snapshot = allocate_snapshot; + +	/* +	 * Only the top level trace array gets its snapshot allocated +	 * from the kernel command line. 
+	 */ +	allocate_snapshot = false; +#endif +	return 0; +} + +static int new_instance_create(const char *name) +{ +	struct trace_array *tr; +	int ret; + +	mutex_lock(&trace_types_lock); + +	ret = -EEXIST; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr->name && strcmp(tr->name, name) == 0) +			goto out_unlock; +	} + +	ret = -ENOMEM; +	tr = kzalloc(sizeof(*tr), GFP_KERNEL); +	if (!tr) +		goto out_unlock; + +	tr->name = kstrdup(name, GFP_KERNEL); +	if (!tr->name) +		goto out_free_tr; + +	raw_spin_lock_init(&tr->start_lock); + +	tr->current_trace = &nop_trace; + +	INIT_LIST_HEAD(&tr->systems); +	INIT_LIST_HEAD(&tr->events); + +	if (allocate_trace_buffers(tr, trace_buf_size) < 0) +		goto out_free_tr; + +	/* Holder for file callbacks */ +	tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; +	tr->trace_cpu.tr = tr; + +	tr->dir = debugfs_create_dir(name, trace_instance_dir); +	if (!tr->dir) +		goto out_free_tr; + +	ret = event_trace_add_tracer(tr->dir, tr); +	if (ret) +		goto out_free_tr; + +	init_tracer_debugfs(tr, tr->dir); + +	list_add(&tr->list, &ftrace_trace_arrays); + +	mutex_unlock(&trace_types_lock); + +	return 0; + + out_free_tr: +	if (tr->trace_buffer.buffer) +		ring_buffer_free(tr->trace_buffer.buffer); +	kfree(tr->name); +	kfree(tr); + + out_unlock: +	mutex_unlock(&trace_types_lock); + +	return ret; + +} + +static int instance_delete(const char *name) +{ +	struct trace_array *tr; +	int found = 0; +	int ret; + +	mutex_lock(&trace_types_lock); + +	ret = -ENODEV; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr->name && strcmp(tr->name, name) == 0) { +			found = 1; +			break; +		} +	} +	if (!found) +		goto out_unlock; + +	ret = -EBUSY; +	if (tr->ref) +		goto out_unlock; + +	list_del(&tr->list); + +	event_trace_del_tracer(tr); +	debugfs_remove_recursive(tr->dir); +	free_percpu(tr->trace_buffer.data); +	ring_buffer_free(tr->trace_buffer.buffer); + +	kfree(tr->name); +	kfree(tr); + +	ret = 0; + + out_unlock: +	mutex_unlock(&trace_types_lock); + +	return ret; +} + +static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) +{ +	struct dentry *parent; +	int ret; + +	/* Paranoid: Make sure the parent is the "instances" directory */ +	parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); +	if (WARN_ON_ONCE(parent != trace_instance_dir)) +		return -ENOENT; + +	/* +	 * The inode mutex is locked, but debugfs_create_dir() will also +	 * take the mutex. As the instances directory can not be destroyed +	 * or changed in any other way, it is safe to unlock it, and +	 * let the dentry try. If two users try to make the same dir at +	 * the same time, then the new_instance_create() will determine the +	 * winner. +	 */ +	mutex_unlock(&inode->i_mutex); + +	ret = new_instance_create(dentry->d_iname); + +	mutex_lock(&inode->i_mutex); + +	return ret; +} + +static int instance_rmdir(struct inode *inode, struct dentry *dentry) +{ +	struct dentry *parent; +	int ret; + +	/* Paranoid: Make sure the parent is the "instances" directory */ +	parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); +	if (WARN_ON_ONCE(parent != trace_instance_dir)) +		return -ENOENT; + +	/* The caller did a dget() on dentry */ +	mutex_unlock(&dentry->d_inode->i_mutex); + +	/* +	 * The inode mutex is locked, but debugfs_create_dir() will also +	 * take the mutex. As the instances directory can not be destroyed +	 * or changed in any other way, it is safe to unlock it, and +	 * let the dentry try. 
If two users try to make the same dir at +	 * the same time, then the instance_delete() will determine the +	 * winner. +	 */ +	mutex_unlock(&inode->i_mutex); + +	ret = instance_delete(dentry->d_iname); + +	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); +	mutex_lock(&dentry->d_inode->i_mutex); + +	return ret; +} + +static const struct inode_operations instance_dir_inode_operations = { +	.lookup		= simple_lookup, +	.mkdir		= instance_mkdir, +	.rmdir		= instance_rmdir, +}; + +static __init void create_trace_instances(struct dentry *d_tracer) +{ +	trace_instance_dir = debugfs_create_dir("instances", d_tracer); +	if (WARN_ON(!trace_instance_dir)) +		return; + +	/* Hijack the dir inode operations, to allow mkdir */ +	trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; +} + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +{ +	int cpu; + +	trace_create_file("trace_options", 0644, d_tracer, +			  tr, &tracing_iter_fops); + +	trace_create_file("trace", 0644, d_tracer, +			(void *)&tr->trace_cpu, &tracing_fops); + +	trace_create_file("trace_pipe", 0444, d_tracer, +			(void *)&tr->trace_cpu, &tracing_pipe_fops); + +	trace_create_file("buffer_size_kb", 0644, d_tracer, +			(void *)&tr->trace_cpu, &tracing_entries_fops); + +	trace_create_file("buffer_total_size_kb", 0444, d_tracer, +			  tr, &tracing_total_entries_fops); + +	trace_create_file("free_buffer", 0644, d_tracer, +			  tr, &tracing_free_buffer_fops); + +	trace_create_file("trace_marker", 0220, d_tracer, +			  tr, &tracing_mark_fops); + +	trace_create_file("trace_clock", 0644, d_tracer, tr, +			  &trace_clock_fops); + +	trace_create_file("tracing_on", 0644, d_tracer, +			    tr, &rb_simple_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT +	trace_create_file("snapshot", 0644, d_tracer, +			  (void *)&tr->trace_cpu, &snapshot_fops); +#endif + +	for_each_tracing_cpu(cpu) +		tracing_init_debugfs_percpu(tr, cpu); + +} +  static __init int tracer_init_debugfs(void)  {  	struct dentry *d_tracer; -	int cpu;  	trace_access_lock_init();  	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0; -	trace_create_file("trace_options", 0644, d_tracer, -			NULL, &tracing_iter_fops); +	init_tracer_debugfs(&global_trace, d_tracer);  	trace_create_file("tracing_cpumask", 0644, d_tracer, -			NULL, &tracing_cpumask_fops); - -	trace_create_file("trace", 0644, d_tracer, -			(void *) TRACE_PIPE_ALL_CPU, &tracing_fops); +			&global_trace, &tracing_cpumask_fops);  	trace_create_file("available_tracers", 0444, d_tracer,  			&global_trace, &show_traces_fops); @@ -4996,44 +5981,17 @@ static __init int tracer_init_debugfs(void)  	trace_create_file("README", 0444, d_tracer,  			NULL, &tracing_readme_fops); -	trace_create_file("trace_pipe", 0444, d_tracer, -			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - -	trace_create_file("buffer_size_kb", 0644, d_tracer, -			(void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); - -	trace_create_file("buffer_total_size_kb", 0444, d_tracer, -			&global_trace, &tracing_total_entries_fops); - -	trace_create_file("free_buffer", 0644, d_tracer, -			&global_trace, &tracing_free_buffer_fops); - -	trace_create_file("trace_marker", 0220, d_tracer, -			NULL, &tracing_mark_fops); -  	trace_create_file("saved_cmdlines", 0444, d_tracer,  			NULL, &tracing_saved_cmdlines_fops); -	trace_create_file("trace_clock", 0644, d_tracer, NULL, -			  &trace_clock_fops); - -	trace_create_file("tracing_on", 0644, d_tracer, -			    &global_trace, &rb_simple_fops); -  #ifdef CONFIG_DYNAMIC_FTRACE  	
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,  			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);  #endif -#ifdef CONFIG_TRACER_SNAPSHOT -	trace_create_file("snapshot", 0644, d_tracer, -			  (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); -#endif - -	create_trace_options_dir(); +	create_trace_instances(d_tracer); -	for_each_tracing_cpu(cpu) -		tracing_init_debugfs_percpu(cpu); +	create_trace_options_dir(&global_trace);  	return 0;  } @@ -5089,8 +6047,8 @@ void  trace_printk_seq(struct trace_seq *s)  {  	/* Probably should print a warning here. */ -	if (s->len >= 1000) -		s->len = 1000; +	if (s->len >= TRACE_MAX_PRINT) +		s->len = TRACE_MAX_PRINT;  	/* should be zero ended, but we are paranoid. */  	s->buffer[s->len] = 0; @@ -5103,46 +6061,43 @@ trace_printk_seq(struct trace_seq *s)  void trace_init_global_iter(struct trace_iterator *iter)  {  	iter->tr = &global_trace; -	iter->trace = current_trace; -	iter->cpu_file = TRACE_PIPE_ALL_CPU; +	iter->trace = iter->tr->current_trace; +	iter->cpu_file = RING_BUFFER_ALL_CPUS; +	iter->trace_buffer = &global_trace.trace_buffer;  } -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)  { -	static arch_spinlock_t ftrace_dump_lock = -		(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;  	/* use static because iter can be a bit big for the stack */  	static struct trace_iterator iter; +	static atomic_t dump_running;  	unsigned int old_userobj; -	static int dump_ran;  	unsigned long flags;  	int cnt = 0, cpu; -	/* only one dump */ -	local_irq_save(flags); -	arch_spin_lock(&ftrace_dump_lock); -	if (dump_ran) -		goto out; - -	dump_ran = 1; +	/* Only allow one dump user at a time. */ +	if (atomic_inc_return(&dump_running) != 1) { +		atomic_dec(&dump_running); +		return; +	} +	/* +	 * Always turn off tracing when we dump. +	 * We don't need to show trace output of what happens +	 * between multiple crashes. +	 * +	 * If the user does a sysrq-z, then they can re-enable +	 * tracing with echo 1 > tracing_on. +	 */  	tracing_off(); -	/* Did function tracer already get disabled? */ -	if (ftrace_is_dead()) { -		printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); -		printk("#          MAY BE MISSING FUNCTION EVENTS\n"); -	} - -	if (disable_tracing) -		ftrace_kill(); +	local_irq_save(flags);  	/* Simulate the iterator */  	trace_init_global_iter(&iter);  	for_each_tracing_cpu(cpu) { -		atomic_inc(&iter.tr->data[cpu]->disabled); +		atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);  	}  	old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -5152,7 +6107,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  	switch (oops_dump_mode) {  	case DUMP_ALL: -		iter.cpu_file = TRACE_PIPE_ALL_CPU; +		iter.cpu_file = RING_BUFFER_ALL_CPUS;  		break;  	case DUMP_ORIG:  		iter.cpu_file = raw_smp_processor_id(); @@ -5161,11 +6116,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  		goto out_enable;  	default:  		printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); -		iter.cpu_file = TRACE_PIPE_ALL_CPU; +		iter.cpu_file = RING_BUFFER_ALL_CPUS;  	}  	printk(KERN_TRACE "Dumping ftrace buffer:\n"); +	/* Did function tracer already get disabled? */ +	if (ftrace_is_dead()) { +		printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); +		printk("#          MAY BE MISSING FUNCTION EVENTS\n"); +	} +  	/*  	 * We need to stop all tracing on all CPUS to read the  	 * the next buffer. 
This is a bit expensive, but is @@ -5205,33 +6166,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  		printk(KERN_TRACE "---------------------------------\n");   out_enable: -	/* Re-enable tracing if requested */ -	if (!disable_tracing) { -		trace_flags |= old_userobj; +	trace_flags |= old_userobj; -		for_each_tracing_cpu(cpu) { -			atomic_dec(&iter.tr->data[cpu]->disabled); -		} -		tracing_on(); +	for_each_tracing_cpu(cpu) { +		atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	} - - out: -	arch_spin_unlock(&ftrace_dump_lock); + 	atomic_dec(&dump_running);  	local_irq_restore(flags);  } - -/* By default: disable tracing after the dump */ -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) -{ -	__ftrace_dump(true, oops_dump_mode); -}  EXPORT_SYMBOL_GPL(ftrace_dump);  __init static int tracer_alloc_buffers(void)  {  	int ring_buf_size; -	enum ring_buffer_flags rb_flags; -	int i;  	int ret = -ENOMEM; @@ -5252,49 +6199,27 @@ __init static int tracer_alloc_buffers(void)  	else  		ring_buf_size = 1; -	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; -  	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);  	cpumask_copy(tracing_cpumask, cpu_all_mask); +	raw_spin_lock_init(&global_trace.start_lock); +  	/* TODO: make the number of buffers hot pluggable with CPUS */ -	global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); -	if (!global_trace.buffer) { +	if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {  		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");  		WARN_ON(1);  		goto out_free_cpumask;  	} +  	if (global_trace.buffer_disabled)  		tracing_off(); - -#ifdef CONFIG_TRACER_MAX_TRACE -	max_tr.buffer = ring_buffer_alloc(1, rb_flags); -	if (!max_tr.buffer) { -		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); -		WARN_ON(1); -		ring_buffer_free(global_trace.buffer); -		goto out_free_cpumask; -	} -#endif - -	/* Allocate the first page for all buffers */ -	for_each_tracing_cpu(i) { -		global_trace.data[i] = &per_cpu(global_trace_cpu, i); -		max_tr.data[i] = &per_cpu(max_tr_data, i); -	} - -	set_buffer_entries(&global_trace, -			   ring_buffer_size(global_trace.buffer, 0)); -#ifdef CONFIG_TRACER_MAX_TRACE -	set_buffer_entries(&max_tr, 1); -#endif -  	trace_init_cmdlines(); -	init_irq_work(&trace_work_wakeup, trace_wake_up);  	register_tracer(&nop_trace); +	global_trace.current_trace = &nop_trace; +  	/* All seems OK, enable tracing */  	tracing_disabled = 0; @@ -5303,16 +6228,32 @@ __init static int tracer_alloc_buffers(void)  	register_die_notifier(&trace_die_notifier); +	global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + +	/* Holder for file callbacks */ +	global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; +	global_trace.trace_cpu.tr = &global_trace; + +	INIT_LIST_HEAD(&global_trace.systems); +	INIT_LIST_HEAD(&global_trace.events); +	list_add(&global_trace.list, &ftrace_trace_arrays); +  	while (trace_boot_options) {  		char *option;  		option = strsep(&trace_boot_options, ","); -		trace_set_options(option); +		trace_set_options(&global_trace, option);  	} +	register_snapshot_cmd(); +  	return 0;  out_free_cpumask: +	free_percpu(global_trace.trace_buffer.data); +#ifdef CONFIG_TRACER_MAX_TRACE +	free_percpu(global_trace.max_buffer.data); +#endif  	free_cpumask_var(tracing_cpumask);  out_free_buffer_mask:  	free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 57d7e5397d5..9e014582e76 100644 --- a/kernel/trace/trace.h +++ 
b/kernel/trace/trace.h @@ -13,6 +13,11 @@  #include <linux/trace_seq.h>  #include <linux/ftrace_event.h> +#ifdef CONFIG_FTRACE_SYSCALLS +#include <asm/unistd.h>		/* For NR_SYSCALLS	     */ +#include <asm/syscall.h>	/* some archs define it here */ +#endif +  enum trace_type {  	__TRACE_FIRST_TYPE = 0, @@ -29,6 +34,7 @@ enum trace_type {  	TRACE_GRAPH_ENT,  	TRACE_USER_STACK,  	TRACE_BLK, +	TRACE_BPUTS,  	__TRACE_LAST_TYPE,  }; @@ -127,12 +133,21 @@ enum trace_flag_type {  #define TRACE_BUF_SIZE		1024 +struct trace_array; + +struct trace_cpu { +	struct trace_array	*tr; +	struct dentry		*dir; +	int			cpu; +}; +  /*   * The CPU trace array - it consists of thousands of trace entries   * plus some other descriptor data: (for example which task started   * the trace, etc.)   */  struct trace_array_cpu { +	struct trace_cpu	trace_cpu;  	atomic_t		disabled;  	void			*buffer_page;	/* ring buffer spare */ @@ -151,20 +166,83 @@ struct trace_array_cpu {  	char			comm[TASK_COMM_LEN];  }; +struct tracer; + +struct trace_buffer { +	struct trace_array		*tr; +	struct ring_buffer		*buffer; +	struct trace_array_cpu __percpu	*data; +	cycle_t				time_start; +	int				cpu; +}; +  /*   * The trace array - an array of per-CPU trace arrays. This is the   * highest level data structure that individual tracers deal with.   * They have on/off state as well:   */  struct trace_array { -	struct ring_buffer	*buffer; -	int			cpu; +	struct list_head	list; +	char			*name; +	struct trace_buffer	trace_buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	/* +	 * The max_buffer is used to snapshot the trace when a maximum +	 * latency is reached, or when the user initiates a snapshot. +	 * Some tracers will use this to store a maximum trace while +	 * it continues examining live traces. +	 * +	 * The buffers for the max_buffer are set up the same as the trace_buffer +	 * When a snapshot is taken, the buffer of the max_buffer is swapped +	 * with the buffer of the trace_buffer and the buffers are reset for +	 * the trace_buffer so the tracing can continue. +	 */ +	struct trace_buffer	max_buffer; +	bool			allocated_snapshot; +#endif  	int			buffer_disabled; -	cycle_t			time_start; +	struct trace_cpu	trace_cpu;	/* place holder */ +#ifdef CONFIG_FTRACE_SYSCALLS +	int			sys_refcount_enter; +	int			sys_refcount_exit; +	DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); +	DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +#endif +	int			stop_count; +	int			clock_id; +	struct tracer		*current_trace; +	unsigned int		flags; +	raw_spinlock_t		start_lock; +	struct dentry		*dir; +	struct dentry		*options; +	struct dentry		*percpu_dir; +	struct dentry		*event_dir; +	struct list_head	systems; +	struct list_head	events;  	struct task_struct	*waiter; -	struct trace_array_cpu	*data[NR_CPUS]; +	int			ref; +}; + +enum { +	TRACE_ARRAY_FL_GLOBAL	= (1 << 0)  }; +extern struct list_head ftrace_trace_arrays; + +/* + * The global tracer (top) should be the first trace array added, + * but we check the flag anyway. 
+ */ +static inline struct trace_array *top_trace_array(void) +{ +	struct trace_array *tr; + +	tr = list_entry(ftrace_trace_arrays.prev, +			typeof(*tr), list); +	WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); +	return tr; +} +  #define FTRACE_CMP_TYPE(var, type) \  	__builtin_types_compatible_p(typeof(var), type *) @@ -200,6 +278,7 @@ extern void __ftrace_bad_type(void);  		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\  		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\  		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\ +		IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);	\  		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\  			  TRACE_MMIO_RW);				\  		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\ @@ -283,11 +362,16 @@ struct tracer {  	enum print_line_t	(*print_line)(struct trace_iterator *iter);  	/* If you handled the flag setting, return 0 */  	int			(*set_flag)(u32 old_flags, u32 bit, int set); +	/* Return 0 if OK with change, else return non-zero */ +	int			(*flag_changed)(struct tracer *tracer, +						u32 mask, int set);  	struct tracer		*next;  	struct tracer_flags	*flags;  	bool			print_max; +	bool			enabled; +#ifdef CONFIG_TRACER_MAX_TRACE  	bool			use_max_tr; -	bool			allocated_snapshot; +#endif  }; @@ -423,8 +507,6 @@ static __always_inline void trace_clear_recursion(int bit)  	current->trace_recursion = val;  } -#define TRACE_PIPE_ALL_CPU	-1 -  static inline struct ring_buffer_iter *  trace_buffer_iter(struct trace_iterator *iter, int cpu)  { @@ -435,10 +517,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)  int tracer_init(struct tracer *t, struct trace_array *tr);  int tracing_is_enabled(void); -void tracing_reset(struct trace_array *tr, int cpu); -void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset(struct trace_buffer *buf, int cpu); +void tracing_reset_online_cpus(struct trace_buffer *buf);  void tracing_reset_current(int cpu); -void tracing_reset_current_online_cpus(void); +void tracing_reset_all_online_cpus(void);  int tracing_open_generic(struct inode *inode, struct file *filp);  struct dentry *trace_create_file(const char *name,  				 umode_t mode, @@ -446,6 +528,7 @@ struct dentry *trace_create_file(const char *name,  				 void *data,  				 const struct file_operations *fops); +struct dentry *tracing_init_dentry_tr(struct trace_array *tr);  struct dentry *tracing_init_dentry(void);  struct ring_buffer_event; @@ -579,7 +662,7 @@ extern int DYN_FTRACE_TEST_NAME(void);  #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2  extern int DYN_FTRACE_TEST_NAME2(void); -extern int ring_buffer_expanded; +extern bool ring_buffer_expanded;  extern bool tracing_selftest_disabled;  DECLARE_PER_CPU(int, ftrace_cpu_disabled); @@ -615,6 +698,8 @@ trace_array_vprintk(struct trace_array *tr,  		    unsigned long ip, const char *fmt, va_list args);  int trace_array_printk(struct trace_array *tr,  		       unsigned long ip, const char *fmt, ...); +int trace_array_printk_buf(struct ring_buffer *buffer, +			   unsigned long ip, const char *fmt, ...);  void trace_printk_seq(struct trace_seq *s);  enum print_line_t print_trace_line(struct trace_iterator *iter); @@ -782,6 +867,7 @@ enum trace_iterator_flags {  	TRACE_ITER_STOP_ON_FREE		= 0x400000,  	TRACE_ITER_IRQ_INFO		= 0x800000,  	TRACE_ITER_MARKERS		= 0x1000000, +	TRACE_ITER_FUNCTION		= 0x2000000,  };  /* @@ -828,8 +914,8 @@ enum {  struct ftrace_event_field {  	struct list_head	link; -	char			*name; -	char			*type; +	const char		*name; +	const 
char		*type;  	int			filter_type;  	int			offset;  	int			size; @@ -847,12 +933,19 @@ struct event_filter {  struct event_subsystem {  	struct list_head	list;  	const char		*name; -	struct dentry		*entry;  	struct event_filter	*filter; -	int			nr_events;  	int			ref_count;  }; +struct ftrace_subsystem_dir { +	struct list_head		list; +	struct event_subsystem		*subsystem; +	struct trace_array		*tr; +	struct dentry			*entry; +	int				ref_count; +	int				nr_events; +}; +  #define FILTER_PRED_INVALID	((unsigned short)-1)  #define FILTER_PRED_IS_RIGHT	(1 << 15)  #define FILTER_PRED_FOLD	(1 << 15) @@ -902,22 +995,20 @@ struct filter_pred {  	unsigned short		right;  }; -extern struct list_head ftrace_common_fields; -  extern enum regex_type  filter_parse_regex(char *buff, int len, char **search, int *not);  extern void print_event_filter(struct ftrace_event_call *call,  			       struct trace_seq *s);  extern int apply_event_filter(struct ftrace_event_call *call,  			      char *filter_string); -extern int apply_subsystem_event_filter(struct event_subsystem *system, +extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  					char *filter_string);  extern void print_subsystem_event_filter(struct event_subsystem *system,  					 struct trace_seq *s);  extern int filter_assign_type(const char *type); -struct list_head * -trace_get_fields(struct ftrace_event_call *event_call); +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name);  static inline int  filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -934,6 +1025,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,  }  extern void trace_event_enable_cmd_record(bool enable); +extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); +extern int event_trace_del_tracer(struct trace_array *tr);  extern struct mutex event_mutex;  extern struct list_head ftrace_events; @@ -943,6 +1036,19 @@ extern const char *__stop___trace_bprintk_fmt[];  void trace_printk_init_buffers(void);  void trace_printk_start_comm(void); +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); + +/* + * Normal trace_printk() and friends allocates special buffers + * to do the manipulation, as well as saves the print formats + * into sections to display. But the trace infrastructure wants + * to use these without the added overhead at the price of being + * a bit slower (used mainly for warnings, where we don't care + * about performance). The internal_trace_puts() is for such + * a purpose. 
+ */ +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))  #undef FTRACE_ENTRY  #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter)	\ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 95e96842ed2..d594da0dc03 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  {  	struct ftrace_event_call *call = &event_branch;  	struct trace_array *tr = branch_tracer; +	struct trace_array_cpu *data;  	struct ring_buffer_event *event;  	struct trace_branch *entry;  	struct ring_buffer *buffer; @@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) +	data = per_cpu_ptr(tr->trace_buffer.data, cpu); +	if (atomic_inc_return(&data->disabled) != 1)  		goto out;  	pc = preempt_count(); -	buffer = tr->buffer; +	buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,  					  sizeof(*entry), flags, pc);  	if (!event) @@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  		__buffer_unlock_commit(buffer, event);   out: -	atomic_dec(&tr->data[cpu]->disabled); +	atomic_dec(&data->disabled);  	local_irq_restore(flags);  } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aa8f5f48dae..26dc348332b 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -57,6 +57,16 @@ u64 notrace trace_clock(void)  	return local_clock();  } +/* + * trace_jiffy_clock(): Simply use jiffies as a clock counter. + */ +u64 notrace trace_clock_jiffies(void) +{ +	u64 jiffy = jiffies - INITIAL_JIFFIES; + +	/* Return nsecs */ +	return (u64)jiffies_to_usecs(jiffy) * 1000ULL; +}  /*   * trace_clock_global(): special globally coherent trace clock diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4108e1250ca..e2d027ac66a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry,  		__dynamic_array(	u32,	buf	)  	), -	F_printk("%08lx fmt:%p", -		 __entry->ip, __entry->fmt), +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->fmt),  	FILTER_OTHER  ); @@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry,  		__dynamic_array(	char,	buf	)  	), -	F_printk("%08lx %s", -		 __entry->ip, __entry->buf), +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->buf), + +	FILTER_OTHER +); + +FTRACE_ENTRY(bputs, bputs_entry, + +	TRACE_BPUTS, + +	F_STRUCT( +		__field(	unsigned long,	ip	) +		__field(	const char *,	str	) +	), + +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->str),  	FILTER_OTHER  ); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 57e9b284250..53582e982e5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE];  EXPORT_SYMBOL_GPL(event_storage);  LIST_HEAD(ftrace_events); -LIST_HEAD(ftrace_common_fields); +static LIST_HEAD(ftrace_common_fields); -struct list_head * +#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) + +static struct kmem_cache *field_cachep; +static struct kmem_cache *file_cachep; + +/* Double loops, do not use break, only goto's work */ +#define do_for_each_event_file(tr, file)			\ +	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\ +		list_for_each_entry(file, &tr->events, list) 
+ +#define do_for_each_event_file_safe(tr, file)			\ +	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\ +		struct ftrace_event_file *___n;				\ +		list_for_each_entry_safe(file, ___n, &tr->events, list) + +#define while_for_each_event_file()		\ +	} + +static struct list_head *  trace_get_fields(struct ftrace_event_call *event_call)  {  	if (!event_call->class->get_fields) @@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call)  	return event_call->class->get_fields(event_call);  } +static struct ftrace_event_field * +__find_event_field(struct list_head *head, char *name) +{ +	struct ftrace_event_field *field; + +	list_for_each_entry(field, head, link) { +		if (!strcmp(field->name, name)) +			return field; +	} + +	return NULL; +} + +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name) +{ +	struct ftrace_event_field *field; +	struct list_head *head; + +	field = __find_event_field(&ftrace_common_fields, name); +	if (field) +		return field; + +	head = trace_get_fields(call); +	return __find_event_field(head, name); +} +  static int __trace_define_field(struct list_head *head, const char *type,  				const char *name, int offset, int size,  				int is_signed, int filter_type)  {  	struct ftrace_event_field *field; -	field = kzalloc(sizeof(*field), GFP_KERNEL); +	field = kmem_cache_alloc(field_cachep, GFP_TRACE);  	if (!field)  		goto err; -	field->name = kstrdup(name, GFP_KERNEL); -	if (!field->name) -		goto err; - -	field->type = kstrdup(type, GFP_KERNEL); -	if (!field->type) -		goto err; +	field->name = name; +	field->type = type;  	if (filter_type == FILTER_OTHER)  		field->filter_type = filter_assign_type(type); @@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type,  	return 0;  err: -	if (field) -		kfree(field->name); -	kfree(field); +	kmem_cache_free(field_cachep, field);  	return -ENOMEM;  } @@ -120,7 +158,7 @@ static int trace_define_common_fields(void)  	return ret;  } -void trace_destroy_fields(struct ftrace_event_call *call) +static void trace_destroy_fields(struct ftrace_event_call *call)  {  	struct ftrace_event_field *field, *next;  	struct list_head *head; @@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)  	head = trace_get_fields(call);  	list_for_each_entry_safe(field, next, head, link) {  		list_del(&field->link); -		kfree(field->type); -		kfree(field->name); -		kfree(field); +		kmem_cache_free(field_cachep, field);  	}  } @@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);  int ftrace_event_reg(struct ftrace_event_call *call,  		     enum trace_reg type, void *data)  { +	struct ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER:  		return tracepoint_probe_register(call->name,  						 call->class->probe, -						 call); +						 file);  	case TRACE_REG_UNREGISTER:  		tracepoint_probe_unregister(call->name,  					    call->class->probe, -					    call); +					    file);  		return 0;  #ifdef CONFIG_PERF_EVENTS @@ -183,54 +221,100 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);  void trace_event_enable_cmd_record(bool enable)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { -		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) +	do_for_each_event_file(tr, file) { + +		if (!(file->flags & FTRACE_EVENT_FL_ENABLED))  			continue;  		if (enable) {  			tracing_start_cmdline_record(); -			call->flags |= 
TRACE_EVENT_FL_RECORDED_CMD; +			set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  		} else {  			tracing_stop_cmdline_record(); -			call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; +			clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  		} -	} +	} while_for_each_event_file();  	mutex_unlock(&event_mutex);  } -static int ftrace_event_enable_disable(struct ftrace_event_call *call, -					int enable) +static int __ftrace_event_enable_disable(struct ftrace_event_file *file, +					 int enable, int soft_disable)  { +	struct ftrace_event_call *call = file->event_call;  	int ret = 0; +	int disable;  	switch (enable) {  	case 0: -		if (call->flags & TRACE_EVENT_FL_ENABLED) { -			call->flags &= ~TRACE_EVENT_FL_ENABLED; -			if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { +		/* +		 * When soft_disable is set and enable is cleared, we want +		 * to clear the SOFT_DISABLED flag but leave the event in the +		 * state that it was. That is, if the event was enabled and +		 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED +		 * is set we do not want the event to be enabled before we +		 * clear the bit. +		 * +		 * When soft_disable is not set but the SOFT_MODE flag is, +		 * we do nothing. Do not disable the tracepoint, otherwise +		 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. +		 */ +		if (soft_disable) { +			disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; +			clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); +		} else +			disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + +		if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { +			clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); +			if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {  				tracing_stop_cmdline_record(); -				call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; +				clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  			} -			call->class->reg(call, TRACE_REG_UNREGISTER, NULL); +			call->class->reg(call, TRACE_REG_UNREGISTER, file);  		} +		/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ +		if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) +			set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);  		break;  	case 1: -		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { +		/* +		 * When soft_disable is set and enable is set, we want to +		 * register the tracepoint for the event, but leave the event +		 * as is. That means, if the event was already enabled, we do +		 * nothing (but set SOFT_MODE). If the event is disabled, we +		 * set SOFT_DISABLED before enabling the event tracepoint, so +		 * it still seems to be disabled. +		 */ +		if (!soft_disable) +			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); +		else +			set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + +		if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + +			/* Keep the event disabled, when going to SOFT_MODE. 
*/ +			if (soft_disable) +				set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); +  			if (trace_flags & TRACE_ITER_RECORD_CMD) {  				tracing_start_cmdline_record(); -				call->flags |= TRACE_EVENT_FL_RECORDED_CMD; +				set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  			} -			ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); +			ret = call->class->reg(call, TRACE_REG_REGISTER, file);  			if (ret) {  				tracing_stop_cmdline_record();  				pr_info("event trace: Could not enable event "  					"%s\n", call->name);  				break;  			} -			call->flags |= TRACE_EVENT_FL_ENABLED; +			set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + +			/* WAS_ENABLED gets set but never cleared. */ +			call->flags |= TRACE_EVENT_FL_WAS_ENABLED;  		}  		break;  	} @@ -238,13 +322,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,  	return ret;  } -static void ftrace_clear_events(void) +static int ftrace_event_enable_disable(struct ftrace_event_file *file, +				       int enable)  { -	struct ftrace_event_call *call; +	return __ftrace_event_enable_disable(file, enable, 0); +} + +static void ftrace_clear_events(struct trace_array *tr) +{ +	struct ftrace_event_file *file;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { -		ftrace_event_enable_disable(call, 0); +	list_for_each_entry(file, &tr->events, list) { +		ftrace_event_enable_disable(file, 0);  	}  	mutex_unlock(&event_mutex);  } @@ -257,11 +347,12 @@ static void __put_system(struct event_subsystem *system)  	if (--system->ref_count)  		return; +	list_del(&system->list); +  	if (filter) {  		kfree(filter->filter_string);  		kfree(filter);  	} -	kfree(system->name);  	kfree(system);  } @@ -271,24 +362,45 @@ static void __get_system(struct event_subsystem *system)  	system->ref_count++;  } -static void put_system(struct event_subsystem *system) +static void __get_system_dir(struct ftrace_subsystem_dir *dir) +{ +	WARN_ON_ONCE(dir->ref_count == 0); +	dir->ref_count++; +	__get_system(dir->subsystem); +} + +static void __put_system_dir(struct ftrace_subsystem_dir *dir) +{ +	WARN_ON_ONCE(dir->ref_count == 0); +	/* If the subsystem is about to be freed, the dir must be too */ +	WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); + +	__put_system(dir->subsystem); +	if (!--dir->ref_count) +		kfree(dir); +} + +static void put_system(struct ftrace_subsystem_dir *dir)  {  	mutex_lock(&event_mutex); -	__put_system(system); +	__put_system_dir(dir);  	mutex_unlock(&event_mutex);  }  /*   * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.   
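The three per-file flag bits introduced here work together: FTRACE_EVENT_FL_ENABLED means the tracepoint is registered and recording, FTRACE_EVENT_FL_SOFT_MODE means a trigger (such as the enable_event/disable_event probes added further down in trace_events.c) is managing the event, and FTRACE_EVENT_FL_SOFT_DISABLED means the tracepoint stays registered but its output is suppressed, which is what lets a trigger flip tracing on and off without registering and unregistering the tracepoint each time. The per-event "enable" file reports the soft-disabled state as "0*". A stand-alone sketch of that reporting logic, using simplified FL_* masks as illustrative stand-ins for the real FTRACE_EVENT_FL_* bits:

	#include <stdio.h>

	/* Illustrative stand-ins for the FTRACE_EVENT_FL_* bits. */
	#define FL_ENABLED		(1UL << 0)	/* tracepoint registered and recording */
	#define FL_SOFT_MODE		(1UL << 1)	/* a trigger is managing this event    */
	#define FL_SOFT_DISABLED	(1UL << 2)	/* registered, but output suppressed   */

	/* Mirrors what event_enable_read() reports for a given flag word. */
	static const char *enable_file_contents(unsigned long flags)
	{
		if (!(flags & FL_ENABLED))
			return "0\n";
		return (flags & FL_SOFT_DISABLED) ? "0*\n" : "1\n";
	}

	int main(void)
	{
		unsigned long armed = FL_ENABLED | FL_SOFT_MODE | FL_SOFT_DISABLED;
		unsigned long fired = FL_ENABLED | FL_SOFT_MODE;

		printf("%s", enable_file_contents(armed));	/* "0*": trigger armed, output off */
		printf("%s", enable_file_contents(fired));	/* "1":  trigger switched it on    */
		printf("%s", enable_file_contents(0));		/* "0":  fully disabled            */
		return 0;
	}

In the real code these are bit numbers manipulated with set_bit()/clear_bit() on file->flags rather than plain masks, but the visible states are the same.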
*/ -static int __ftrace_set_clr_event(const char *match, const char *sub, -				  const char *event, int set) +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, +				  const char *sub, const char *event, int set)  { +	struct ftrace_event_file *file;  	struct ftrace_event_call *call;  	int ret = -EINVAL;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call;  		if (!call->name || !call->class || !call->class->reg)  			continue; @@ -307,7 +419,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,  		if (event && strcmp(event, call->name) != 0)  			continue; -		ftrace_event_enable_disable(call, set); +		ftrace_event_enable_disable(file, set);  		ret = 0;  	} @@ -316,7 +428,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,  	return ret;  } -static int ftrace_set_clr_event(char *buf, int set) +static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)  {  	char *event = NULL, *sub = NULL, *match; @@ -344,7 +456,7 @@ static int ftrace_set_clr_event(char *buf, int set)  			event = NULL;  	} -	return __ftrace_set_clr_event(match, sub, event, set); +	return __ftrace_set_clr_event(tr, match, sub, event, set);  }  /** @@ -361,7 +473,9 @@ static int ftrace_set_clr_event(char *buf, int set)   */  int trace_set_clr_event(const char *system, const char *event, int set)  { -	return __ftrace_set_clr_event(NULL, system, event, set); +	struct trace_array *tr = top_trace_array(); + +	return __ftrace_set_clr_event(tr, NULL, system, event, set);  }  EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -373,6 +487,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  		   size_t cnt, loff_t *ppos)  {  	struct trace_parser parser; +	struct seq_file *m = file->private_data; +	struct trace_array *tr = m->private;  	ssize_t read, ret;  	if (!cnt) @@ -395,7 +511,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  		parser.buffer[parser.idx] = 0; -		ret = ftrace_set_clr_event(parser.buffer + !set, set); +		ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);  		if (ret)  			goto out_put;  	} @@ -411,17 +527,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  static void *  t_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct ftrace_event_call *call; +	struct trace_array *tr = m->private;  	(*pos)++; -	list_for_each_entry_continue(call, &ftrace_events, list) { +	list_for_each_entry_continue(file, &tr->events, list) { +		call = file->event_call;  		/*  		 * The ftrace subsystem is for showing formats only.  		 * They can not be enabled or disabled via the event files.  		 
*/  		if (call->class && call->class->reg) -			return call; +			return file;  	}  	return NULL; @@ -429,30 +548,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)  static void *t_start(struct seq_file *m, loff_t *pos)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = m->private;  	loff_t l;  	mutex_lock(&event_mutex); -	call = list_entry(&ftrace_events, struct ftrace_event_call, list); +	file = list_entry(&tr->events, struct ftrace_event_file, list);  	for (l = 0; l <= *pos; ) { -		call = t_next(m, call, &l); -		if (!call) +		file = t_next(m, file, &l); +		if (!file)  			break;  	} -	return call; +	return file;  }  static void *  s_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct trace_array *tr = m->private;  	(*pos)++; -	list_for_each_entry_continue(call, &ftrace_events, list) { -		if (call->flags & TRACE_EVENT_FL_ENABLED) -			return call; +	list_for_each_entry_continue(file, &tr->events, list) { +		if (file->flags & FTRACE_EVENT_FL_ENABLED) +			return file;  	}  	return NULL; @@ -460,23 +581,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos)  static void *s_start(struct seq_file *m, loff_t *pos)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = m->private;  	loff_t l;  	mutex_lock(&event_mutex); -	call = list_entry(&ftrace_events, struct ftrace_event_call, list); +	file = list_entry(&tr->events, struct ftrace_event_file, list);  	for (l = 0; l <= *pos; ) { -		call = s_next(m, call, &l); -		if (!call) +		file = s_next(m, file, &l); +		if (!file)  			break;  	} -	return call; +	return file;  }  static int t_show(struct seq_file *m, void *v)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct ftrace_event_call *call = file->event_call;  	if (strcmp(call->class->system, TRACE_SYSTEM) != 0)  		seq_printf(m, "%s:", call->class->system); @@ -494,25 +617,31 @@ static ssize_t  event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		  loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; +	struct ftrace_event_file *file = filp->private_data;  	char *buf; -	if (call->flags & TRACE_EVENT_FL_ENABLED) -		buf = "1\n"; -	else +	if (file->flags & FTRACE_EVENT_FL_ENABLED) { +		if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) +			buf = "0*\n"; +		else +			buf = "1\n"; +	} else  		buf = "0\n"; -	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));  }  static ssize_t  event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  		   loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; +	struct ftrace_event_file *file = filp->private_data;  	unsigned long val;  	int ret; +	if (!file) +		return -EINVAL; +  	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);  	if (ret)  		return ret; @@ -525,7 +654,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  	case 0:  	case 1:  		mutex_lock(&event_mutex); -		ret = ftrace_event_enable_disable(call, val); +		ret = ftrace_event_enable_disable(file, val);  		mutex_unlock(&event_mutex);  		break; @@ -543,14 +672,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		   loff_t *ppos)  {  	const char set_to_char[4] = { '?', '0', '1', 'X' }; -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct 
event_subsystem *system = dir->subsystem;  	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = dir->tr;  	char buf[2];  	int set = 0;  	int ret;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { +		call = file->event_call;  		if (!call->name || !call->class || !call->class->reg)  			continue; @@ -562,7 +695,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		 * or if all events or cleared, or if we have  		 * a mixture.  		 */ -		set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); +		set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));  		/*  		 * If we have a mixture, no need to look further. @@ -584,7 +717,8 @@ static ssize_t  system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  		    loff_t *ppos)  { -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct event_subsystem *system = dir->subsystem;  	const char *name = NULL;  	unsigned long val;  	ssize_t ret; @@ -607,7 +741,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  	if (system)  		name = system->name; -	ret = __ftrace_set_clr_event(NULL, name, NULL, val); +	ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);  	if (ret)  		goto out; @@ -845,43 +979,75 @@ static LIST_HEAD(event_subsystems);  static int subsystem_open(struct inode *inode, struct file *filp)  {  	struct event_subsystem *system = NULL; +	struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ +	struct trace_array *tr;  	int ret; -	if (!inode->i_private) -		goto skip_search; -  	/* Make sure the system still exists */  	mutex_lock(&event_mutex); -	list_for_each_entry(system, &event_subsystems, list) { -		if (system == inode->i_private) { -			/* Don't open systems with no events */ -			if (!system->nr_events) { -				system = NULL; -				break; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		list_for_each_entry(dir, &tr->systems, list) { +			if (dir == inode->i_private) { +				/* Don't open systems with no events */ +				if (dir->nr_events) { +					__get_system_dir(dir); +					system = dir->subsystem; +				} +				goto exit_loop;  			} -			__get_system(system); -			break;  		}  	} + exit_loop:  	mutex_unlock(&event_mutex); -	if (system != inode->i_private) +	if (!system)  		return -ENODEV; - skip_search: +	/* Some versions of gcc think dir can be uninitialized here */ +	WARN_ON(!dir); + +	ret = tracing_open_generic(inode, filp); +	if (ret < 0) +		put_system(dir); + +	return ret; +} + +static int system_tr_open(struct inode *inode, struct file *filp) +{ +	struct ftrace_subsystem_dir *dir; +	struct trace_array *tr = inode->i_private; +	int ret; + +	/* Make a temporary dir that has no system but points to tr */ +	dir = kzalloc(sizeof(*dir), GFP_KERNEL); +	if (!dir) +		return -ENOMEM; + +	dir->tr = tr; +  	ret = tracing_open_generic(inode, filp); -	if (ret < 0 && system) -		put_system(system); +	if (ret < 0) +		kfree(dir); + +	filp->private_data = dir;  	return ret;  }  static int subsystem_release(struct inode *inode, struct file *file)  { -	struct event_subsystem *system = inode->i_private; +	struct ftrace_subsystem_dir *dir = file->private_data; -	if (system) -		put_system(system); +	/* +	 * If dir->subsystem is NULL, then this is a temporary +	 * descriptor that was made for a trace_array to enable +	 * all subsystems. 
+	 */ +	if (dir->subsystem) +		put_system(dir); +	else +		kfree(dir);  	return 0;  } @@ -890,7 +1056,8 @@ static ssize_t  subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,  		      loff_t *ppos)  { -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct event_subsystem *system = dir->subsystem;  	struct trace_seq *s;  	int r; @@ -915,7 +1082,7 @@ static ssize_t  subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  		       loff_t *ppos)  { -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data;  	char *buf;  	int err; @@ -932,7 +1099,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  	}  	buf[cnt] = '\0'; -	err = apply_subsystem_event_filter(system, buf); +	err = apply_subsystem_event_filter(dir, buf);  	free_page((unsigned long) buf);  	if (err < 0)  		return err; @@ -1041,30 +1208,35 @@ static const struct file_operations ftrace_system_enable_fops = {  	.release = subsystem_release,  }; +static const struct file_operations ftrace_tr_enable_fops = { +	.open = system_tr_open, +	.read = system_enable_read, +	.write = system_enable_write, +	.llseek = default_llseek, +	.release = subsystem_release, +}; +  static const struct file_operations ftrace_show_header_fops = {  	.open = tracing_open_generic,  	.read = show_header,  	.llseek = default_llseek,  }; -static struct dentry *event_trace_events_dir(void) +static int +ftrace_event_open(struct inode *inode, struct file *file, +		  const struct seq_operations *seq_ops)  { -	static struct dentry *d_tracer; -	static struct dentry *d_events; - -	if (d_events) -		return d_events; - -	d_tracer = tracing_init_dentry(); -	if (!d_tracer) -		return NULL; +	struct seq_file *m; +	int ret; -	d_events = debugfs_create_dir("events", d_tracer); -	if (!d_events) -		pr_warning("Could not create debugfs " -			   "'events' directory\n"); +	ret = seq_open(file, seq_ops); +	if (ret < 0) +		return ret; +	m = file->private_data; +	/* copy tr over to seq ops */ +	m->private = inode->i_private; -	return d_events; +	return ret;  }  static int @@ -1072,117 +1244,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file)  {  	const struct seq_operations *seq_ops = &show_event_seq_ops; -	return seq_open(file, seq_ops); +	return ftrace_event_open(inode, file, seq_ops);  }  static int  ftrace_event_set_open(struct inode *inode, struct file *file)  {  	const struct seq_operations *seq_ops = &show_set_event_seq_ops; +	struct trace_array *tr = inode->i_private;  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC)) -		ftrace_clear_events(); +		ftrace_clear_events(tr); -	return seq_open(file, seq_ops); +	return ftrace_event_open(inode, file, seq_ops); +} + +static struct event_subsystem * +create_new_subsystem(const char *name) +{ +	struct event_subsystem *system; + +	/* need to create new entry */ +	system = kmalloc(sizeof(*system), GFP_KERNEL); +	if (!system) +		return NULL; + +	system->ref_count = 1; +	system->name = name; + +	system->filter = NULL; + +	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); +	if (!system->filter) +		goto out_free; + +	list_add(&system->list, &event_subsystems); + +	return system; + + out_free: +	kfree(system); +	return NULL;  }  static struct dentry * -event_subsystem_dir(const char *name, struct dentry *d_events) +event_subsystem_dir(struct trace_array *tr, const char *name, +		    struct 
ftrace_event_file *file, struct dentry *parent)  { +	struct ftrace_subsystem_dir *dir;  	struct event_subsystem *system;  	struct dentry *entry;  	/* First see if we did not already create this dir */ -	list_for_each_entry(system, &event_subsystems, list) { +	list_for_each_entry(dir, &tr->systems, list) { +		system = dir->subsystem;  		if (strcmp(system->name, name) == 0) { -			system->nr_events++; -			return system->entry; +			dir->nr_events++; +			file->system = dir; +			return dir->entry;  		}  	} -	/* need to create new entry */ -	system = kmalloc(sizeof(*system), GFP_KERNEL); -	if (!system) { -		pr_warning("No memory to create event subsystem %s\n", -			   name); -		return d_events; -	} - -	system->entry = debugfs_create_dir(name, d_events); -	if (!system->entry) { -		pr_warning("Could not create event subsystem %s\n", -			   name); -		kfree(system); -		return d_events; -	} - -	system->nr_events = 1; -	system->ref_count = 1; -	system->name = kstrdup(name, GFP_KERNEL); -	if (!system->name) { -		debugfs_remove(system->entry); -		kfree(system); -		return d_events; +	/* Now see if the system itself exists. */ +	list_for_each_entry(system, &event_subsystems, list) { +		if (strcmp(system->name, name) == 0) +			break;  	} +	/* Reset system variable when not found */ +	if (&system->list == &event_subsystems) +		system = NULL; -	list_add(&system->list, &event_subsystems); +	dir = kmalloc(sizeof(*dir), GFP_KERNEL); +	if (!dir) +		goto out_fail; -	system->filter = NULL; +	if (!system) { +		system = create_new_subsystem(name); +		if (!system) +			goto out_free; +	} else +		__get_system(system); -	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); -	if (!system->filter) { -		pr_warning("Could not allocate filter for subsystem " -			   "'%s'\n", name); -		return system->entry; +	dir->entry = debugfs_create_dir(name, parent); +	if (!dir->entry) { +		pr_warning("Failed to create system directory %s\n", name); +		__put_system(system); +		goto out_free;  	} -	entry = debugfs_create_file("filter", 0644, system->entry, system, +	dir->tr = tr; +	dir->ref_count = 1; +	dir->nr_events = 1; +	dir->subsystem = system; +	file->system = dir; + +	entry = debugfs_create_file("filter", 0644, dir->entry, dir,  				    &ftrace_subsystem_filter_fops);  	if (!entry) {  		kfree(system->filter);  		system->filter = NULL; -		pr_warning("Could not create debugfs " -			   "'%s/filter' entry\n", name); +		pr_warning("Could not create debugfs '%s/filter' entry\n", name);  	} -	trace_create_file("enable", 0644, system->entry, system, +	trace_create_file("enable", 0644, dir->entry, dir,  			  &ftrace_system_enable_fops); -	return system->entry; +	list_add(&dir->list, &tr->systems); + +	return dir->entry; + + out_free: +	kfree(dir); + out_fail: +	/* Only print this message if failed on memory allocation */ +	if (!dir || !system) +		pr_warning("No memory to create event subsystem %s\n", +			   name); +	return NULL;  }  static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, +event_create_dir(struct dentry *parent, +		 struct ftrace_event_file *file,  		 const struct file_operations *id,  		 const struct file_operations *enable,  		 const struct file_operations *filter,  		 const struct file_operations *format)  { +	struct ftrace_event_call *call = file->event_call; +	struct trace_array *tr = file->tr;  	struct list_head *head; +	struct dentry *d_events;  	int ret;  	/*  	 * If the trace point header did not define TRACE_SYSTEM  	 * then the system would be called "TRACE_SYSTEM".  	 
*/ -	if (strcmp(call->class->system, TRACE_SYSTEM) != 0) -		d_events = event_subsystem_dir(call->class->system, d_events); +	if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { +		d_events = event_subsystem_dir(tr, call->class->system, file, parent); +		if (!d_events) +			return -ENOMEM; +	} else +		d_events = parent; -	call->dir = debugfs_create_dir(call->name, d_events); -	if (!call->dir) { -		pr_warning("Could not create debugfs " -			   "'%s' directory\n", call->name); +	file->dir = debugfs_create_dir(call->name, d_events); +	if (!file->dir) { +		pr_warning("Could not create debugfs '%s' directory\n", +			   call->name);  		return -1;  	}  	if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) -		trace_create_file("enable", 0644, call->dir, call, +		trace_create_file("enable", 0644, file->dir, file,  				  enable);  #ifdef CONFIG_PERF_EVENTS  	if (call->event.type && call->class->reg) -		trace_create_file("id", 0444, call->dir, call, +		trace_create_file("id", 0444, file->dir, call,  		 		  id);  #endif @@ -1196,23 +1416,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,  		if (ret < 0) {  			pr_warning("Could not initialize trace point"  				   " events/%s\n", call->name); -			return ret; +			return -1;  		}  	} -	trace_create_file("filter", 0644, call->dir, call, +	trace_create_file("filter", 0644, file->dir, call,  			  filter); -	trace_create_file("format", 0444, call->dir, call, +	trace_create_file("format", 0444, file->dir, call,  			  format);  	return 0;  } +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ +	if (!dir) +		return; + +	if (!--dir->nr_events) { +		debugfs_remove_recursive(dir->entry); +		list_del(&dir->list); +		__put_system_dir(dir); +	} +} + +static void remove_event_from_tracers(struct ftrace_event_call *call) +{ +	struct ftrace_event_file *file; +	struct trace_array *tr; + +	do_for_each_event_file_safe(tr, file) { + +		if (file->event_call != call) +			continue; + +		list_del(&file->list); +		debugfs_remove_recursive(file->dir); +		remove_subsystem(file->system); +		kmem_cache_free(file_cachep, file); + +		/* +		 * The do_for_each_event_file_safe() is +		 * a double loop. After finding the call for this +		 * trace_array, we use break to jump to the next +		 * trace_array. +		 */ +		break; +	} while_for_each_event_file(); +} +  static void event_remove(struct ftrace_event_call *call)  { -	ftrace_event_enable_disable(call, 0); +	struct trace_array *tr; +	struct ftrace_event_file *file; + +	do_for_each_event_file(tr, file) { +		if (file->event_call != call) +			continue; +		ftrace_event_enable_disable(file, 0); +		/* +		 * The do_for_each_event_file() is +		 * a double loop. After finding the call for this +		 * trace_array, we use break to jump to the next +		 * trace_array. 
+		 */ +		break; +	} while_for_each_event_file(); +  	if (call->event.funcs)  		__unregister_ftrace_event(&call->event); +	remove_event_from_tracers(call);  	list_del(&call->list);  } @@ -1234,82 +1507,99 @@ static int event_init(struct ftrace_event_call *call)  }  static int -__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, -		       const struct file_operations *id, -		       const struct file_operations *enable, -		       const struct file_operations *filter, -		       const struct file_operations *format) +__register_event(struct ftrace_event_call *call, struct module *mod)  { -	struct dentry *d_events;  	int ret;  	ret = event_init(call);  	if (ret < 0)  		return ret; -	d_events = event_trace_events_dir(); -	if (!d_events) -		return -ENOENT; - -	ret = event_create_dir(call, d_events, id, enable, filter, format); -	if (!ret) -		list_add(&call->list, &ftrace_events); +	list_add(&call->list, &ftrace_events);  	call->mod = mod; -	return ret; +	return 0; +} + +/* Add an event to a trace directory */ +static int +__trace_add_new_event(struct ftrace_event_call *call, +		      struct trace_array *tr, +		      const struct file_operations *id, +		      const struct file_operations *enable, +		      const struct file_operations *filter, +		      const struct file_operations *format) +{ +	struct ftrace_event_file *file; + +	file = kmem_cache_alloc(file_cachep, GFP_TRACE); +	if (!file) +		return -ENOMEM; + +	file->event_call = call; +	file->tr = tr; +	list_add(&file->list, &tr->events); + +	return event_create_dir(tr->event_dir, file, id, enable, filter, format); +} + +/* + * Just create a decriptor for early init. A descriptor is required + * for enabling events at boot. We want to enable events before + * the filesystem is initialized. + */ +static __init int +__trace_early_add_new_event(struct ftrace_event_call *call, +			    struct trace_array *tr) +{ +	struct ftrace_event_file *file; + +	file = kmem_cache_alloc(file_cachep, GFP_TRACE); +	if (!file) +		return -ENOMEM; + +	file->event_call = call; +	file->tr = tr; +	list_add(&file->list, &tr->events); + +	return 0;  } +struct ftrace_module_file_ops; +static void __add_event_to_tracers(struct ftrace_event_call *call, +				   struct ftrace_module_file_ops *file_ops); +  /* Add an additional event_call dynamically */  int trace_add_event_call(struct ftrace_event_call *call)  {  	int ret;  	mutex_lock(&event_mutex); -	ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, -				     &ftrace_enable_fops, -				     &ftrace_event_filter_fops, -				     &ftrace_event_format_fops); -	mutex_unlock(&event_mutex); -	return ret; -} -static void remove_subsystem_dir(const char *name) -{ -	struct event_subsystem *system; - -	if (strcmp(name, TRACE_SYSTEM) == 0) -		return; +	ret = __register_event(call, NULL); +	if (ret >= 0) +		__add_event_to_tracers(call, NULL); -	list_for_each_entry(system, &event_subsystems, list) { -		if (strcmp(system->name, name) == 0) { -			if (!--system->nr_events) { -				debugfs_remove_recursive(system->entry); -				list_del(&system->list); -				__put_system(system); -			} -			break; -		} -	} +	mutex_unlock(&event_mutex); +	return ret;  }  /* - * Must be called under locking both of event_mutex and trace_event_mutex. + * Must be called under locking both of event_mutex and trace_event_sem.   
*/  static void __trace_remove_event_call(struct ftrace_event_call *call)  {  	event_remove(call);  	trace_destroy_fields(call);  	destroy_preds(call); -	debugfs_remove_recursive(call->dir); -	remove_subsystem_dir(call->class->system);  }  /* Remove an event_call */  void trace_remove_event_call(struct ftrace_event_call *call)  {  	mutex_lock(&event_mutex); -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	__trace_remove_event_call(call); -	up_write(&trace_event_mutex); +	up_write(&trace_event_sem);  	mutex_unlock(&event_mutex);  } @@ -1336,6 +1626,26 @@ struct ftrace_module_file_ops {  };  static struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) +{ +	/* +	 * As event_calls are added in groups by module, +	 * when we find one file_ops, we don't need to search for +	 * each call in that module, as the rest should be the +	 * same. Only search for a new one if the last one did +	 * not match. +	 */ +	if (file_ops && mod == file_ops->mod) +		return file_ops; + +	list_for_each_entry(file_ops, &ftrace_module_file_list, list) { +		if (file_ops->mod == mod) +			return file_ops; +	} +	return NULL; +} + +static struct ftrace_module_file_ops *  trace_create_file_ops(struct module *mod)  {  	struct ftrace_module_file_ops *file_ops; @@ -1386,9 +1696,8 @@ static void trace_module_add_events(struct module *mod)  		return;  	for_each_event(call, start, end) { -		__trace_add_event_call(*call, mod, -				       &file_ops->id, &file_ops->enable, -				       &file_ops->filter, &file_ops->format); +		__register_event(*call, mod); +		__add_event_to_tracers(*call, file_ops);  	}  } @@ -1396,12 +1705,13 @@ static void trace_module_remove_events(struct module *mod)  {  	struct ftrace_module_file_ops *file_ops;  	struct ftrace_event_call *call, *p; -	bool found = false; +	bool clear_trace = false; -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	list_for_each_entry_safe(call, p, &ftrace_events, list) {  		if (call->mod == mod) { -			found = true; +			if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) +				clear_trace = true;  			__trace_remove_event_call(call);  		}  	} @@ -1415,14 +1725,18 @@ static void trace_module_remove_events(struct module *mod)  		list_del(&file_ops->list);  		kfree(file_ops);  	} +	up_write(&trace_event_sem);  	/*  	 * It is safest to reset the ring buffer if the module being unloaded -	 * registered any events. +	 * registered any events that were used. The only worry is if +	 * a new module gets loaded, and takes on the same id as the events +	 * of this module. When printing out the buffer, traced events left +	 * over from this module may be passed to the new module events and +	 * unexpected results may occur.  	 
*/ -	if (found) -		tracing_reset_current_online_cpus(); -	up_write(&trace_event_mutex); +	if (clear_trace) +		tracing_reset_all_online_cpus();  }  static int trace_module_notify(struct notifier_block *self, @@ -1443,14 +1757,433 @@ static int trace_module_notify(struct notifier_block *self,  	return 0;  } + +static int +__trace_add_new_mod_event(struct ftrace_event_call *call, +			  struct trace_array *tr, +			  struct ftrace_module_file_ops *file_ops) +{ +	return __trace_add_new_event(call, tr, +				     &file_ops->id, &file_ops->enable, +				     &file_ops->filter, &file_ops->format); +} +  #else -static int trace_module_notify(struct notifier_block *self, -			       unsigned long val, void *data) +static inline struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) +{ +	return NULL; +} +static inline int trace_module_notify(struct notifier_block *self, +				      unsigned long val, void *data)  {  	return 0;  } +static inline int +__trace_add_new_mod_event(struct ftrace_event_call *call, +			  struct trace_array *tr, +			  struct ftrace_module_file_ops *file_ops) +{ +	return -ENODEV; +}  #endif /* CONFIG_MODULES */ +/* Create a new event directory structure for a trace directory. */ +static void +__trace_add_event_dirs(struct trace_array *tr) +{ +	struct ftrace_module_file_ops *file_ops = NULL; +	struct ftrace_event_call *call; +	int ret; + +	list_for_each_entry(call, &ftrace_events, list) { +		if (call->mod) { +			/* +			 * Directories for events by modules need to +			 * keep module ref counts when opened (as we don't +			 * want the module to disappear when reading one +			 * of these files). The file_ops keep account of +			 * the module ref count. +			 */ +			file_ops = find_ftrace_file_ops(file_ops, call->mod); +			if (!file_ops) +				continue; /* Warn? 
*/ +			ret = __trace_add_new_mod_event(call, tr, file_ops); +			if (ret < 0) +				pr_warning("Could not create directory for event %s\n", +					   call->name); +			continue; +		} +		ret = __trace_add_new_event(call, tr, +					    &ftrace_event_id_fops, +					    &ftrace_enable_fops, +					    &ftrace_event_filter_fops, +					    &ftrace_event_format_fops); +		if (ret < 0) +			pr_warning("Could not create directory for event %s\n", +				   call->name); +	} +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR	"enable_event" +#define DISABLE_EVENT_STR	"disable_event" + +struct event_probe_data { +	struct ftrace_event_file	*file; +	unsigned long			count; +	int				ref; +	bool				enable; +}; + +static struct ftrace_event_file * +find_event_file(struct trace_array *tr, const char *system,  const char *event) +{ +	struct ftrace_event_file *file; +	struct ftrace_event_call *call; + +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call; + +		if (!call->name || !call->class || !call->class->reg) +			continue; + +		if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) +			continue; + +		if (strcmp(event, call->name) == 0 && +		    strcmp(system, call->class->system) == 0) +			return file; +	} +	return NULL; +} + +static void +event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (!data) +		return; + +	if (data->enable) +		clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +	else +		set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (!data) +		return; + +	if (!data->count) +		return; + +	/* Skip if the event is in a state we want to switch to */ +	if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) +		return; + +	if (data->count != -1) +		(data->count)--; + +	event_enable_probe(ip, parent_ip, _data); +} + +static int +event_enable_print(struct seq_file *m, unsigned long ip, +		      struct ftrace_probe_ops *ops, void *_data) +{ +	struct event_probe_data *data = _data; + +	seq_printf(m, "%ps:", (void *)ip); + +	seq_printf(m, "%s:%s:%s", +		   data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, +		   data->file->event_call->class->system, +		   data->file->event_call->name); + +	if (data->count == -1) +		seq_printf(m, ":unlimited\n"); +	else +		seq_printf(m, ":count=%ld\n", data->count); + +	return 0; +} + +static int +event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, +		  void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	data->ref++; +	return 0; +} + +static void +event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, +		  void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (WARN_ON_ONCE(data->ref <= 0)) +		return; + +	data->ref--; +	if (!data->ref) { +		/* Remove the SOFT_MODE flag */ +		__ftrace_event_enable_disable(data->file, 0, 1); +		module_put(data->file->event_call->mod); +		kfree(data); +	} +	*pdata = NULL; +} + +static struct ftrace_probe_ops event_enable_probe_ops = { +	.func			= event_enable_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_enable_count_probe_ops = { +	.func			= event_enable_count_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_probe_ops = { +	.func			= event_enable_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_count_probe_ops = { +	.func			= event_enable_count_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static int +event_enable_func(struct ftrace_hash *hash, +		  char *glob, char *cmd, char *param, int enabled) +{ +	struct trace_array *tr = top_trace_array(); +	struct ftrace_event_file *file; +	struct ftrace_probe_ops *ops; +	struct event_probe_data *data; +	const char *system; +	const char *event; +	char *number; +	bool enable; +	int ret; + +	/* hash funcs only work with set_ftrace_filter */ +	if (!enabled) +		return -EINVAL; + +	if (!param) +		return -EINVAL; + +	system = strsep(¶m, ":"); +	if (!param) +		return -EINVAL; + +	event = strsep(¶m, ":"); + +	mutex_lock(&event_mutex); + +	ret = -EINVAL; +	file = find_event_file(tr, system, event); +	if (!file) +		goto out; + +	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + +	if (enable) +		ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; +	else +		ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; + +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		ret = 0; +		goto out; +	} + +	ret = -ENOMEM; +	data = kzalloc(sizeof(*data), GFP_KERNEL); +	if (!data) +		goto out; + +	data->enable = enable; +	data->count = -1; +	data->file = file; + +	if (!param) +		goto out_reg; + +	number = strsep(¶m, ":"); + +	ret = -EINVAL; +	if (!strlen(number)) +		goto out_free; + +	/* +	 * We use the callback data field (which is a pointer) +	 * as our counter. 
+	 */ +	ret = kstrtoul(number, 0, &data->count); +	if (ret) +		goto out_free; + + out_reg: +	/* Don't let event modules unload while probe registered */ +	ret = try_module_get(file->event_call->mod); +	if (!ret) +		goto out_free; + +	ret = __ftrace_event_enable_disable(file, 1, 1); +	if (ret < 0) +		goto out_put; +	ret = register_ftrace_function_probe(glob, ops, data); +	if (!ret) +		goto out_disable; + out: +	mutex_unlock(&event_mutex); +	return ret; + + out_disable: +	__ftrace_event_enable_disable(file, 0, 1); + out_put: +	module_put(file->event_call->mod); + out_free: +	kfree(data); +	goto out; +} + +static struct ftrace_func_command event_enable_cmd = { +	.name			= ENABLE_EVENT_STR, +	.func			= event_enable_func, +}; + +static struct ftrace_func_command event_disable_cmd = { +	.name			= DISABLE_EVENT_STR, +	.func			= event_enable_func, +}; + +static __init int register_event_cmds(void) +{ +	int ret; + +	ret = register_ftrace_command(&event_enable_cmd); +	if (WARN_ON(ret < 0)) +		return ret; +	ret = register_ftrace_command(&event_disable_cmd); +	if (WARN_ON(ret < 0)) +		unregister_ftrace_command(&event_enable_cmd); +	return ret; +} +#else +static inline int register_event_cmds(void) { return 0; } +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/* + * The top level array has already had its ftrace_event_file + * descriptors created in order to allow for early events to + * be recorded. This function is called after the debugfs has been + * initialized, and we now have to create the files associated + * to the events. + */ +static __init void +__trace_early_add_event_dirs(struct trace_array *tr) +{ +	struct ftrace_event_file *file; +	int ret; + + +	list_for_each_entry(file, &tr->events, list) { +		ret = event_create_dir(tr->event_dir, file, +				       &ftrace_event_id_fops, +				       &ftrace_enable_fops, +				       &ftrace_event_filter_fops, +				       &ftrace_event_format_fops); +		if (ret < 0) +			pr_warning("Could not create directory for event %s\n", +				   file->event_call->name); +	} +} + +/* + * For early boot up, the top trace array requires to have + * a list of events that can be enabled. This must be done before + * the filesystem is set up in order to allow events to be traced + * early. + */ +static __init void +__trace_early_add_events(struct trace_array *tr) +{ +	struct ftrace_event_call *call; +	int ret; + +	list_for_each_entry(call, &ftrace_events, list) { +		/* Early boot up should not have any modules loaded */ +		if (WARN_ON_ONCE(call->mod)) +			continue; + +		ret = __trace_early_add_new_event(call, tr); +		if (ret < 0) +			pr_warning("Could not create early event %s\n", +				   call->name); +	} +} + +/* Remove the event directory structure for a trace directory. 
*/ +static void +__trace_remove_event_dirs(struct trace_array *tr) +{ +	struct ftrace_event_file *file, *next; + +	list_for_each_entry_safe(file, next, &tr->events, list) { +		list_del(&file->list); +		debugfs_remove_recursive(file->dir); +		remove_subsystem(file->system); +		kmem_cache_free(file_cachep, file); +	} +} + +static void +__add_event_to_tracers(struct ftrace_event_call *call, +		       struct ftrace_module_file_ops *file_ops) +{ +	struct trace_array *tr; + +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (file_ops) +			__trace_add_new_mod_event(call, tr, file_ops); +		else +			__trace_add_new_event(call, tr, +					      &ftrace_event_id_fops, +					      &ftrace_enable_fops, +					      &ftrace_event_filter_fops, +					      &ftrace_event_format_fops); +	} +} +  static struct notifier_block trace_module_nb = {  	.notifier_call = trace_module_notify,  	.priority = 0, @@ -1464,15 +2197,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;  static __init int setup_trace_event(char *str)  {  	strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); -	ring_buffer_expanded = 1; -	tracing_selftest_disabled = 1; +	ring_buffer_expanded = true; +	tracing_selftest_disabled = true;  	return 1;  }  __setup("trace_event=", setup_trace_event); +/* Expects to have event_mutex held when called */ +static int +create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) +{ +	struct dentry *d_events; +	struct dentry *entry; + +	entry = debugfs_create_file("set_event", 0644, parent, +				    tr, &ftrace_set_event_fops); +	if (!entry) { +		pr_warning("Could not create debugfs 'set_event' entry\n"); +		return -ENOMEM; +	} + +	d_events = debugfs_create_dir("events", parent); +	if (!d_events) { +		pr_warning("Could not create debugfs 'events' directory\n"); +		return -ENOMEM; +	} + +	/* ring buffer internal formats */ +	trace_create_file("header_page", 0444, d_events, +			  ring_buffer_print_page_header, +			  &ftrace_show_header_fops); + +	trace_create_file("header_event", 0444, d_events, +			  ring_buffer_print_entry_header, +			  &ftrace_show_header_fops); + +	trace_create_file("enable", 0644, d_events, +			  tr, &ftrace_tr_enable_fops); + +	tr->event_dir = d_events; + +	return 0; +} + +/** + * event_trace_add_tracer - add a instance of a trace_array to events + * @parent: The parent dentry to place the files/directories for events in + * @tr: The trace array associated with these events + * + * When a new instance is created, it needs to set up its events + * directory, as well as other files associated with events. It also + * creates the event hierachry in the @parent/events directory. + * + * Returns 0 on success. + */ +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ +	int ret; + +	mutex_lock(&event_mutex); + +	ret = create_event_toplevel_files(parent, tr); +	if (ret) +		goto out_unlock; + +	down_write(&trace_event_sem); +	__trace_add_event_dirs(tr); +	up_write(&trace_event_sem); + + out_unlock: +	mutex_unlock(&event_mutex); + +	return ret; +} + +/* + * The top trace array already had its file descriptors created. + * Now the files themselves need to be created. 
+ */ +static __init int +early_event_add_tracer(struct dentry *parent, struct trace_array *tr) +{ +	int ret; + +	mutex_lock(&event_mutex); + +	ret = create_event_toplevel_files(parent, tr); +	if (ret) +		goto out_unlock; + +	down_write(&trace_event_sem); +	__trace_early_add_event_dirs(tr); +	up_write(&trace_event_sem); + + out_unlock: +	mutex_unlock(&event_mutex); + +	return ret; +} + +int event_trace_del_tracer(struct trace_array *tr) +{ +	/* Disable any running events */ +	__ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + +	mutex_lock(&event_mutex); + +	down_write(&trace_event_sem); +	__trace_remove_event_dirs(tr); +	debugfs_remove_recursive(tr->event_dir); +	up_write(&trace_event_sem); + +	tr->event_dir = NULL; + +	mutex_unlock(&event_mutex); + +	return 0; +} + +static __init int event_trace_memsetup(void) +{ +	field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); +	file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); +	return 0; +} +  static __init int event_trace_enable(void)  { +	struct trace_array *tr = top_trace_array();  	struct ftrace_event_call **iter, *call;  	char *buf = bootup_event_buf;  	char *token; @@ -1486,6 +2339,14 @@ static __init int event_trace_enable(void)  			list_add(&call->list, &ftrace_events);  	} +	/* +	 * We need the top trace array to have a working set of trace +	 * points at early init, before the debug files and directories +	 * are created. Create the file entries now, and attach them +	 * to the actual file dentries later. +	 */ +	__trace_early_add_events(tr); +  	while (true) {  		token = strsep(&buf, ","); @@ -1494,73 +2355,43 @@ static __init int event_trace_enable(void)  		if (!*token)  			continue; -		ret = ftrace_set_clr_event(token, 1); +		ret = ftrace_set_clr_event(tr, token, 1);  		if (ret)  			pr_warn("Failed to enable trace event: %s\n", token);  	}  	trace_printk_start_comm(); +	register_event_cmds(); +  	return 0;  }  static __init int event_trace_init(void)  { -	struct ftrace_event_call *call; +	struct trace_array *tr;  	struct dentry *d_tracer;  	struct dentry *entry; -	struct dentry *d_events;  	int ret; +	tr = top_trace_array(); +  	d_tracer = tracing_init_dentry();  	if (!d_tracer)  		return 0;  	entry = debugfs_create_file("available_events", 0444, d_tracer, -				    NULL, &ftrace_avail_fops); +				    tr, &ftrace_avail_fops);  	if (!entry)  		pr_warning("Could not create debugfs "  			   "'available_events' entry\n"); -	entry = debugfs_create_file("set_event", 0644, d_tracer, -				    NULL, &ftrace_set_event_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'set_event' entry\n"); - -	d_events = event_trace_events_dir(); -	if (!d_events) -		return 0; - -	/* ring buffer internal formats */ -	trace_create_file("header_page", 0444, d_events, -			  ring_buffer_print_page_header, -			  &ftrace_show_header_fops); - -	trace_create_file("header_event", 0444, d_events, -			  ring_buffer_print_entry_header, -			  &ftrace_show_header_fops); - -	trace_create_file("enable", 0644, d_events, -			  NULL, &ftrace_system_enable_fops); -  	if (trace_define_common_fields())  		pr_warning("tracing: Failed to allocate common fields"); -	/* -	 * Early initialization already enabled ftrace event. -	 * Now it's only necessary to create the event directory. 
-	 */ -	list_for_each_entry(call, &ftrace_events, list) { - -		ret = event_create_dir(call, d_events, -				       &ftrace_event_id_fops, -				       &ftrace_enable_fops, -				       &ftrace_event_filter_fops, -				       &ftrace_event_format_fops); -		if (ret < 0) -			event_remove(call); -	} +	ret = early_event_add_tracer(d_tracer, tr); +	if (ret) +		return ret;  	ret = register_module_notifier(&trace_module_nb);  	if (ret) @@ -1568,6 +2399,7 @@ static __init int event_trace_init(void)  	return 0;  } +early_initcall(event_trace_memsetup);  core_initcall(event_trace_enable);  fs_initcall(event_trace_init); @@ -1627,13 +2459,20 @@ static __init void event_test_stuff(void)   */  static __init void event_trace_self_tests(void)  { +	struct ftrace_subsystem_dir *dir; +	struct ftrace_event_file *file;  	struct ftrace_event_call *call;  	struct event_subsystem *system; +	struct trace_array *tr;  	int ret; +	tr = top_trace_array(); +  	pr_info("Running tests on trace events:\n"); -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call;  		/* Only test those that have a probe */  		if (!call->class || !call->class->probe) @@ -1657,15 +2496,15 @@ static __init void event_trace_self_tests(void)  		 * If an event is already enabled, someone is using  		 * it and the self test should not be on.  		 */ -		if (call->flags & TRACE_EVENT_FL_ENABLED) { +		if (file->flags & FTRACE_EVENT_FL_ENABLED) {  			pr_warning("Enabled event during self test!\n");  			WARN_ON_ONCE(1);  			continue;  		} -		ftrace_event_enable_disable(call, 1); +		ftrace_event_enable_disable(file, 1);  		event_test_stuff(); -		ftrace_event_enable_disable(call, 0); +		ftrace_event_enable_disable(file, 0);  		pr_cont("OK\n");  	} @@ -1674,7 +2513,9 @@ static __init void event_trace_self_tests(void)  	pr_info("Running tests on trace event systems:\n"); -	list_for_each_entry(system, &event_subsystems, list) { +	list_for_each_entry(dir, &tr->systems, list) { + +		system = dir->subsystem;  		/* the ftrace system is special, skip it */  		if (strcmp(system->name, "ftrace") == 0) @@ -1682,7 +2523,7 @@ static __init void event_trace_self_tests(void)  		pr_info("Testing event system %s: ", system->name); -		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); +		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);  		if (WARN_ON_ONCE(ret)) {  			pr_warning("error enabling system %s\n",  				   system->name); @@ -1691,7 +2532,7 @@ static __init void event_trace_self_tests(void)  		event_test_stuff(); -		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); +		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);  		if (WARN_ON_ONCE(ret)) {  			pr_warning("error disabling system %s\n",  				   system->name); @@ -1706,7 +2547,7 @@ static __init void event_trace_self_tests(void)  	pr_info("Running tests on all trace events:\n");  	pr_info("Testing all events: "); -	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); +	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);  	if (WARN_ON_ONCE(ret)) {  		pr_warning("error enabling all events\n");  		return; @@ -1715,7 +2556,7 @@ static __init void event_trace_self_tests(void)  	event_test_stuff();  	/* reset sysname */ -	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); +	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);  	if (WARN_ON_ONCE(ret)) {  		pr_warning("error disabling all events\n");  		return; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 
e5b0ca8b8d4..a6361178de5 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system,  	mutex_unlock(&event_mutex);  } -static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) -{ -	struct ftrace_event_field *field; - -	list_for_each_entry(field, head, link) { -		if (!strcmp(field->name, name)) -			return field; -	} - -	return NULL; -} - -static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) -{ -	struct ftrace_event_field *field; -	struct list_head *head; - -	field = __find_event_field(&ftrace_common_fields, name); -	if (field) -		return field; - -	head = trace_get_fields(call); -	return __find_event_field(head, name); -} -  static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)  {  	stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); @@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,  		return NULL;  	} -	field = find_event_field(call, operand1); +	field = trace_find_event_field(call, operand1);  	if (!field) {  		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);  		return NULL; @@ -1907,16 +1880,17 @@ out_unlock:  	return err;  } -int apply_subsystem_event_filter(struct event_subsystem *system, +int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  				 char *filter_string)  { +	struct event_subsystem *system = dir->subsystem;  	struct event_filter *filter;  	int err = 0;  	mutex_lock(&event_mutex);  	/* Make sure the system still has events */ -	if (!system->nr_events) { +	if (!dir->nr_events) {  		err = -ENODEV;  		goto out_unlock;  	} diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e039906b037..d21a7467008 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void)		\  #undef FTRACE_ENTRY  #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\ -int									\ +static int __init							\  ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\  {									\  	struct struct_name field;					\ @@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\  #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\  			 regfn)						\  									\ -struct ftrace_event_class event_class_ftrace_##call = {			\ +struct ftrace_event_class __refdata event_class_ftrace_##call = {	\  	.system			= __stringify(TRACE_SYSTEM),		\  	.define_fields		= ftrace_define_fields_##call,		\  	.fields			= LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 60115252332..c4d6d719198 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void);  static int function_trace_init(struct trace_array *tr)  {  	func_trace = tr; -	tr->cpu = get_cpu(); +	tr->trace_buffer.cpu = get_cpu();  	put_cpu();  	tracing_start_cmdline_record(); @@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr)  static void function_trace_start(struct trace_array *tr)  { -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  }  /* Our option */ @@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,  		goto out;  	cpu = smp_processor_id(); -	data = 
tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	if (!atomic_read(&data->disabled)) {  		local_save_flags(flags);  		trace_function(tr, ip, parent_ip, flags, pc); @@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,  	 */  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) { @@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly =  };  #ifdef CONFIG_DYNAMIC_FTRACE -static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +static int update_count(void **data)  { -	long *count = (long *)data; - -	if (tracing_is_on()) -		return; +	unsigned long *count = (long *)data;  	if (!*count) -		return; +		return 0;  	if (*count != -1)  		(*count)--; -	tracing_on(); +	return 1;  }  static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)  { -	long *count = (long *)data; +	if (tracing_is_on()) +		return; + +	if (update_count(data)) +		tracing_on(); +} +static void +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +{  	if (!tracing_is_on())  		return; -	if (!*count) +	if (update_count(data)) +		tracing_off(); +} + +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (tracing_is_on())  		return; -	if (*count != -1) -		(*count)--; +	tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (!tracing_is_on()) +		return;  	tracing_off();  } -static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, -			 struct ftrace_probe_ops *ops, void *data); +/* + * Skip 4: + *   ftrace_stacktrace() + *   function_trace_probe_call() + *   ftrace_ops_list_func() + *   ftrace_call() + */ +#define STACK_SKIP 4 -static struct ftrace_probe_ops traceon_probe_ops = { -	.func			= ftrace_traceon, -	.print			= ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +{ +	trace_dump_stack(STACK_SKIP); +} -static struct ftrace_probe_ops traceoff_probe_ops = { -	.func			= ftrace_traceoff, -	.print			= ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (!tracing_is_on()) +		return; + +	if (update_count(data)) +		trace_dump_stack(STACK_SKIP); +}  static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, -			 struct ftrace_probe_ops *ops, void *data) +ftrace_probe_print(const char *name, struct seq_file *m, +		   unsigned long ip, void *data)  {  	long count = (long)data; -	seq_printf(m, "%ps:", (void *)ip); - -	if (ops == &traceon_probe_ops) -		seq_printf(m, "traceon"); -	else -		seq_printf(m, "traceoff"); +	seq_printf(m, "%ps:%s", (void *)ip, name);  	if (count == -1)  		seq_printf(m, ":unlimited\n"); @@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,  }  static int -ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) +ftrace_traceon_print(struct seq_file *m, unsigned long ip, +			 struct ftrace_probe_ops *ops, void *data)  { -	struct ftrace_probe_ops *ops; - -	/* we register both traceon and traceoff to this callback */ -	if (strcmp(cmd, "traceon") == 0) -		ops = &traceon_probe_ops; -	else -		ops = &traceoff_probe_ops; +	return 
ftrace_probe_print("traceon", m, ip, data); +} -	unregister_ftrace_function_probe_func(glob, ops); +static int +ftrace_traceoff_print(struct seq_file *m, unsigned long ip, +			 struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("traceoff", m, ip, data); +} -	return 0; +static int +ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, +			struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("stacktrace", m, ip, data);  } +static struct ftrace_probe_ops traceon_count_probe_ops = { +	.func			= ftrace_traceon_count, +	.print			= ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_count_probe_ops = { +	.func			= ftrace_traceoff_count, +	.print			= ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_count_probe_ops = { +	.func			= ftrace_stacktrace_count, +	.print			= ftrace_stacktrace_print, +}; + +static struct ftrace_probe_ops traceon_probe_ops = { +	.func			= ftrace_traceon, +	.print			= ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_probe_ops = { +	.func			= ftrace_traceoff, +	.print			= ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_probe_ops = { +	.func			= ftrace_stacktrace, +	.print			= ftrace_stacktrace_print, +}; +  static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, -			    char *glob, char *cmd, char *param, int enable) +ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, +			    struct ftrace_hash *hash, char *glob, +			    char *cmd, char *param, int enable)  { -	struct ftrace_probe_ops *ops;  	void *count = (void *)-1;  	char *number;  	int ret; @@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,  	if (!enable)  		return -EINVAL; -	if (glob[0] == '!') -		return ftrace_trace_onoff_unreg(glob+1, cmd, param); - -	/* we register both traceon and traceoff to this callback */ -	if (strcmp(cmd, "traceon") == 0) -		ops = &traceon_probe_ops; -	else -		ops = &traceoff_probe_ops; +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		return 0; +	}  	if (!param)  		goto out_reg; @@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,  	return ret < 0 ? ret : 0;  } +static int +ftrace_trace_onoff_callback(struct ftrace_hash *hash, +			    char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	/* we register both traceon and traceoff to this callback */ +	if (strcmp(cmd, "traceon") == 0) +		ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; +	else +		ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; + +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   param, enable); +} + +static int +ftrace_stacktrace_callback(struct ftrace_hash *hash, +			   char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	ops = param ? 
&stacktrace_count_probe_ops : &stacktrace_probe_ops; + +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   param, enable); +} +  static struct ftrace_func_command ftrace_traceon_cmd = {  	.name			= "traceon",  	.func			= ftrace_trace_onoff_callback, @@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {  	.func			= ftrace_trace_onoff_callback,  }; +static struct ftrace_func_command ftrace_stacktrace_cmd = { +	.name			= "stacktrace", +	.func			= ftrace_stacktrace_callback, +}; +  static int __init init_func_cmd_traceon(void)  {  	int ret; @@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void)  	ret = register_ftrace_command(&ftrace_traceon_cmd);  	if (ret)  		unregister_ftrace_command(&ftrace_traceoff_cmd); + +	ret = register_ftrace_command(&ftrace_stacktrace_cmd); +	if (ret) { +		unregister_ftrace_command(&ftrace_traceoff_cmd); +		unregister_ftrace_command(&ftrace_traceon_cmd); +	}  	return ret;  }  #else diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 39ada66389c..8388bc99f2e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr,  {  	struct ftrace_event_call *call = &event_funcgraph_entry;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ftrace_graph_ent_entry *entry;  	if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) {  		pc = preempt_count(); @@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr,  {  	struct ftrace_event_call *call = &event_funcgraph_exit;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ftrace_graph_ret_entry *entry;  	if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) {  		pc = preempt_count(); @@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter,  			 * We need to consume the current entry to see  			 * the next one.  			 
*/ -			ring_buffer_consume(iter->tr->buffer, iter->cpu, +			ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,  					    NULL, NULL); -			event = ring_buffer_peek(iter->tr->buffer, iter->cpu, +			event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,  						 NULL, NULL);  		} diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 713a2cac488..b19d065a28c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -32,7 +32,8 @@ enum {  static int trace_type __read_mostly; -static int save_lat_flag; +static int save_flags; +static bool function_enabled;  static void stop_irqsoff_tracer(struct trace_array *tr, int graph);  static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,  	if (!irqs_disabled_flags(*flags))  		return 0; -	*data = tr->data[cpu]; +	*data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&(*data)->disabled);  	if (likely(disabled == 1)) @@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)  		per_cpu(tracing_cpu, cpu) = 0;  	tracing_max_latency = 0; -	tracing_reset_online_cpus(irqsoff_trace); +	tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);  	return start_irqsoff_tracer(irqsoff_trace, set);  } @@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)  	if (per_cpu(tracing_cpu, cpu))  		return; -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	if (unlikely(!data) || atomic_read(&data->disabled))  		return; @@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)  	if (!tracer_enabled)  		return; -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	if (unlikely(!data) ||  	    !data->critical_start || atomic_read(&data->disabled)) @@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)  }  #endif /* CONFIG_PREEMPT_TRACER */ -static int start_irqsoff_tracer(struct trace_array *tr, int graph) +static int register_irqsoff_function(int graph, int set)  { -	int ret = 0; +	int ret; -	if (!graph) -		ret = register_ftrace_function(&trace_ops); -	else +	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ +	if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) +		return 0; + +	if (graph)  		ret = register_ftrace_graph(&irqsoff_graph_return,  					    &irqsoff_graph_entry); +	else +		ret = register_ftrace_function(&trace_ops); + +	if (!ret) +		function_enabled = true; + +	return ret; +} + +static void unregister_irqsoff_function(int graph) +{ +	if (!function_enabled) +		return; + +	if (graph) +		unregister_ftrace_graph(); +	else +		unregister_ftrace_function(&trace_ops); + +	function_enabled = false; +} + +static void irqsoff_function_set(int set) +{ +	if (set) +		register_irqsoff_function(is_graph(), 1); +	else +		unregister_irqsoff_function(is_graph()); +} + +static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) +{ +	if (mask & TRACE_ITER_FUNCTION) +		irqsoff_function_set(set); + +	return trace_keep_overwrite(tracer, mask, set); +} + +static int start_irqsoff_tracer(struct trace_array *tr, int graph) +{ +	int ret; + +	ret = register_irqsoff_function(graph, 0);  	if (!ret && tracing_is_enabled())  		tracer_enabled = 1; @@ -550,22 +596,22 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)  {  	tracer_enabled = 0; -	if (!graph) -		unregister_ftrace_function(&trace_ops); -	else -		
unregister_ftrace_graph(); +	unregister_irqsoff_function(graph);  }  static void __irqsoff_tracer_init(struct trace_array *tr)  { -	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; -	trace_flags |= TRACE_ITER_LATENCY_FMT; +	save_flags = trace_flags; + +	/* non overwrite screws up the latency tracers */ +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);  	tracing_max_latency = 0;  	irqsoff_trace = tr;  	/* make sure that the tracer is visible */  	smp_wmb(); -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	if (start_irqsoff_tracer(tr, is_graph()))  		printk(KERN_ERR "failed to start irqsoff tracer\n"); @@ -573,10 +619,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)  static void irqsoff_tracer_reset(struct trace_array *tr)  { +	int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; +	int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; +  	stop_irqsoff_tracer(tr, is_graph()); -	if (!save_lat_flag) -		trace_flags &= ~TRACE_ITER_LATENCY_FMT; +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);  }  static void irqsoff_tracer_start(struct trace_array *tr) @@ -609,6 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly =  	.print_line     = irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_irqsoff,  #endif @@ -642,6 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly =  	.print_line     = irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_preemptoff,  #endif @@ -677,6 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =  	.print_line     = irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_preemptirqsoff,  #endif diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 3c5c5dfea0b..bd90e1b0608 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)  	trace_init_global_iter(&iter);  	for_each_tracing_cpu(cpu) { -		atomic_inc(&iter.tr->data[cpu]->disabled); +		atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	}  	old_userobj = trace_flags; @@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)  	iter.iter_flags |= TRACE_FILE_LAT_FMT;  	iter.pos = -1; -	if (cpu_file == TRACE_PIPE_ALL_CPU) { +	if (cpu_file == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) {  			iter.buffer_iter[cpu] = -			ring_buffer_read_prepare(iter.tr->buffer, cpu); +			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);  			ring_buffer_read_start(iter.buffer_iter[cpu]);  			tracing_iter_reset(&iter, cpu);  		}  	} else {  		iter.cpu_file = cpu_file;  		iter.buffer_iter[cpu_file] = -			ring_buffer_read_prepare(iter.tr->buffer, cpu_file); +			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);  		ring_buffer_read_start(iter.buffer_iter[cpu_file]);  		tracing_iter_reset(&iter, cpu_file);  	} @@ -83,7 +83,7 @@ out:  	trace_flags = old_userobj;  	for_each_tracing_cpu(cpu) { -		atomic_dec(&iter.tr->data[cpu]->disabled); +		
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	}  	for_each_tracing_cpu(cpu) @@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)  		    !cpu_online(cpu_file))  			return KDB_BADINT;  	} else { -		cpu_file = TRACE_PIPE_ALL_CPU; +		cpu_file = RING_BUFFER_ALL_CPUS;  	}  	kdb_trap_printk++; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fd3c8aae55e..a5e8f4878bf 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)  	overrun_detected = false;  	prev_overruns = 0; -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  }  static int mmio_trace_init(struct trace_array *tr) @@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)  static unsigned long count_overruns(struct trace_iterator *iter)  {  	unsigned long cnt = atomic_xchg(&dropped_count, 0); -	unsigned long over = ring_buffer_overruns(iter->tr->buffer); +	unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);  	if (over > prev_overruns)  		cnt += over - prev_overruns; @@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,  				struct mmiotrace_rw *rw)  {  	struct ftrace_event_call *call = &event_mmiotrace_rw; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct trace_mmiotrace_rw *entry;  	int pc = preempt_count(); @@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,  void mmio_trace_rw(struct mmiotrace_rw *rw)  {  	struct trace_array *tr = mmio_trace_array; -	struct trace_array_cpu *data = tr->data[smp_processor_id()]; +	struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());  	__trace_mmiotrace_rw(tr, data, rw);  } @@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,  				struct mmiotrace_map *map)  {  	struct ftrace_event_call *call = &event_mmiotrace_map; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct trace_mmiotrace_map *entry;  	int pc = preempt_count(); @@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)  	struct trace_array_cpu *data;  	preempt_disable(); -	data = tr->data[smp_processor_id()]; +	data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());  	__trace_mmiotrace_map(tr, data, map);  	preempt_enable();  } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 697e88d1390..bb922d9ee51 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@  /* must be a power of 2 */  #define EVENT_HASHSIZE	128 -DECLARE_RWSEM(trace_event_mutex); +DECLARE_RWSEM(trace_event_sem);  static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; @@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)  	return ret;  } +enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) +{ +	struct trace_seq *s = &iter->seq; +	struct trace_entry *entry = iter->ent; +	struct bputs_entry *field; +	int ret; + +	trace_assign_type(field, entry); + +	ret = trace_seq_puts(s, field->str); +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE; + +	return TRACE_TYPE_HANDLED; +} +  enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)  {  	struct trace_seq *s = &iter->seq; @@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct 
trace_seq *p, const unsigned char *buf, int buf_len)  }  EXPORT_SYMBOL(ftrace_print_hex_seq); +int ftrace_raw_output_prep(struct trace_iterator *iter, +			   struct trace_event *trace_event) +{ +	struct ftrace_event_call *event; +	struct trace_seq *s = &iter->seq; +	struct trace_seq *p = &iter->tmp_seq; +	struct trace_entry *entry; +	int ret; + +	event = container_of(trace_event, struct ftrace_event_call, event); +	entry = iter->ent; + +	if (entry->type != event->event.type) { +		WARN_ON_ONCE(1); +		return TRACE_TYPE_UNHANDLED; +	} + +	trace_seq_init(p); +	ret = trace_seq_printf(s, "%s: ", event->name); +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE; + +	return 0; +} +EXPORT_SYMBOL(ftrace_raw_output_prep); +  #ifdef CONFIG_KRETPROBES  static inline const char *kretprobed(const char *name)  { @@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)  {  	unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;  	unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; -	unsigned long long abs_ts = iter->ts - iter->tr->time_start; +	unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;  	unsigned long long rel_ts = next_ts - iter->ts;  	struct trace_seq *s = &iter->seq; @@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list)  void trace_event_read_lock(void)  { -	down_read(&trace_event_mutex); +	down_read(&trace_event_sem);  }  void trace_event_read_unlock(void)  { -	up_read(&trace_event_mutex); +	up_read(&trace_event_sem);  }  /** @@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event)  	unsigned key;  	int ret = 0; -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	if (WARN_ON(!event))  		goto out; @@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event)  	ret = event->type;   out: -	up_write(&trace_event_mutex); +	up_write(&trace_event_sem);  	return ret;  }  EXPORT_SYMBOL_GPL(register_ftrace_event);  /* - * Used by module code with the trace_event_mutex held for write. + * Used by module code with the trace_event_sem held for write.   
*/  int __unregister_ftrace_event(struct trace_event *event)  { @@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event)   */  int unregister_ftrace_event(struct trace_event *event)  { -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	__unregister_ftrace_event(event); -	up_write(&trace_event_mutex); +	up_write(&trace_event_sem);  	return 0;  } @@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = {  	.funcs		= &trace_user_stack_funcs,  }; +/* TRACE_BPUTS */ +static enum print_line_t +trace_bputs_print(struct trace_iterator *iter, int flags, +		   struct trace_event *event) +{ +	struct trace_entry *entry = iter->ent; +	struct trace_seq *s = &iter->seq; +	struct bputs_entry *field; + +	trace_assign_type(field, entry); + +	if (!seq_print_ip_sym(s, field->ip, flags)) +		goto partial; + +	if (!trace_seq_puts(s, ": ")) +		goto partial; + +	if (!trace_seq_puts(s, field->str)) +		goto partial; + +	return TRACE_TYPE_HANDLED; + + partial: +	return TRACE_TYPE_PARTIAL_LINE; +} + + +static enum print_line_t +trace_bputs_raw(struct trace_iterator *iter, int flags, +		struct trace_event *event) +{ +	struct bputs_entry *field; +	struct trace_seq *s = &iter->seq; + +	trace_assign_type(field, iter->ent); + +	if (!trace_seq_printf(s, ": %lx : ", field->ip)) +		goto partial; + +	if (!trace_seq_puts(s, field->str)) +		goto partial; + +	return TRACE_TYPE_HANDLED; + + partial: +	return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_bputs_funcs = { +	.trace		= trace_bputs_print, +	.raw		= trace_bputs_raw, +}; + +static struct trace_event trace_bputs_event = { +	.type		= TRACE_BPUTS, +	.funcs		= &trace_bputs_funcs, +}; +  /* TRACE_BPRINT */  static enum print_line_t  trace_bprint_print(struct trace_iterator *iter, int flags, @@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = {  	&trace_wake_event,  	&trace_stack_event,  	&trace_user_stack_event, +	&trace_bputs_event,  	&trace_bprint_event,  	&trace_print_event,  	NULL diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index c038eba0492..127a9d8c835 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -5,6 +5,8 @@  #include "trace.h"  extern enum print_line_t +trace_print_bputs_msg_only(struct trace_iterator *iter); +extern enum print_line_t  trace_print_bprintk_msg_only(struct trace_iterator *iter);  extern enum print_line_t  trace_print_printk_msg_only(struct trace_iterator *iter); @@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);  /* used by module unregistering */  extern int __unregister_ftrace_event(struct trace_event *event); -extern struct rw_semaphore trace_event_mutex; +extern struct rw_semaphore trace_event_sem;  #define MAX_MEMHEX_BYTES	8  #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3374c792ccd..4e98e3b257a 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,  			   unsigned long flags, int pc)  {  	struct ftrace_event_call *call = &event_context_switch; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct ctx_switch_entry *entry; @@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n  	pc = preempt_count();  	local_irq_save(flags);  	cpu = 
raw_smp_processor_id(); -	data = ctx_trace->data[cpu]; +	data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);  	if (likely(!atomic_read(&data->disabled)))  		tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); @@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,  	struct ftrace_event_call *call = &event_wakeup;  	struct ring_buffer_event *event;  	struct ctx_switch_entry *entry; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,  					  sizeof(*entry), flags, pc); @@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)  	pc = preempt_count();  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = ctx_trace->data[cpu]; +	data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);  	if (likely(!atomic_read(&data->disabled)))  		tracing_sched_wakeup_trace(ctx_trace, wakee, current, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 75aa97fbe1a..fee77e15d81 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -36,7 +36,8 @@ static void __wakeup_reset(struct trace_array *tr);  static int wakeup_graph_entry(struct ftrace_graph_ent *trace);  static void wakeup_graph_return(struct ftrace_graph_ret *trace); -static int save_lat_flag; +static int save_flags; +static bool function_enabled;  #define TRACE_DISPLAY_GRAPH     1 @@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr,  	if (cpu != wakeup_current_cpu)  		goto out_enable; -	*data = tr->data[cpu]; +	*data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&(*data)->disabled);  	if (unlikely(disabled != 1))  		goto out; @@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly =  };  #endif /* CONFIG_FUNCTION_TRACER */ -static int start_func_tracer(int graph) +static int register_wakeup_function(int graph, int set)  {  	int ret; -	if (!graph) -		ret = register_ftrace_function(&trace_ops); -	else +	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ +	if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) +		return 0; + +	if (graph)  		ret = register_ftrace_graph(&wakeup_graph_return,  					    &wakeup_graph_entry); +	else +		ret = register_ftrace_function(&trace_ops); + +	if (!ret) +		function_enabled = true; + +	return ret; +} + +static void unregister_wakeup_function(int graph) +{ +	if (!function_enabled) +		return; + +	if (graph) +		unregister_ftrace_graph(); +	else +		unregister_ftrace_function(&trace_ops); + +	function_enabled = false; +} + +static void wakeup_function_set(int set) +{ +	if (set) +		register_wakeup_function(is_graph(), 1); +	else +		unregister_wakeup_function(is_graph()); +} + +static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) +{ +	if (mask & TRACE_ITER_FUNCTION) +		wakeup_function_set(set); + +	return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(int graph) +{ +	int ret; + +	ret = register_wakeup_function(graph, 0);  	if (!ret && tracing_is_enabled())  		tracer_enabled = 1; @@ -156,10 +202,7 @@ static void stop_func_tracer(int graph)  {  	tracer_enabled = 0; -	if (!graph) -		unregister_ftrace_function(&trace_ops); -	else -		unregister_ftrace_graph(); +	unregister_wakeup_function(graph);  }  #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore,  	/* disable local data, not wakeup_cpu data */  	cpu = 
raw_smp_processor_id(); -	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); +	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  	if (likely(disabled != 1))  		goto out; @@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore,  		goto out_unlock;  	/* The task we are waiting for is waking up */ -	data = wakeup_trace->data[wakeup_cpu]; +	data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);  	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);  	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); @@ -387,7 +430,7 @@ out_unlock:  	arch_spin_unlock(&wakeup_lock);  	local_irq_restore(flags);  out: -	atomic_dec(&wakeup_trace->data[cpu]->disabled); +	atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  }  static void __wakeup_reset(struct trace_array *tr) @@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr)  {  	unsigned long flags; -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	local_irq_save(flags);  	arch_spin_lock(&wakeup_lock); @@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  		return;  	pc = preempt_count(); -	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); +	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  	if (unlikely(disabled != 1))  		goto out; @@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  	local_save_flags(flags); -	data = wakeup_trace->data[wakeup_cpu]; +	data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);  	data->preempt_timestamp = ftrace_now(cpu);  	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); @@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  out_locked:  	arch_spin_unlock(&wakeup_lock);  out: -	atomic_dec(&wakeup_trace->data[cpu]->disabled); +	atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  }  static void start_wakeup_tracer(struct trace_array *tr) @@ -540,8 +583,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)  static int __wakeup_tracer_init(struct trace_array *tr)  { -	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; -	trace_flags |= TRACE_ITER_LATENCY_FMT; +	save_flags = trace_flags; + +	/* non overwrite screws up the latency tracers */ +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);  	tracing_max_latency = 0;  	wakeup_trace = tr; @@ -563,12 +609,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)  static void wakeup_tracer_reset(struct trace_array *tr)  { +	int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; +	int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; +  	stop_wakeup_tracer(tr);  	/* make sure we put back any tasks we are tracing */  	wakeup_reset(tr); -	if (!save_lat_flag) -		trace_flags &= ~TRACE_ITER_LATENCY_FMT; +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);  }  static void wakeup_tracer_start(struct trace_array *tr) @@ -594,6 +643,7 @@ static struct tracer wakeup_tracer __read_mostly =  	.print_line	= wakeup_print_line,  	.flags		= &tracer_flags,  	.set_flag	= wakeup_set_flag, +	.flag_changed	= wakeup_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_wakeup,  #endif @@ -615,6 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =  	.print_line	= wakeup_print_line,  	
.flags		= &tracer_flags,  	.set_flag	= wakeup_set_flag, +	.flag_changed	= wakeup_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_wakeup,  #endif diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 51c819c12c2..55e2cf66967 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)  	return 0;  } -static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) +static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)  {  	struct ring_buffer_event *event;  	struct trace_entry *entry;  	unsigned int loops = 0; -	while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { +	while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {  		entry = ring_buffer_event_data(event);  		/* @@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)   * Test the trace buffer to see if all the elements   * are still sane.   */ -static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)  {  	unsigned long flags, cnt = 0;  	int cpu, ret = 0; @@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)  	local_irq_save(flags);  	arch_spin_lock(&ftrace_max_lock); -	cnt = ring_buffer_entries(tr->buffer); +	cnt = ring_buffer_entries(buf->buffer);  	/*  	 * The trace_test_buffer_cpu runs a while loop to consume all data. @@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)  	 */  	tracing_off();  	for_each_possible_cpu(cpu) { -		ret = trace_test_buffer_cpu(tr, cpu); +		ret = trace_test_buffer_cpu(buf, cpu);  		if (ret)  			break;  	} @@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	msleep(100);  	/* we should have nothing in the buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	if (ret)  		goto out; @@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	ftrace_enabled = 0;  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	tracing_start();  	/* we should only have one item */ @@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  	ftrace_enabled = 0;  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  /* Maximum number of functions to trace before diagnosing a hang */  #define GRAPH_MAX_FUNC_TEST	100000000 -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);  static unsigned int graph_hang_thresh;  /* Wrap the real function entry probe to avoid possible hanging */ @@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)  	if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {  		ftrace_graph_stop();  		printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); -		if (ftrace_dump_on_oops) -			__ftrace_dump(false, DUMP_ALL); +		if (ftrace_dump_on_oops) { +			ftrace_dump(DUMP_ALL); +			/* ftrace_dump() disables tracing */ +			tracing_on(); +		}  		return 0;  	} @@ -737,7 +738,7 @@ 
trace_selftest_startup_function_graph(struct tracer *trace,  	 * Simulate the init() callback but we attach a watchdog callback  	 * to detect and recover from possible hangs  	 */ -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	set_graph_array(tr);  	ret = register_ftrace_graph(&trace_graph_return,  				    &trace_graph_entry_watchdog); @@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,  	tracing_stop();  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (ret)  		goto out; -	ret = trace_test_buffer(&max_tr, &count); +	ret = trace_test_buffer(&tr->max_buffer, &count);  	if (ret)  		goto out; @@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (ret)  		goto out; -	ret = trace_test_buffer(&max_tr, &count); +	ret = trace_test_buffer(&tr->max_buffer, &count);  	if (!ret && !count) {  		printk(KERN_CONT ".. no entries found .."); @@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	printk("ret = %d\n", ret);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr); @@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr  	/* stop the tracing. */  	tracing_stop();  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 42ca822fc70..b20428c5efe 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -20,13 +20,24 @@  #define STACK_TRACE_ENTRIES 500 +#ifdef CC_USING_FENTRY +# define fentry		1 +#else +# define fentry		0 +#endif +  static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =  	 { [0 ... 
(STACK_TRACE_ENTRIES)] = ULONG_MAX };  static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; +/* + * Reserve one entry for the passed in ip. This will allow + * us to remove most or all of the stack size overhead + * added by the stack tracer itself. + */  static struct stack_trace max_stack_trace = { -	.max_entries		= STACK_TRACE_ENTRIES, -	.entries		= stack_dump_trace, +	.max_entries		= STACK_TRACE_ENTRIES - 1, +	.entries		= &stack_dump_trace[1],  };  static unsigned long max_stack_size; @@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);  int stack_tracer_enabled;  static int last_stack_tracer_enabled; -static inline void check_stack(void) +static inline void +check_stack(unsigned long ip, unsigned long *stack)  {  	unsigned long this_size, flags;  	unsigned long *p, *top, *start; +	static int tracer_frame; +	int frame_size = ACCESS_ONCE(tracer_frame);  	int i; -	this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); +	this_size = ((unsigned long)stack) & (THREAD_SIZE-1);  	this_size = THREAD_SIZE - this_size; +	/* Remove the frame of the tracer */ +	this_size -= frame_size;  	if (this_size <= max_stack_size)  		return;  	/* we do not handle interrupt stacks yet */ -	if (!object_is_on_stack(&this_size)) +	if (!object_is_on_stack(stack))  		return;  	local_irq_save(flags);  	arch_spin_lock(&max_stack_lock); +	/* In case another CPU set the tracer_frame on us */ +	if (unlikely(!frame_size)) +		this_size -= tracer_frame; +  	/* a race could have already updated it */  	if (this_size <= max_stack_size)  		goto out; @@ -70,10 +90,18 @@ static inline void check_stack(void)  	save_stack_trace(&max_stack_trace);  	/* +	 * Add the passed in ip from the function tracer. +	 * Searching for this on the stack will skip over +	 * most of the overhead from the stack tracer itself. +	 */ +	stack_dump_trace[0] = ip; +	max_stack_trace.nr_entries++; + +	/*  	 * Now find where in the stack these are.  	 */  	i = 0; -	start = &this_size; +	start = stack;  	top = (unsigned long *)  		(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -97,6 +125,18 @@ static inline void check_stack(void)  				found = 1;  				/* Start the search from here */  				start = p + 1; +				/* +				 * We do not want to show the overhead +				 * of the stack tracer stack in the +				 * max stack. If we haven't figured +				 * out what that is, then figure it out +				 * now. +				 */ +				if (unlikely(!tracer_frame) && i == 1) { +					tracer_frame = (p - stack) * +						sizeof(unsigned long); +					max_stack_size -= tracer_frame; +				}  			}  		} @@ -113,6 +153,7 @@ static void  stack_trace_call(unsigned long ip, unsigned long parent_ip,  		 struct ftrace_ops *op, struct pt_regs *pt_regs)  { +	unsigned long stack;  	int cpu;  	preempt_disable_notrace(); @@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,  	if (per_cpu(trace_active, cpu)++ != 0)  		goto out; -	check_stack(); +	/* +	 * When fentry is used, the traced function does not get +	 * its stack frame set up, and we lose the parent. +	 * The ip is pretty useless because the function tracer +	 * was called before that function set up its stack frame. +	 * In this case, we use the parent ip. +	 * +	 * By adding the return address of either the parent ip +	 * or the current ip we can disregard most of the stack usage +	 * caused by the stack tracer itself. +	 * +	 * The function tracer always reports the address of where the +	 * mcount call was, but the stack will hold the return address. 
+	 */ +	if (fentry) +		ip = parent_ip; +	else +		ip += MCOUNT_INSN_SIZE; + +	check_stack(ip, &stack);   out:  	per_cpu(trace_active, cpu)--; @@ -322,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {  	.open = stack_trace_filter_open,  	.read = seq_read,  	.write = ftrace_filter_write, -	.llseek = ftrace_regex_lseek, +	.llseek = ftrace_filter_lseek,  	.release = ftrace_regex_release,  }; @@ -371,6 +431,8 @@ static __init int stack_trace_init(void)  	struct dentry *d_tracer;  	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0;  	trace_create_file("stack_max_size", 0644, d_tracer,  			&max_stack_size, &stack_max_size_fops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 96cffb269e7..847f88a6194 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -307,6 +307,8 @@ static int tracing_stat_init(void)  	struct dentry *d_tracing;  	d_tracing = tracing_init_dentry(); +	if (!d_tracing) +		return 0;  	stat_dir = debugfs_create_dir("trace_stat", d_tracing);  	if (!stat_dir) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7a809e32105..8f2ac73c7a5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -12,10 +12,6 @@  #include "trace.h"  static DEFINE_MUTEX(syscall_trace_lock); -static int sys_refcount_enter; -static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);  static int syscall_enter_register(struct ftrace_event_call *event,  				 enum trace_reg type, void *data); @@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name  	/*  	 * Only compare after the "sys" prefix. Archs that use  	 * syscall wrappers may have syscalls symbols aliases prefixed -	 * with "SyS" instead of "sys", leading to an unwanted +	 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted  	 * mismatch.  	 
*/  	return !strcmp(sym + 3, name + 3); @@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)  		kfree(call->print_fmt);  } -static int syscall_enter_define_fields(struct ftrace_event_call *call) +static int __init syscall_enter_define_fields(struct ftrace_event_call *call)  {  	struct syscall_trace_enter trace;  	struct syscall_metadata *meta = call->data; @@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)  	return ret;  } -static int syscall_exit_define_fields(struct ftrace_event_call *call) +static int __init syscall_exit_define_fields(struct ftrace_event_call *call)  {  	struct syscall_trace_exit trace;  	int ret; @@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)  	return ret;  } -static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  { +	struct trace_array *tr = data;  	struct syscall_trace_enter *entry;  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event; @@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return; -	if (!test_bit(syscall_nr, enabled_enter_syscalls)) +	if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))  		return;  	sys_data = syscall_nr_to_meta(syscall_nr); @@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; -	event = trace_current_buffer_lock_reserve(&buffer, +	buffer = tr->trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer,  			sys_data->enter_event->event.type, size, 0, 0);  	if (!event)  		return; @@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  		trace_current_buffer_unlock_commit(buffer, event, 0, 0);  } -static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)  { +	struct trace_array *tr = data;  	struct syscall_trace_exit *entry;  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event; @@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return; -	if (!test_bit(syscall_nr, enabled_exit_syscalls)) +	if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))  		return;  	sys_data = syscall_nr_to_meta(syscall_nr);  	if (!sys_data)  		return; -	event = trace_current_buffer_lock_reserve(&buffer, +	buffer = tr->trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer,  			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);  	if (!event)  		return; @@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  		trace_current_buffer_unlock_commit(buffer, event, 0, 0);  } -static int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_file *file, +				   struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int ret = 0;  	int num; @@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call)  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return -ENOSYS;  	mutex_lock(&syscall_trace_lock); -	if (!sys_refcount_enter) -		ret = 
register_trace_sys_enter(ftrace_syscall_enter, NULL); +	if (!tr->sys_refcount_enter) +		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);  	if (!ret) { -		set_bit(num, enabled_enter_syscalls); -		sys_refcount_enter++; +		set_bit(num, tr->enabled_enter_syscalls); +		tr->sys_refcount_enter++;  	}  	mutex_unlock(&syscall_trace_lock);  	return ret;  } -static void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_file *file, +				      struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int num;  	num = ((struct syscall_metadata *)call->data)->syscall_nr;  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return;  	mutex_lock(&syscall_trace_lock); -	sys_refcount_enter--; -	clear_bit(num, enabled_enter_syscalls); -	if (!sys_refcount_enter) -		unregister_trace_sys_enter(ftrace_syscall_enter, NULL); +	tr->sys_refcount_enter--; +	clear_bit(num, tr->enabled_enter_syscalls); +	if (!tr->sys_refcount_enter) +		unregister_trace_sys_enter(ftrace_syscall_enter, tr);  	mutex_unlock(&syscall_trace_lock);  } -static int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_file *file, +				  struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int ret = 0;  	int num; @@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call)  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return -ENOSYS;  	mutex_lock(&syscall_trace_lock); -	if (!sys_refcount_exit) -		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); +	if (!tr->sys_refcount_exit) +		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);  	if (!ret) { -		set_bit(num, enabled_exit_syscalls); -		sys_refcount_exit++; +		set_bit(num, tr->enabled_exit_syscalls); +		tr->sys_refcount_exit++;  	}  	mutex_unlock(&syscall_trace_lock);  	return ret;  } -static void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_file *file, +				     struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int num;  	num = ((struct syscall_metadata *)call->data)->syscall_nr;  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return;  	mutex_lock(&syscall_trace_lock); -	sys_refcount_exit--; -	clear_bit(num, enabled_exit_syscalls); -	if (!sys_refcount_exit) -		unregister_trace_sys_exit(ftrace_syscall_exit, NULL); +	tr->sys_refcount_exit--; +	clear_bit(num, tr->enabled_exit_syscalls); +	if (!tr->sys_refcount_exit) +		unregister_trace_sys_exit(ftrace_syscall_exit, tr);  	mutex_unlock(&syscall_trace_lock);  } @@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = {  	.trace		= print_syscall_exit,  }; -struct ftrace_event_class event_class_syscall_enter = { +struct ftrace_event_class __refdata event_class_syscall_enter = {  	.system		= "syscalls",  	.reg		= syscall_enter_register,  	.define_fields	= syscall_enter_define_fields, @@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = {  	.raw_init	= init_syscall_trace,  }; -struct ftrace_event_class event_class_syscall_exit = { +struct ftrace_event_class __refdata event_class_syscall_exit = {  	.system		= "syscalls",  	.reg		= syscall_exit_register,  	.define_fields	= syscall_exit_define_fields, @@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)  static int syscall_enter_register(struct ftrace_event_call *event,  				 enum trace_reg type, void *data)  { +	struct 
ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER: -		return reg_event_syscall_enter(event); +		return reg_event_syscall_enter(file, event);  	case TRACE_REG_UNREGISTER: -		unreg_event_syscall_enter(event); +		unreg_event_syscall_enter(file, event);  		return 0;  #ifdef CONFIG_PERF_EVENTS @@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,  static int syscall_exit_register(struct ftrace_event_call *event,  				 enum trace_reg type, void *data)  { +	struct ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER: -		return reg_event_syscall_exit(event); +		return reg_event_syscall_exit(file, event);  	case TRACE_REG_UNREGISTER: -		unreg_event_syscall_exit(event); +		unreg_event_syscall_exit(file, event);  		return 0;  #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 0c05a459204..29f26540e9c 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,  	int nr_probes = 0;  	struct tracepoint_func *old, *new; -	WARN_ON(!probe); +	if (WARN_ON(!probe)) +		return ERR_PTR(-EINVAL);  	debug_print_probes(entry);  	old = entry->funcs; @@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,  	debug_print_probes(entry);  	/* (N -> M), (N > 1, M >= 0) probes */ -	for (nr_probes = 0; old[nr_probes].func; nr_probes++) { -		if (!probe || -		    (old[nr_probes].func == probe && -		     old[nr_probes].data == data)) -			nr_del++; +	if (probe) { +		for (nr_probes = 0; old[nr_probes].func; nr_probes++) { +			if (old[nr_probes].func == probe && +			     old[nr_probes].data == data) +				nr_del++; +		}  	} +	/* +	 * If probe is NULL, then nr_probes = nr_del = 0, and then the +	 * entire entry will be removed. 
+	 */  	if (nr_probes - nr_del == 0) {  		/* N -> 0, (N > 1) */  		entry->funcs = NULL; @@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,  		if (new == NULL)  			return ERR_PTR(-ENOMEM);  		for (i = 0; old[i].func; i++) -			if (probe && -			    (old[i].func != probe || old[i].data != data)) +			if (old[i].func != probe || old[i].data != data)  				new[j++] = old[i];  		new[nr_probes - nr_del].func = NULL;  		entry->refcount = nr_probes - nr_del; diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03..8e635a18ab5 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = {  	.owner = GLOBAL_ROOT_UID,  	.group = GLOBAL_ROOT_GID,  	.proc_inum = PROC_USER_INIT_INO, +	.may_mount_sysfs = true, +	.may_mount_proc = true,  };  EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8b650837083..e134d8f365d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,10 +21,12 @@  #include <linux/uaccess.h>  #include <linux/ctype.h>  #include <linux/projid.h> +#include <linux/fs_struct.h>  static struct kmem_cache *user_ns_cachep __read_mostly; -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file, +				struct user_namespace *ns, int cap_setid,  				struct uid_gid_map *map);  static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) @@ -60,6 +62,15 @@ int create_user_ns(struct cred *new)  	kgid_t group = new->egid;  	int ret; +	/* +	 * Verify that we can not violate the policy of which files +	 * may be accessed that is specified by the root directory, +	 * by verifing that the root directory is at the root of the +	 * mount namespace which allows all files to be accessed. +	 */ +	if (current_chrooted()) +		return -EPERM; +  	/* The creator needs a mapping in the parent user namespace  	 * or else we won't be able to reasonably tell userspace who  	 * created a user_namespace. @@ -86,6 +97,8 @@ int create_user_ns(struct cred *new)  	set_cred_user_ns(new, ns); +	update_mnt_policy(ns); +  	return 0;  } @@ -600,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf,  	if (map->nr_extents != 0)  		goto out; -	/* Require the appropriate privilege CAP_SETUID or CAP_SETGID -	 * over the user namespace in order to set the id mapping. +	/* +	 * Adjusting namespace settings requires capabilities on the target.  	 */ -	if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) +	if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))  		goto out;  	/* Get a buffer */ @@ -688,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,  	ret = -EPERM;  	/* Validate the user is allowed to use user id's mapped to. 
*/ -	if (!new_idmap_permitted(ns, cap_setid, &new_map)) +	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))  		goto out;  	/* Map the lower ids from the parent user namespace to the @@ -775,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t  			 &ns->projid_map, &ns->parent->projid_map);  } -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file,  +				struct user_namespace *ns, int cap_setid,  				struct uid_gid_map *new_map)  {  	/* Allow mapping to your own filesystem ids */ @@ -783,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,  		u32 id = new_map->extent[0].lower_first;  		if (cap_setid == CAP_SETUID) {  			kuid_t uid = make_kuid(ns->parent, id); -			if (uid_eq(uid, current_fsuid())) +			if (uid_eq(uid, file->f_cred->fsuid))  				return true;  		}  		else if (cap_setid == CAP_SETGID) {  			kgid_t gid = make_kgid(ns->parent, id); -			if (gid_eq(gid, current_fsgid())) +			if (gid_eq(gid, file->f_cred->fsgid))  				return true;  		}  	} @@ -799,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,  	/* Allow the specified ids if we have the appropriate capability  	 * (CAP_SETUID or CAP_SETGID) over the parent user namespace. +	 * And the opener of the id file also had the appropriate capability.  	 */ -	if (ns_capable(ns->parent, cap_setid)) +	if (ns_capable(ns->parent, cap_setid) && +	    file_ns_capable(file, ns->parent, cap_setid))  		return true;  	return false; @@ -837,6 +853,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)  	if (atomic_read(&current->mm->mm_users) > 1)  		return -EINVAL; +	if (current->fs->users != 1) +		return -EINVAL; +  	if (!ns_capable(user_ns, CAP_SYS_ADMIN))  		return -EPERM; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811e..b48cd597145 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -457,11 +457,12 @@ static int worker_pool_assign_id(struct worker_pool *pool)  	int ret;  	mutex_lock(&worker_pool_idr_mutex); -	idr_pre_get(&worker_pool_idr, GFP_KERNEL); -	ret = idr_get_new(&worker_pool_idr, pool, &pool->id); +	ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); +	if (ret >= 0) +		pool->id = ret;  	mutex_unlock(&worker_pool_idr_mutex); -	return ret; +	return ret < 0 ? ret : 0;  }  /* @@ -3446,28 +3447,34 @@ static void wq_unbind_fn(struct work_struct *work)  		spin_unlock_irq(&pool->lock);  		mutex_unlock(&pool->assoc_mutex); -	} -	/* -	 * Call schedule() so that we cross rq->lock and thus can guarantee -	 * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary -	 * as scheduler callbacks may be invoked from other cpus. -	 */ -	schedule(); +		/* +		 * Call schedule() so that we cross rq->lock and thus can +		 * guarantee sched callbacks see the %WORKER_UNBOUND flag. +		 * This is necessary as scheduler callbacks may be invoked +		 * from other cpus. +		 */ +		schedule(); -	/* -	 * Sched callbacks are disabled now.  Zap nr_running.  After this, -	 * nr_running stays zero and need_more_worker() and keep_working() -	 * are always true as long as the worklist is not empty.  Pools on -	 * @cpu now behave as unbound (in terms of concurrency management) -	 * pools which are served by workers tied to the CPU. -	 * -	 * On return from this function, the current worker would trigger -	 * unbound chain execution of pending work items if other workers -	 * didn't already. 
-	 */ -	for_each_std_worker_pool(pool, cpu) +		/* +		 * Sched callbacks are disabled now.  Zap nr_running. +		 * After this, nr_running stays zero and need_more_worker() +		 * and keep_working() are always true as long as the +		 * worklist is not empty.  This pool now behaves as an +		 * unbound (in terms of concurrency management) pool which +		 * are served by workers tied to the pool. +		 */  		atomic_set(&pool->nr_running, 0); + +		/* +		 * With concurrency management just turned off, a busy +		 * worker blocking could lead to lengthy stalls.  Kick off +		 * unbound chain execution of currently pending work items. +		 */ +		spin_lock_irq(&pool->lock); +		wake_up_worker(pool); +		spin_unlock_irq(&pool->lock); +	}  }  /*  |