47 files changed, 1860 insertions, 704 deletions
diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 753f4de4b17..35a78bc6651 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -324,7 +324,7 @@ output. To see what is available, simply cat the file:    cat /debug/tracing/trace_options    print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ - noblock nostacktrace nosched-tree + noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj  To disable one of the options, echo in the option prepended with "no". @@ -378,6 +378,20 @@ Here are the available options:  		When a trace is recorded, so is the stack of functions.  		This allows for back traces of trace sites. +  userstacktrace - This option changes the trace: it records a stacktrace +		   of the current userspace thread. + +  sym-userobj - when user stacktraces are enabled, look up which object the +		address belongs to, and print a relative address. +		This is especially useful when ASLR is on; otherwise you don't +		get a chance to resolve the address to an object/file/line after the app is no +		longer running. + +		The lookup is performed when you read trace, trace_pipe, or latency_trace. Example: + +		a.out-1623  [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 +x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] +    sched-tree - TBD (any users??) diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt index 5bbbe209622..cde23b4a12a 100644 --- a/Documentation/tracers/mmiotrace.txt +++ b/Documentation/tracers/mmiotrace.txt @@ -37,7 +37,7 @@ $ echo mmiotrace > /debug/tracing/current_tracer  $ cat /debug/tracing/trace_pipe > mydump.txt &  Start X or whatever.  $ echo "X is up" > /debug/tracing/trace_marker -$ echo none > /debug/tracing/current_tracer +$ echo nop > /debug/tracing/current_tracer  Check for lost events. @@ -66,7 +66,7 @@ which action. It is recommended to place descriptive markers about what you  do.  Shut down mmiotrace (requires root privileges): -$ echo none > /debug/tracing/current_tracer +$ echo nop > /debug/tracing/current_tracer  The 'cat' process exits. If it does not, kill it by issuing 'fg' command and  pressing ctrl+c. @@ -81,7 +81,9 @@ are:  $ cat /debug/tracing/trace_entries  gives you a number. Approximately double this number and write it back, for  instance: +$ echo 0 > /debug/tracing/tracing_enabled  $ echo 128000 > /debug/tracing/trace_entries +$ echo 1 > /debug/tracing/tracing_enabled  Then start again from the top.  If you are doing a trace for a driver project, e.g. Nouveau, you should also
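The userstacktrace and sym-userobj options documented above are enabled like any other trace option, by echoing the option name into trace_options. A brief usage sketch (illustrative only; it assumes debugfs is mounted at /debug, as in the surrounding examples):

$ echo userstacktrace > /debug/tracing/trace_options
$ echo sym-userobj > /debug/tracing/trace_options
$ cat /debug/tracing/trace_pipe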
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index b298f7a631e..e5f2ae8362f 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -7,7 +7,19 @@  #ifndef __ASSEMBLY__  extern void _mcount(void); -#endif + +#ifdef CONFIG_DYNAMIC_FTRACE +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ +       /* relocation of the mcount call site is the same as the address */ +       return addr; +} + +struct dyn_arch_ftrace { +	struct module *mod; +}; +#endif /*  CONFIG_DYNAMIC_FTRACE */ +#endif /* __ASSEMBLY__ */  #endif diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h index e5f14b13ccf..08454880a2c 100644 --- a/arch/powerpc/include/asm/module.h +++ b/arch/powerpc/include/asm/module.h @@ -34,11 +34,19 @@ struct mod_arch_specific {  #ifdef __powerpc64__  	unsigned int stubs_section;	/* Index of stubs section in module */  	unsigned int toc_section;	/* What section is the TOC? */ -#else +#ifdef CONFIG_DYNAMIC_FTRACE +	unsigned long toc; +	unsigned long tramp; +#endif + +#else /* powerpc64 */  	/* Indices of PLT sections within module. */  	unsigned int core_plt_section;  	unsigned int init_plt_section; +#ifdef CONFIG_DYNAMIC_FTRACE +	unsigned long tramp;  #endif +#endif /* powerpc64 */  	/* List of BUG addresses, source line numbers and filenames */  	struct list_head bug_list; @@ -68,6 +76,12 @@ struct mod_arch_specific {  #    endif	/* MODULE */  #endif +#ifdef CONFIG_DYNAMIC_FTRACE +#    ifdef MODULE +	asm(".section .ftrace.tramp,\"ax\",@nobits; .align 3; .previous"); +#    endif	/* MODULE */ +#endif +  struct exception_table_entry;  void sort_ex_table(struct exception_table_entry *start, diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index f4b006ed0ab..3271cd698e4 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -9,22 +9,30 @@  #include <linux/spinlock.h>  #include <linux/hardirq.h> +#include <linux/uaccess.h> +#include <linux/module.h>  #include <linux/ftrace.h>  #include <linux/percpu.h>  #include <linux/init.h>  #include <linux/list.h>  #include <asm/cacheflush.h> +#include <asm/code-patching.h>  #include <asm/ftrace.h> +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , ...)
do { } while (0) +#endif -static unsigned int ftrace_nop = 0x60000000; +static unsigned int ftrace_nop = PPC_NOP_INSTR;  #ifdef CONFIG_PPC32  # define GET_ADDR(addr) addr  #else  /* PowerPC64's functions are data that points to the functions */ -# define GET_ADDR(addr) *(unsigned long *)addr +# define GET_ADDR(addr) (*(unsigned long *)addr)  #endif @@ -33,12 +41,12 @@ static unsigned int ftrace_calc_offset(long ip, long addr)  	return (int)(addr - ip);  } -unsigned char *ftrace_nop_replace(void) +static unsigned char *ftrace_nop_replace(void)  {  	return (char *)&ftrace_nop;  } -unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)  {  	static unsigned int op; @@ -68,49 +76,434 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)  # define _ASM_PTR	" .long "  #endif -int +static int  ftrace_modify_code(unsigned long ip, unsigned char *old_code,  		   unsigned char *new_code)  { -	unsigned replaced; -	unsigned old = *(unsigned *)old_code; -	unsigned new = *(unsigned *)new_code; -	int faulted = 0; +	unsigned char replaced[MCOUNT_INSN_SIZE];  	/*  	 * Note: Due to modules and __init, code can  	 *  disappear and change, we need to protect against faulting -	 *  as well as code changing. +	 *  as well as code changing. We do this by using the +	 *  probe_kernel_* functions. +	 *  	 * No real locking needed, this code is run through -	 * kstop_machine. +	 * kstop_machine, or before SMP starts. +	 */ + +	/* read the text we want to modify */ +	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) +		return -EFAULT; + +	/* Make sure it is what we expect it to be */ +	if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) +		return -EINVAL; + +	/* replace the text with the new text */ +	if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) +		return -EPERM; + +	flush_icache_range(ip, ip + 8); + +	return 0; +} + +/* + * Helper functions that are the same for both PPC64 and PPC32. + */ +static int test_24bit_addr(unsigned long ip, unsigned long addr) +{ +	long diff; + +	/* +	 * Can we get to addr from ip in 24 bits? +	 *  (26 really, since we multiply by 4 for 4 byte alignment) +	 */ +	diff = addr - ip; + +	/* +	 * Return true if diff is less than 1 << 25 +	 *  and greater than -1 << 26. +	 */ +	return (diff < (1 << 25)) && (diff > (-1 << 26)); +} + +static int is_bl_op(unsigned int op) +{ +	return (op & 0xfc000003) == 0x48000001; +} + +static int test_offset(unsigned long offset) +{ +	return (offset + 0x2000000 > 0x3ffffff) || ((offset & 3) != 0); +} + +static unsigned long find_bl_target(unsigned long ip, unsigned int op) +{ +	static int offset; + +	offset = (op & 0x03fffffc); +	/* make it signed */ +	if (offset & 0x02000000) +		offset |= 0xfe000000; + +	return ip + (long)offset; +} + +static unsigned int branch_offset(unsigned long offset) +{ +	/* return "bl ip+offset" */ +	return 0x48000001 | (offset & 0x03fffffc); +}
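As a quick sanity check on the encoding helpers above, a small worked example (editorial sketch, not part of the patch; the address is made up):

	unsigned long ip = 0xc0000000;			/* hypothetical call-site address */
	unsigned int op = branch_offset(0x100);		/* 0x48000001 | 0x100 == 0x48000101, i.e. "bl ip+0x100" */
	unsigned long target = find_bl_target(ip, op);	/* recovers ip + 0x100; is_bl_op(op) is true */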
+ +#ifdef CONFIG_PPC64 +static int +__ftrace_make_nop(struct module *mod, +		  struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned char replaced[MCOUNT_INSN_SIZE * 2]; +	unsigned int *op = (unsigned *)&replaced; +	unsigned char jmp[8]; +	unsigned long *ptr = (unsigned long *)&jmp; +	unsigned long ip = rec->ip; +	unsigned long tramp; +	int offset; + +	/* read where this goes */ +	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) +		return -EFAULT; + +	/* Make sure that this is still a 24-bit jump */ +	if (!is_bl_op(*op)) { +		printk(KERN_ERR "Not expected bl: opcode is %x\n", *op); +		return -EINVAL; +	} + +	/* let's find where the pointer goes */ +	tramp = find_bl_target(ip, *op); + +	/* +	 * On PPC64 the trampoline looks like: +	 * 0x3d, 0x82, 0x00, 0x00,    addis   r12,r2, <high> +	 * 0x39, 0x8c, 0x00, 0x00,    addi    r12,r12, <low> +	 *   Where the bytes 2,3,6 and 7 make up the 32bit offset +	 *   to the TOC entry that holds the pointer to jump to. +	 * 0xf8, 0x41, 0x00, 0x28,    std     r2,40(r1) +	 * 0xe9, 0x6c, 0x00, 0x20,    ld      r11,32(r12) +	 *   The actual address is 32 bytes from the offset +	 *   into the TOC. +	 * 0xe8, 0x4c, 0x00, 0x28,    ld      r2,40(r12) +	 */ + +	DEBUGP("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc); + +	/* Find where the trampoline jumps to */ +	if (probe_kernel_read(jmp, (void *)tramp, 8)) { +		printk(KERN_ERR "Failed to read %lx\n", tramp); +		return -EFAULT; +	} + +	DEBUGP(" %08x %08x", +	       (unsigned)(*ptr >> 32), +	       (unsigned)*ptr); + +	offset = (unsigned)jmp[2] << 24 | +		(unsigned)jmp[3] << 16 | +		(unsigned)jmp[6] << 8 | +		(unsigned)jmp[7]; + +	DEBUGP(" %x ", offset); + +	/* get the address this jumps to */ +	tramp = mod->arch.toc + offset + 32; +	DEBUGP("toc: %lx", tramp); + +	if (probe_kernel_read(jmp, (void *)tramp, 8)) { +		printk(KERN_ERR "Failed to read %lx\n", tramp); +		return -EFAULT; +	} + +	DEBUGP(" %08x %08x\n", +	       (unsigned)(*ptr >> 32), +	       (unsigned)*ptr); + +	/* This should match what was called */ +	if (*ptr != GET_ADDR(addr)) { +		printk(KERN_ERR "addr does not match %lx\n", *ptr); +		return -EINVAL; +	} + +	/* +	 * We want to nop the line, but the next line is +	 *  0xe8, 0x41, 0x00, 0x28   ld r2,40(r1) +	 * This needs to be turned to a nop too. +	 */ +	if (probe_kernel_read(replaced, (void *)(ip+4), MCOUNT_INSN_SIZE)) +		return -EFAULT; + +	if (*op != 0xe8410028) { +		printk(KERN_ERR "Next line is not ld! (%08x)\n", *op); +		return -EINVAL; +	} + +	/* +	 * Milton Miller pointed out that we cannot blindly do nops. +	 * If a task was preempted when calling a trace function, +	 * the nops will remove the way to restore the TOC in r2 +	 * and the r2 TOC will get corrupted.
+	 */ + +	/* +	 * Replace: +	 *   bl <tramp>  <==== will be replaced with "b 1f" +	 *   ld r2,40(r1) +	 *  1: +	 */ +	op[0] = 0x48000008;	/* b +8 */ + +	if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) +		return -EPERM; + +	return 0; +} + +#else /* !PPC64 */ +static int +__ftrace_make_nop(struct module *mod, +		  struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned char replaced[MCOUNT_INSN_SIZE]; +	unsigned int *op = (unsigned *)&replaced; +	unsigned char jmp[8]; +	unsigned int *ptr = (unsigned int *)&jmp; +	unsigned long ip = rec->ip; +	unsigned long tramp; +	int offset; + +	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) +		return -EFAULT; + +	/* Make sure that this is still a 24-bit jump */ +	if (!is_bl_op(*op)) { +		printk(KERN_ERR "Not expected bl: opcode is %x\n", *op); +		return -EINVAL; +	} + +	/* let's find where the pointer goes */ +	tramp = find_bl_target(ip, *op); + +	/* +	 * On PPC32 the trampoline looks like: +	 * lis r11,sym@ha +	 * addi r11,r11,sym@l +	 * mtctr r11 +	 * bctr +	 */ + +	DEBUGP("ip:%lx jumps to %lx", ip, tramp); + +	/* Find where the trampoline jumps to */ +	if (probe_kernel_read(jmp, (void *)tramp, 8)) { +		printk(KERN_ERR "Failed to read %lx\n", tramp); +		return -EFAULT; +	} + +	DEBUGP(" %08x %08x ", ptr[0], ptr[1]); + +	tramp = (ptr[1] & 0xffff) | +		((ptr[0] & 0xffff) << 16); +	if (tramp & 0x8000) +		tramp -= 0x10000; + +	DEBUGP(" %x ", tramp); + +	if (tramp != addr) { +		printk(KERN_ERR +		       "Trampoline location %08lx does not match addr\n", +		       tramp); +		return -EINVAL; +	} + +	op[0] = PPC_NOP_INSTR; + +	if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) +		return -EPERM; + +	return 0; +} +#endif /* PPC64 */ + +int ftrace_make_nop(struct module *mod, +		    struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned char *old, *new; +	unsigned long ip = rec->ip; + +	/* +	 * If the calling address is more than 24 bits away, +	 * then we had to use a trampoline to make the call. +	 * Otherwise just update the call site.  	 */ -	asm volatile ( -		"1: lwz		%1, 0(%2)\n" -		"   cmpw	%1, %5\n" -		"   bne		2f\n" -		"   stwu	%3, 0(%2)\n" -		"2:\n" -		".section .fixup, \"ax\"\n" -		"3:	li %0, 1\n" -		"	b 2b\n" -		".previous\n" -		".section __ex_table,\"a\"\n" -		_ASM_ALIGN "\n" -		_ASM_PTR "1b, 3b\n" -		".previous" -		: "=r"(faulted), "=r"(replaced) -		: "r"(ip), "r"(new), -		  "0"(faulted), "r"(old) -		: "memory"); +	if (test_24bit_addr(ip, addr)) { +		/* within range */ +		old = ftrace_call_replace(ip, addr); +		new = ftrace_nop_replace(); +		return ftrace_modify_code(ip, old, new); +	} -	if (replaced != old && replaced != new) -		faulted = 2; +	/* +	 * Out of range jumps are called from modules. +	 * We should either already have a pointer to the module +	 * or it has been passed in.
+	 */ +	if (!rec->arch.mod) { +		if (!mod) { +			printk(KERN_ERR "No module loaded addr=%lx\n", +			       addr); +			return -EFAULT; +		} +		rec->arch.mod = mod; +	} else if (mod) { +		if (mod != rec->arch.mod) { +			printk(KERN_ERR +			       "Record mod %p not equal to passed in mod %p\n", +			       rec->arch.mod, mod); +			return -EINVAL; +		} +		/* nothing to do if mod == rec->arch.mod */ +	} else +		mod = rec->arch.mod; -	if (!faulted) -		flush_icache_range(ip, ip + 8); +	return __ftrace_make_nop(mod, rec, addr); -	return faulted; +} + +#ifdef CONFIG_PPC64 +static int +__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned char replaced[MCOUNT_INSN_SIZE * 2]; +	unsigned int *op = (unsigned *)&replaced; +	unsigned long ip = rec->ip; +	unsigned long offset; + +	/* read where this goes */ +	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE * 2)) +		return -EFAULT; + +	/* +	 * It should be pointing to two nops or +	 *  b +8; ld r2,40(r1) +	 */ +	if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) && +	    ((op[0] != PPC_NOP_INSTR) || (op[1] != PPC_NOP_INSTR))) { +		printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]); +		return -EINVAL; +	} + +	/* If we never set up a trampoline to ftrace_caller, then bail */ +	if (!rec->arch.mod->arch.tramp) { +		printk(KERN_ERR "No ftrace trampoline\n"); +		return -EINVAL; +	} + +	/* now calculate a jump to the ftrace caller trampoline */ +	offset = rec->arch.mod->arch.tramp - ip; + +	if (test_offset(offset)) { +		printk(KERN_ERR "REL24 %li out of range!\n", +		       (long int)offset); +		return -EINVAL; +	} + +	/* Set to "bl addr" */ +	op[0] = branch_offset(offset); +	/* ld r2,40(r1) */ +	op[1] = 0xe8410028; + +	DEBUGP("write to %lx\n", rec->ip); + +	if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE * 2)) +		return -EPERM; + +	return 0; +} +#else +static int +__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned char replaced[MCOUNT_INSN_SIZE]; +	unsigned int *op = (unsigned *)&replaced; +	unsigned long ip = rec->ip; +	unsigned long offset; + +	/* read where this goes */ +	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) +		return -EFAULT; + +	/* It should be pointing to a nop */ +	if (op[0] != PPC_NOP_INSTR) { +		printk(KERN_ERR "Expected NOP but have %x\n", op[0]); +		return -EINVAL; +	} + +	/* If we never set up a trampoline to ftrace_caller, then bail */ +	if (!rec->arch.mod->arch.tramp) { +		printk(KERN_ERR "No ftrace trampoline\n"); +		return -EINVAL; +	} + +	/* now calculate a jump to the ftrace caller trampoline */ +	offset = rec->arch.mod->arch.tramp - ip; + +	if (test_offset(offset)) { +		printk(KERN_ERR "REL24 %li out of range!\n", +		       (long int)offset); +		return -EINVAL; +	} + +	/* Set to "bl addr" */ +	op[0] = branch_offset(offset); + +	DEBUGP("write to %lx\n", rec->ip); + +	if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) +		return -EPERM; + +	return 0; +} +#endif /* CONFIG_PPC64 */ + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned char *old, *new; +	unsigned long ip = rec->ip; + +	/* +	 * If the calling address is more than 24 bits away, +	 * then we had to use a trampoline to make the call. +	 * Otherwise just update the call site. +	 */ +	if (test_24bit_addr(ip, addr)) { +		/* within range */ +		old = ftrace_nop_replace(); +		new = ftrace_call_replace(ip, addr); +		return ftrace_modify_code(ip, old, new); +	} + +	/* +	 * Out of range jumps are called from modules.
+	 * Being that we are converting from nop, it had better +	 * already have a module defined. +	 */ +	if (!rec->arch.mod) { +		printk(KERN_ERR "No module loaded\n"); +		return -EINVAL; +	} + +	return __ftrace_make_call(rec, addr);  }  int ftrace_update_ftrace_func(ftrace_func_t func) @@ -128,10 +521,10 @@ int ftrace_update_ftrace_func(ftrace_func_t func)  int __init ftrace_dyn_arch_init(void *data)  { -	/* This is running in kstop_machine */ +	/* caller expects data to be zero */ +	unsigned long *p = data; -	ftrace_mcount_set(data); +	*p = 0;  	return 0;  } - diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 31982d05d81..88d9c1d5e5f 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -69,10 +69,15 @@ void cpu_idle(void)  				smp_mb();  				local_irq_disable(); +				/* Don't trace irqs off for idle */ +				stop_critical_timings(); +  				/* check again after disabling irqs */  				if (!need_resched() && !cpu_should_die())  					ppc_md.power_save(); +				start_critical_timings(); +  				local_irq_enable();  				set_thread_flag(TIF_POLLING_NRFLAG); diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index 2df91a03462..f832773fc28 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -22,6 +22,7 @@  #include <linux/fs.h>  #include <linux/string.h>  #include <linux/kernel.h> +#include <linux/ftrace.h>  #include <linux/cache.h>  #include <linux/bug.h>  #include <linux/sort.h> @@ -53,6 +54,9 @@ static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)  			r_addend = rela[i].r_addend;  		} +#ifdef CONFIG_DYNAMIC_FTRACE +	_count_relocs++;	/* add one for ftrace_caller */ +#endif  	return _count_relocs;  } @@ -306,5 +310,11 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,  			return -ENOEXEC;  		}  	} +#ifdef CONFIG_DYNAMIC_FTRACE +	module->arch.tramp = +		do_plt_call(module->module_core, +			    (unsigned long)ftrace_caller, +			    sechdrs, module); +#endif  	return 0;  } diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 1af2377e499..8992b031a7b 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -20,6 +20,7 @@  #include <linux/moduleloader.h>  #include <linux/err.h>  #include <linux/vmalloc.h> +#include <linux/ftrace.h>  #include <linux/bug.h>  #include <asm/module.h>  #include <asm/firmware.h> @@ -163,6 +164,11 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,  		}  	} +#ifdef CONFIG_DYNAMIC_FTRACE +	/* make the trampoline to the ftrace_caller */ +	relocs++; +#endif +  	DEBUGP("Looks like a total of %lu stubs, max\n", relocs);  	return relocs * sizeof(struct ppc64_stub_entry);  } @@ -441,5 +447,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,  		}  	} +#ifdef CONFIG_DYNAMIC_FTRACE +	me->arch.toc = my_r2(sechdrs, me); +	me->arch.tramp = stub_for_addr(sechdrs, +				       (unsigned long)ftrace_caller, +				       me); +#endif +  	return 0;  } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7a146baaa99..e49a4fd718f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -36,6 +36,7 @@ config X86  	select HAVE_ARCH_TRACEHOOK  	select HAVE_GENERIC_DMA_COHERENT if X86_32  	select HAVE_EFFICIENT_UNALIGNED_ACCESS +	select USER_STACKTRACE_SUPPORT  config ARCH_DEFCONFIG  	string diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b815664fe37..85a78575956 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -515,6 +515,7 @@ config CPU_SUP_UMC_32  config X86_DS  	def_bool 
X86_PTRACE_BTS  	depends on X86_DEBUGCTLMSR +	select HAVE_HW_BRANCH_TRACER  config X86_PTRACE_BTS  	bool "Branch Trace Store" diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index 0be77b39328..7e8e8b25f5f 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -74,7 +74,7 @@ static int kbd_pending(void)  {  	u8 pending;  	asm volatile("int $0x16; setnz %0" -		     : "=rm" (pending) +		     : "=qm" (pending)  		     : "a" (0x0100));  	return pending;  } diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 72c5a190bf4..99b6c39774a 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -7,13 +7,12 @@   *   * It manages:   * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done)   * - buffer access   *   * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks   *   *   * Copyright (C) 2007-2008 Intel Corporation. @@ -23,13 +22,21 @@  #ifndef _ASM_X86_DS_H  #define _ASM_X86_DS_H -#ifdef CONFIG_X86_DS  #include <linux/types.h>  #include <linux/init.h> +#include <linux/err.h> + +#ifdef CONFIG_X86_DS  struct task_struct; +struct ds_tracer; +struct bts_tracer; +struct pebs_tracer; + +typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); +typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);  /*   * Request BTS or PEBS @@ -37,60 +44,62 @@ struct task_struct;   * Due to alignement constraints, the actual buffer may be slightly   * smaller than the requested or provided buffer.   * - * Returns 0 on success; -Eerrno otherwise + * Returns a pointer to a tracer structure on success, or + * ERR_PTR(errcode) on failure. + * + * The interrupt threshold is independent from the overflow callback + * to allow users to use their own overflow interrupt handling mechanism.   *   * task: the task to request recording for;   *       NULL for per-cpu recording on the current cpu   * base: the base pointer for the (non-pageable) buffer; - *       NULL if buffer allocation requested - * size: the size of the requested or provided buffer + * size: the size of the provided buffer in bytes   * ovfl: pointer to a function to be called on buffer overflow;   *       NULL if cyclic buffer requested + * th: the interrupt threshold in records from the end of the buffer; + *     -1 if no interrupt threshold is requested.   */ -typedef void (*ds_ovfl_callback_t)(struct task_struct *); -extern int ds_request_bts(struct task_struct *task, void *base, size_t size, -			  ds_ovfl_callback_t ovfl); -extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, -			   ds_ovfl_callback_t ovfl); +extern struct bts_tracer *ds_request_bts(struct task_struct *task, +					 void *base, size_t size, +					 bts_ovfl_callback_t ovfl, size_t th); +extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, +					   void *base, size_t size, +					   pebs_ovfl_callback_t ovfl, +					   size_t th);  /*   * Release BTS or PEBS resources   * - * Frees buffers allocated on ds_request. 
- *   * Returns 0 on success; -Eerrno otherwise   * - * task: the task to release resources for; - *       NULL to release resources for the current cpu + * tracer: the tracer handle returned from ds_request_~()   */ -extern int ds_release_bts(struct task_struct *task); -extern int ds_release_pebs(struct task_struct *task); +extern int ds_release_bts(struct bts_tracer *tracer); +extern int ds_release_pebs(struct pebs_tracer *tracer);  /* - * Return the (array) index of the write pointer. + * Get the (array) index of the write pointer.   * (assuming an array of BTS/PEBS records)   * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result   */ -extern int ds_get_bts_index(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_index(struct task_struct *task, size_t *pos); +extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos);  /* - * Return the (array) index one record beyond the end of the array. + * Get the (array) index one record beyond the end of the array.   * (assuming an array of BTS/PEBS records)   * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result   */ -extern int ds_get_bts_end(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); +extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos);  /*   * Provide a pointer to the BTS/PEBS record at parameter index. @@ -101,14 +110,13 @@ extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);   *   * Returns the size of a single record on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~()   * index: the index of the requested record   * record (out): pointer to the requested record   */ -extern int ds_access_bts(struct task_struct *task, +extern int ds_access_bts(struct bts_tracer *tracer,  			 size_t index, const void **record); -extern int ds_access_pebs(struct task_struct *task, +extern int ds_access_pebs(struct pebs_tracer *tracer,  			  size_t index, const void **record);  /* @@ -128,38 +136,24 @@ extern int ds_access_pebs(struct task_struct *task,   *   * Returns the number of bytes written or -Eerrno.   * - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~()   * buffer: the buffer to write   * size: the size of the buffer   */ -extern int ds_write_bts(struct task_struct *task, +extern int ds_write_bts(struct bts_tracer *tracer,  			const void *buffer, size_t size); -extern int ds_write_pebs(struct task_struct *task, +extern int ds_write_pebs(struct pebs_tracer *tracer,  			 const void *buffer, size_t size);  /* - * Same as ds_write_bts/pebs, but omit ownership checks. - * - * This is needed to have some other task than the owner of the - * BTS/PEBS buffer or the parameter task itself write into the - * respective buffer. 
- */ -extern int ds_unchecked_write_bts(struct task_struct *task, -				  const void *buffer, size_t size); -extern int ds_unchecked_write_pebs(struct task_struct *task, -				   const void *buffer, size_t size); - -/*   * Reset the write pointer of the BTS/PEBS buffer.   *   * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~()   */ -extern int ds_reset_bts(struct task_struct *task); -extern int ds_reset_pebs(struct task_struct *task); +extern int ds_reset_bts(struct bts_tracer *tracer); +extern int ds_reset_pebs(struct pebs_tracer *tracer);  /*   * Clear the BTS/PEBS buffer and reset the write pointer. @@ -167,33 +161,30 @@ extern int ds_reset_pebs(struct task_struct *task);   *   * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~()   */ -extern int ds_clear_bts(struct task_struct *task); -extern int ds_clear_pebs(struct task_struct *task); +extern int ds_clear_bts(struct bts_tracer *tracer); +extern int ds_clear_pebs(struct pebs_tracer *tracer);  /*   * Provide the PEBS counter reset value.   *   * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs()   * value (out): the counter reset value   */ -extern int ds_get_pebs_reset(struct task_struct *task, u64 *value); +extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value);  /*   * Set the PEBS counter reset value.   *   * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs()   * value: the new counter reset value   */ -extern int ds_set_pebs_reset(struct task_struct *task, u64 value); +extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);  /*   * Initialization @@ -206,17 +197,13 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);  /*   * The DS context - part of struct thread_struct.   
*/ +#define MAX_SIZEOF_DS (12 * 8) +  struct ds_context {  	/* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ -	unsigned char *ds; +	unsigned char ds[MAX_SIZEOF_DS];  	/* the owner of the BTS and PEBS configuration, respectively */ -	struct task_struct *owner[2]; -	/* buffer overflow notification function for BTS and PEBS */ -	ds_ovfl_callback_t callback[2]; -	/* the original buffer address */ -	void *buffer[2]; -	/* the number of allocated pages for on-request allocated buffers */ -	unsigned int pages[2]; +	struct ds_tracer  *owner[2];  	/* use count */  	unsigned long count;  	/* a pointer to the context location inside the thread_struct @@ -232,7 +219,8 @@ extern void ds_free(struct ds_context *context);  #else /* CONFIG_X86_DS */ -#define ds_init_intel(config) do {} while (0) +struct cpuinfo_x86; +static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}  #endif /* CONFIG_X86_DS */  #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 2bb43b433e0..754a3e082f9 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -29,7 +29,6 @@ struct dyn_arch_ftrace {  #endif /* CONFIG_FUNCTION_TRACER */  #ifdef CONFIG_FUNCTION_RET_TRACER -#define FTRACE_RET_STACK_SIZE 20  #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e90e81ef6ab..0921b4018c1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,36 +40,8 @@ struct thread_info {  						*/  	__u8			supervisor_stack[0];  #endif - -#ifdef CONFIG_FUNCTION_RET_TRACER -	/* Index of current stored adress in ret_stack */ -	int		curr_ret_stack; -	/* Stack of return addresses for return function tracing */ -	struct ftrace_ret_stack	ret_stack[FTRACE_RET_STACK_SIZE]; -	/* -	 * Number of functions that haven't been traced -	 * because of depth overrun. 
-	 */ -	atomic_t	trace_overrun; -#endif  }; -#ifdef CONFIG_FUNCTION_RET_TRACER -#define INIT_THREAD_INFO(tsk)			\ -{						\ -	.task		= &tsk,			\ -	.exec_domain	= &default_exec_domain,	\ -	.flags		= 0,			\ -	.cpu		= 0,			\ -	.preempt_count	= 1,			\ -	.addr_limit	= KERNEL_DS,		\ -	.restart_block = {			\ -		.fn = do_no_restart_syscall,	\ -	},					\ -	.curr_ret_stack = -1,\ -	.trace_overrun	= ATOMIC_INIT(0)	\ -} -#else  #define INIT_THREAD_INFO(tsk)			\  {						\  	.task		= &tsk,			\ @@ -82,7 +54,6 @@ struct thread_info {  		.fn = do_no_restart_syscall,	\  	},					\  } -#endif  #define init_thread_info	(init_thread_union.thread_info)  #define init_stack		(init_thread_union.stack) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 1d8ed95da84..af2bc36ca1c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -46,7 +46,7 @@ obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o  obj-y				+= process.o  obj-y				+= i387.o xsave.o  obj-y				+= ptrace.o -obj-y				+= ds.o +obj-$(CONFIG_X86_DS)		+= ds.o  obj-$(CONFIG_X86_32)		+= tls.o  obj-$(CONFIG_IA32_EMULATION)	+= tls.o  obj-y				+= step.o diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cce0b6118d5..816f27f289b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -307,12 +307,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_P4);  	if (c->x86 == 6)  		set_cpu_cap(c, X86_FEATURE_P3); +#endif  	if (cpu_has_bts)  		ptrace_bts_init_intel(c); -#endif -  	detect_extended_topology(c);  	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {  		/* diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index d1a121443bd..19a8c2c0389 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -7,13 +7,12 @@   *   * It manages:   * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done)   * - buffer access   *   * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks   *   *   * Copyright (C) 2007-2008 Intel Corporation. @@ -21,8 +20,6 @@   */ -#ifdef CONFIG_X86_DS -  #include <asm/ds.h>  #include <linux/errno.h> @@ -30,6 +27,7 @@  #include <linux/slab.h>  #include <linux/sched.h>  #include <linux/mm.h> +#include <linux/kernel.h>  /* @@ -46,6 +44,33 @@ struct ds_configuration {  };  static struct ds_configuration ds_cfg; +/* + * A BTS or PEBS tracer. + * + * This holds the configuration of the tracer and serves as a handle + * to identify tracers. 
+ */ +struct ds_tracer { +	/* the DS context (partially) owned by this tracer */ +	struct ds_context *context; +	/* the buffer provided on ds_request() and its size in bytes */ +	void *buffer; +	size_t size; +}; + +struct bts_tracer { +	/* the common DS part */ +	struct ds_tracer ds; +	/* buffer overflow notification function */ +	bts_ovfl_callback_t ovfl; +}; + +struct pebs_tracer { +	/* the common DS part */ +	struct ds_tracer ds; +	/* buffer overflow notification function */ +	pebs_ovfl_callback_t ovfl; +};  /*   * Debug Store (DS) save area configuration (see Intel64 and IA32 @@ -109,34 +134,13 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,  	(*(unsigned long *)base) = value;  } +#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment */ -/* - * Locking is done only for allocating BTS or PEBS resources and for - * guarding context and buffer memory allocation. - * - * Most functions require the current task to own the ds context part - * they are going to access. All the locking is done when validating - * access to the context. - */ -static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);  /* - * Validate that the current task is allowed to access the BTS/PEBS - * buffer of the parameter task. - * - * Returns 0, if access is granted; -Eerrno, otherwise. + * Locking is done only for allocating BTS or PEBS resources.   */ -static inline int ds_validate_access(struct ds_context *context, -				     enum ds_qualifier qual) -{ -	if (!context) -		return -EPERM; - -	if (context->owner[qual] == current) -		return 0; - -	return -EPERM; -} +static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);  /* @@ -185,96 +189,43 @@ static inline int check_tracer(struct task_struct *task)   *   * Contexts are use-counted. They are allocated on first access and   * deallocated when the last user puts the context. - * - * We distinguish between an allocating and a non-allocating get of a - * context: - * - the allocating get is used for requesting BTS/PEBS resources. It - *   requires the caller to hold the global ds_lock. - * - the non-allocating get is used for all other cases. A - *   non-existing context indicates an error. It acquires and releases - *   the ds_lock itself for obtaining the context. - * - * A context and its DS configuration are allocated and deallocated - * together. A context always has a DS configuration of the - * appropriate size.   */  static DEFINE_PER_CPU(struct ds_context *, system_context);  #define this_system_context per_cpu(system_context, smp_processor_id()) -/* - * Returns the pointer to the parameter task's context or to the - * system-wide context, if task is NULL. - * - * Increases the use count of the returned context, if not NULL. - */  static inline struct ds_context *ds_get_context(struct task_struct *task)  { -	struct ds_context *context; - -	spin_lock(&ds_lock); - -	context = (task ? task->thread.ds_ctx : this_system_context); -	if (context) -		context->count++; - -	spin_unlock(&ds_lock); - -	return context; -} - -/* - * Same as ds_get_context, but allocates the context and it's DS - * structure, if necessary; returns NULL; if out of memory. - * - * pre: requires ds_lock to be held - */ -static inline struct ds_context *ds_alloc_context(struct task_struct *task) -{  	struct ds_context **p_context =  		(task ? 
&task->thread.ds_ctx : &this_system_context);  	struct ds_context *context = *p_context; +	unsigned long irq;  	if (!context) { -		spin_unlock(&ds_lock); -  		context = kzalloc(sizeof(*context), GFP_KERNEL); - -		if (!context) { -			spin_lock(&ds_lock); +		if (!context)  			return NULL; -		} -		context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); -		if (!context->ds) { -			kfree(context); -			spin_lock(&ds_lock); -			return NULL; -		} +		spin_lock_irqsave(&ds_lock, irq); -		spin_lock(&ds_lock); -		/* -		 * Check for race - another CPU could have allocated -		 * it meanwhile: -		 */  		if (*p_context) { -			kfree(context->ds);  			kfree(context); -			return *p_context; -		} -		*p_context = context; +			context = *p_context; +		} else { +			*p_context = context; -		context->this = p_context; -		context->task = task; +			context->this = p_context; +			context->task = task; -		if (task) -			set_tsk_thread_flag(task, TIF_DS_AREA_MSR); +			if (task) +				set_tsk_thread_flag(task, TIF_DS_AREA_MSR); -		if (!task || (task == current)) -			wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0); - -		get_tracer(task); +			if (!task || (task == current)) +				wrmsrl(MSR_IA32_DS_AREA, +				       (unsigned long)context->ds); +		} +		spin_unlock_irqrestore(&ds_lock, irq);  	}  	context->count++; @@ -282,16 +233,14 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)  	return context;  } -/* - * Decreases the use count of the parameter context, if not NULL. - * Deallocates the context, if the use count reaches zero. - */  static inline void ds_put_context(struct ds_context *context)  { +	unsigned long irq; +  	if (!context)  		return; -	spin_lock(&ds_lock); +	spin_lock_irqsave(&ds_lock, irq);  	if (--context->count)  		goto out; @@ -304,352 +253,351 @@ static inline void ds_put_context(struct ds_context *context)  	if (!context->task || (context->task == current))  		wrmsrl(MSR_IA32_DS_AREA, 0); -	put_tracer(context->task); - -	/* free any leftover buffers from tracers that did not -	 * deallocate them properly. */ -	kfree(context->buffer[ds_bts]); -	kfree(context->buffer[ds_pebs]); -	kfree(context->ds);  	kfree(context);   out: -	spin_unlock(&ds_lock); +	spin_unlock_irqrestore(&ds_lock, irq);  }  /*   * Handle a buffer overflow   * - * task: the task whose buffers are overflowing; - *       NULL for a buffer overflow on the current cpu   * context: the ds context   * qual: the buffer type   */ -static void ds_overflow(struct task_struct *task, struct ds_context *context, -			enum ds_qualifier qual) +static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)  { -	if (!context) -		return; - -	if (context->callback[qual]) -		(*context->callback[qual])(task); - -	/* todo: do some more overflow handling */ +	switch (qual) { +	case ds_bts: { +		struct bts_tracer *tracer = +			container_of(context->owner[qual], +				     struct bts_tracer, ds); +		if (tracer->ovfl) +			tracer->ovfl(tracer); +	} +		break; +	case ds_pebs: { +		struct pebs_tracer *tracer = +			container_of(context->owner[qual], +				     struct pebs_tracer, ds); +		if (tracer->ovfl) +			tracer->ovfl(tracer); +	} +		break; +	}  } -/* - * Allocate a non-pageable buffer of the parameter size. - * Checks the memory and the locked memory rlimit. - * - * Returns the buffer, if successful; - *         NULL, if out of memory or rlimit exceeded. 
- * - * size: the requested buffer size in bytes - * pages (out): if not NULL, contains the number of pages reserved - */ -static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) +static void ds_install_ds_config(struct ds_context *context, +				 enum ds_qualifier qual, +				 void *base, size_t size, size_t ith)  { -	unsigned long rlim, vm, pgsz; -	void *buffer; - -	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - -	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; -	vm   = current->mm->total_vm  + pgsz; -	if (rlim < vm) -		return NULL; +	unsigned long buffer, adj; -	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; -	vm   = current->mm->locked_vm  + pgsz; -	if (rlim < vm) -		return NULL; +	/* adjust the buffer address and size to meet alignment +	 * constraints: +	 * - buffer is double-word aligned +	 * - size is multiple of record size +	 * +	 * We checked the size at the very beginning; we have enough +	 * space to do the adjustment. +	 */ +	buffer = (unsigned long)base; -	buffer = kzalloc(size, GFP_KERNEL); -	if (!buffer) -		return NULL; +	adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; +	buffer += adj; +	size   -= adj; -	current->mm->total_vm  += pgsz; -	current->mm->locked_vm += pgsz; +	size /= ds_cfg.sizeof_rec[qual]; +	size *= ds_cfg.sizeof_rec[qual]; -	if (pages) -		*pages = pgsz; +	ds_set(context->ds, qual, ds_buffer_base, buffer); +	ds_set(context->ds, qual, ds_index, buffer); +	ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); -	return buffer; +	/* The value for 'no threshold' is -1, which will set the +	 * threshold outside of the buffer, just like we want it. +	 */ +	ds_set(context->ds, qual, +	       ds_interrupt_threshold, buffer + size - ith);  } -static int ds_request(struct task_struct *task, void *base, size_t size, -		      ds_ovfl_callback_t ovfl, enum ds_qualifier qual) +static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, +		      struct task_struct *task, +		      void *base, size_t size, size_t th)  {  	struct ds_context *context; -	unsigned long buffer, adj; -	const unsigned long alignment = (1 << 3); -	int error = 0; +	unsigned long irq; +	int error; +	error = -EOPNOTSUPP;  	if (!ds_cfg.sizeof_ds) -		return -EOPNOTSUPP; +		goto out; + +	error = -EINVAL; +	if (!base) +		goto out;  	/* we require some space to do alignment adjustments below */ -	if (size < (alignment + ds_cfg.sizeof_rec[qual])) -		return -EINVAL; +	error = -EINVAL; +	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) +		goto out; -	/* buffer overflow notification is not yet implemented */ -	if (ovfl) -		return -EOPNOTSUPP; +	if (th != (size_t)-1) { +		th *= ds_cfg.sizeof_rec[qual]; +		error = -EINVAL; +		if (size <= th) +			goto out; +	} -	spin_lock(&ds_lock); +	tracer->buffer = base; +	tracer->size = size;  	error = -ENOMEM; -	context = ds_alloc_context(task); +	context = ds_get_context(task);  	if (!context) -		goto out_unlock; +		goto out; +	tracer->context = context; + + +	spin_lock_irqsave(&ds_lock, irq);  	error = -EPERM;  	if (!check_tracer(task))  		goto out_unlock; +	get_tracer(task); -	error = -EALREADY; -	if (context->owner[qual] == current) -		goto out_unlock;  	error = -EPERM; -	if (context->owner[qual] != NULL) -		goto out_unlock; -	context->owner[qual] = current; - -	spin_unlock(&ds_lock); - - -	error = -ENOMEM; -	if (!base) { -		base = ds_allocate_buffer(size, &context->pages[qual]); -		if (!base) -			goto out_release; - -		context->buffer[qual]   = base; -	} -	error = 0; +	if (context->owner[qual]) +		goto 
out_put_tracer; +	context->owner[qual] = tracer; -	context->callback[qual] = ovfl; +	spin_unlock_irqrestore(&ds_lock, irq); -	/* adjust the buffer address and size to meet alignment -	 * constraints: -	 * - buffer is double-word aligned -	 * - size is multiple of record size -	 * -	 * We checked the size at the very beginning; we have enough -	 * space to do the adjustment. -	 */ -	buffer = (unsigned long)base; - -	adj = ALIGN(buffer, alignment) - buffer; -	buffer += adj; -	size   -= adj; - -	size /= ds_cfg.sizeof_rec[qual]; -	size *= ds_cfg.sizeof_rec[qual]; - -	ds_set(context->ds, qual, ds_buffer_base, buffer); -	ds_set(context->ds, qual, ds_index, buffer); -	ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); -	if (ovfl) { -		/* todo: select a suitable interrupt threshold */ -	} else -		ds_set(context->ds, qual, -		       ds_interrupt_threshold, buffer + size + 1); +	ds_install_ds_config(context, qual, base, size, th); -	/* we keep the context until ds_release */ -	return error; - - out_release: -	context->owner[qual] = NULL; -	ds_put_context(context); -	return error; +	return 0; + out_put_tracer: +	put_tracer(task);   out_unlock: -	spin_unlock(&ds_lock); +	spin_unlock_irqrestore(&ds_lock, irq);  	ds_put_context(context); +	tracer->context = NULL; + out:  	return error;  } -int ds_request_bts(struct task_struct *task, void *base, size_t size, -		   ds_ovfl_callback_t ovfl) +struct bts_tracer *ds_request_bts(struct task_struct *task, +				  void *base, size_t size, +				  bts_ovfl_callback_t ovfl, size_t th)  { -	return ds_request(task, base, size, ovfl, ds_bts); -} +	struct bts_tracer *tracer; +	int error; -int ds_request_pebs(struct task_struct *task, void *base, size_t size, -		    ds_ovfl_callback_t ovfl) -{ -	return ds_request(task, base, size, ovfl, ds_pebs); +	/* buffer overflow notification is not yet implemented */ +	error = -EOPNOTSUPP; +	if (ovfl) +		goto out; + +	error = -ENOMEM; +	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); +	if (!tracer) +		goto out; +	tracer->ovfl = ovfl; + +	error = ds_request(&tracer->ds, ds_bts, task, base, size, th); +	if (error < 0) +		goto out_tracer; + +	return tracer; + + out_tracer: +	kfree(tracer); + out: +	return ERR_PTR(error);  } -static int ds_release(struct task_struct *task, enum ds_qualifier qual) +struct pebs_tracer *ds_request_pebs(struct task_struct *task, +				    void *base, size_t size, +				    pebs_ovfl_callback_t ovfl, size_t th)  { -	struct ds_context *context; +	struct pebs_tracer *tracer;  	int error; -	context = ds_get_context(task); -	error = ds_validate_access(context, qual); -	if (error < 0) +	/* buffer overflow notification is not yet implemented */ +	error = -EOPNOTSUPP; +	if (ovfl)  		goto out; -	kfree(context->buffer[qual]); -	context->buffer[qual] = NULL; +	error = -ENOMEM; +	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); +	if (!tracer) +		goto out; +	tracer->ovfl = ovfl; -	current->mm->total_vm  -= context->pages[qual]; -	current->mm->locked_vm -= context->pages[qual]; -	context->pages[qual] = 0; -	context->owner[qual] = NULL; +	error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); +	if (error < 0) +		goto out_tracer; -	/* -	 * we put the context twice: -	 *   once for the ds_get_context -	 *   once for the corresponding ds_request -	 */ -	ds_put_context(context); +	return tracer; + + out_tracer: +	kfree(tracer);   out: -	ds_put_context(context); -	return error; +	return ERR_PTR(error);  } -int ds_release_bts(struct task_struct *task) +static void ds_release(struct ds_tracer *tracer, enum 
ds_qualifier qual)  { -	return ds_release(task, ds_bts); +	BUG_ON(tracer->context->owner[qual] != tracer); +	tracer->context->owner[qual] = NULL; + +	put_tracer(tracer->context->task); +	ds_put_context(tracer->context);  } -int ds_release_pebs(struct task_struct *task) +int ds_release_bts(struct bts_tracer *tracer)  { -	return ds_release(task, ds_pebs); +	if (!tracer) +		return -EINVAL; + +	ds_release(&tracer->ds, ds_bts); +	kfree(tracer); + +	return 0;  } -static int ds_get_index(struct task_struct *task, size_t *pos, -			enum ds_qualifier qual) +int ds_release_pebs(struct pebs_tracer *tracer)  { -	struct ds_context *context; -	unsigned long base, index; -	int error; +	if (!tracer) +		return -EINVAL; -	context = ds_get_context(task); -	error = ds_validate_access(context, qual); -	if (error < 0) -		goto out; +	ds_release(&tracer->ds, ds_pebs); +	kfree(tracer); + +	return 0; +} + +static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) +{ +	unsigned long base, index;  	base  = ds_get(context->ds, qual, ds_buffer_base);  	index = ds_get(context->ds, qual, ds_index); -	error = ((index - base) / ds_cfg.sizeof_rec[qual]); -	if (pos) -		*pos = error; - out: -	ds_put_context(context); -	return error; +	return (index - base) / ds_cfg.sizeof_rec[qual];  } -int ds_get_bts_index(struct task_struct *task, size_t *pos) +int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)  { -	return ds_get_index(task, pos, ds_bts); +	if (!tracer) +		return -EINVAL; + +	if (!pos) +		return -EINVAL; + +	*pos = ds_get_index(tracer->ds.context, ds_bts); + +	return 0;  } -int ds_get_pebs_index(struct task_struct *task, size_t *pos) +int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos)  { -	return ds_get_index(task, pos, ds_pebs); +	if (!tracer) +		return -EINVAL; + +	if (!pos) +		return -EINVAL; + +	*pos = ds_get_index(tracer->ds.context, ds_pebs); + +	return 0;  } -static int ds_get_end(struct task_struct *task, size_t *pos, -		      enum ds_qualifier qual) +static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual)  { -	struct ds_context *context; -	unsigned long base, end; -	int error; - -	context = ds_get_context(task); -	error = ds_validate_access(context, qual); -	if (error < 0) -		goto out; +	unsigned long base, max;  	base = ds_get(context->ds, qual, ds_buffer_base); -	end  = ds_get(context->ds, qual, ds_absolute_maximum); +	max  = ds_get(context->ds, qual, ds_absolute_maximum); -	error = ((end - base) / ds_cfg.sizeof_rec[qual]); -	if (pos) -		*pos = error; - out: -	ds_put_context(context); -	return error; +	return (max - base) / ds_cfg.sizeof_rec[qual];  } -int ds_get_bts_end(struct task_struct *task, size_t *pos) +int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)  { -	return ds_get_end(task, pos, ds_bts); +	if (!tracer) +		return -EINVAL; + +	if (!pos) +		return -EINVAL; + +	*pos = ds_get_end(tracer->ds.context, ds_bts); + +	return 0;  } -int ds_get_pebs_end(struct task_struct *task, size_t *pos) +int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)  { -	return ds_get_end(task, pos, ds_pebs); +	if (!tracer) +		return -EINVAL; + +	if (!pos) +		return -EINVAL; + +	*pos = ds_get_end(tracer->ds.context, ds_pebs); + +	return 0;  } -static int ds_access(struct task_struct *task, size_t index, -		     const void **record, enum ds_qualifier qual) +static int ds_access(struct ds_context *context, enum ds_qualifier qual, +		     size_t index, const void **record)  { -	struct ds_context *context;  	unsigned long base, idx; -	int error;  	if 
(!record)  		return -EINVAL; -	context = ds_get_context(task); -	error = ds_validate_access(context, qual); -	if (error < 0) -		goto out; -  	base = ds_get(context->ds, qual, ds_buffer_base);  	idx = base + (index * ds_cfg.sizeof_rec[qual]); -	error = -EINVAL;  	if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) -		goto out; +		return -EINVAL;  	*record = (const void *)idx; -	error = ds_cfg.sizeof_rec[qual]; - out: -	ds_put_context(context); -	return error; + +	return ds_cfg.sizeof_rec[qual];  } -int ds_access_bts(struct task_struct *task, size_t index, const void **record) +int ds_access_bts(struct bts_tracer *tracer, size_t index, +		  const void **record)  { -	return ds_access(task, index, record, ds_bts); +	if (!tracer) +		return -EINVAL; + +	return ds_access(tracer->ds.context, ds_bts, index, record);  } -int ds_access_pebs(struct task_struct *task, size_t index, const void **record) +int ds_access_pebs(struct pebs_tracer *tracer, size_t index, +		   const void **record)  { -	return ds_access(task, index, record, ds_pebs); +	if (!tracer) +		return -EINVAL; + +	return ds_access(tracer->ds.context, ds_pebs, index, record);  } -static int ds_write(struct task_struct *task, const void *record, size_t size, -		    enum ds_qualifier qual, int force) +static int ds_write(struct ds_context *context, enum ds_qualifier qual, +		    const void *record, size_t size)  { -	struct ds_context *context; -	int error; +	int bytes_written = 0;  	if (!record)  		return -EINVAL; -	error = -EPERM; -	context = ds_get_context(task); -	if (!context) -		goto out; - -	if (!force) { -		error = ds_validate_access(context, qual); -		if (error < 0) -			goto out; -	} - -	error = 0;  	while (size) {  		unsigned long base, index, end, write_end, int_th;  		unsigned long write_size, adj_write_size; @@ -677,14 +625,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,  			write_end = end;  		if (write_end <= index) -			goto out; +			break;  		write_size = min((unsigned long) size, write_end - index);  		memcpy((void *)index, record, write_size);  		record = (const char *)record + write_size; -		size  -= write_size; -		error += write_size; +		size -= write_size; +		bytes_written += write_size;  		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];  		adj_write_size *= ds_cfg.sizeof_rec[qual]; @@ -699,47 +647,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,  		ds_set(context->ds, qual, ds_index, index);  		if (index >= int_th) -			ds_overflow(task, context, qual); +			ds_overflow(context, qual);  	} - out: -	ds_put_context(context); -	return error; +	return bytes_written;  } -int ds_write_bts(struct task_struct *task, const void *record, size_t size) +int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size)  { -	return ds_write(task, record, size, ds_bts, /* force = */ 0); -} +	if (!tracer) +		return -EINVAL; -int ds_write_pebs(struct task_struct *task, const void *record, size_t size) -{ -	return ds_write(task, record, size, ds_pebs, /* force = */ 0); +	return ds_write(tracer->ds.context, ds_bts, record, size);  } -int ds_unchecked_write_bts(struct task_struct *task, -			   const void *record, size_t size) +int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size)  { -	return ds_write(task, record, size, ds_bts, /* force = */ 1); -} +	if (!tracer) +		return -EINVAL; -int ds_unchecked_write_pebs(struct task_struct *task, -			    const void *record, size_t size) -{ -	return ds_write(task, record, size, 
ds_pebs, /* force = */ 1); +	return ds_write(tracer->ds.context, ds_pebs, record, size);  } -static int ds_reset_or_clear(struct task_struct *task, -			     enum ds_qualifier qual, int clear) +static void ds_reset_or_clear(struct ds_context *context, +			      enum ds_qualifier qual, int clear)  { -	struct ds_context *context;  	unsigned long base, end; -	int error; - -	context = ds_get_context(task); -	error = ds_validate_access(context, qual); -	if (error < 0) -		goto out;  	base = ds_get(context->ds, qual, ds_buffer_base);  	end  = ds_get(context->ds, qual, ds_absolute_maximum); @@ -748,89 +681,100 @@ static int ds_reset_or_clear(struct task_struct *task,  		memset((void *)base, 0, end - base);  	ds_set(context->ds, qual, ds_index, base); - -	error = 0; - out: -	ds_put_context(context); -	return error;  } -int ds_reset_bts(struct task_struct *task) +int ds_reset_bts(struct bts_tracer *tracer)  { -	return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); +	if (!tracer) +		return -EINVAL; + +	ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); + +	return 0;  } -int ds_reset_pebs(struct task_struct *task) +int ds_reset_pebs(struct pebs_tracer *tracer)  { -	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); +	if (!tracer) +		return -EINVAL; + +	ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); + +	return 0;  } -int ds_clear_bts(struct task_struct *task) +int ds_clear_bts(struct bts_tracer *tracer)  { -	return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); +	if (!tracer) +		return -EINVAL; + +	ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); + +	return 0;  } -int ds_clear_pebs(struct task_struct *task) +int ds_clear_pebs(struct pebs_tracer *tracer)  { -	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); +	if (!tracer) +		return -EINVAL; + +	ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); + +	return 0;  } -int ds_get_pebs_reset(struct task_struct *task, u64 *value) +int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value)  { -	struct ds_context *context; -	int error; +	if (!tracer) +		return -EINVAL;  	if (!value)  		return -EINVAL; -	context = ds_get_context(task); -	error = ds_validate_access(context, ds_pebs); -	if (error < 0) -		goto out; +	*value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); -	*value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); - -	error = 0; - out: -	ds_put_context(context); -	return error; +	return 0;  } -int ds_set_pebs_reset(struct task_struct *task, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)  { -	struct ds_context *context; -	int error; - -	context = ds_get_context(task); -	error = ds_validate_access(context, ds_pebs); -	if (error < 0) -		goto out; +	if (!tracer) +		return -EINVAL; -	*(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; +	*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; -	error = 0; - out: -	ds_put_context(context); -	return error; +	return 0;  }  static const struct ds_configuration ds_cfg_var = {  	.sizeof_ds    = sizeof(long) * 12,  	.sizeof_field = sizeof(long),  	.sizeof_rec[ds_bts]   = sizeof(long) * 3, +#ifdef __i386__  	.sizeof_rec[ds_pebs]  = sizeof(long) * 10 +#else +	.sizeof_rec[ds_pebs]  = sizeof(long) * 18 +#endif  };  static const struct ds_configuration ds_cfg_64 = {  	.sizeof_ds    = 8 * 12,  	.sizeof_field = 8,  	.sizeof_rec[ds_bts]   = 8 * 3, +#ifdef __i386__  	.sizeof_rec[ds_pebs]  = 8 * 10 +#else +	.sizeof_rec[ds_pebs]  = 8 * 18 +#endif  };  static inline void  
ds_configure(const struct ds_configuration *cfg)  {  	ds_cfg = *cfg; + +	printk(KERN_INFO "DS available\n"); + +	BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds);  }  void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -838,17 +782,16 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)  	switch (c->x86) {  	case 0x6:  		switch (c->x86_model) { +		case 0 ... 0xC: +			/* sorry, don't know about them */ +			break;  		case 0xD:  		case 0xE: /* Pentium M */  			ds_configure(&ds_cfg_var);  			break; -		case 0xF: /* Core2 */ -		case 0x1C: /* Atom */ +		default: /* Core2, Atom, ... */  			ds_configure(&ds_cfg_64);  			break; -		default: -			/* sorry, don't know about them */ -			break;  		}  		break;  	case 0xF: @@ -875,7 +818,8 @@ void ds_free(struct ds_context *context)  	 * is dying. There should not be any user of that context left  	 * to disturb us, anymore. */  	unsigned long leftovers = context->count; -	while (leftovers--) +	while (leftovers--) { +		put_tracer(context->task);  		ds_put_context(context); +	}  } -#endif /* CONFIG_X86_DS */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 356bb1eb6e9..bb137f7297e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -350,19 +350,21 @@ static int push_return_trace(unsigned long ret, unsigned long long time,  				unsigned long func)  {  	int index; -	struct thread_info *ti = current_thread_info(); + +	if (!current->ret_stack) +		return -EBUSY;  	/* The return trace stack is full */ -	if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { -		atomic_inc(&ti->trace_overrun); +	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { +		atomic_inc(&current->trace_overrun);  		return -EBUSY;  	} -	index = ++ti->curr_ret_stack; +	index = ++current->curr_ret_stack;  	barrier(); -	ti->ret_stack[index].ret = ret; -	ti->ret_stack[index].func = func; -	ti->ret_stack[index].calltime = time; +	current->ret_stack[index].ret = ret; +	current->ret_stack[index].func = func; +	current->ret_stack[index].calltime = time;  	return 0;  } @@ -373,13 +375,12 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time,  {  	int index; -	struct thread_info *ti = current_thread_info(); -	index = ti->curr_ret_stack; -	*ret = ti->ret_stack[index].ret; -	*func = ti->ret_stack[index].func; -	*time = ti->ret_stack[index].calltime; -	*overrun = atomic_read(&ti->trace_overrun); -	ti->curr_ret_stack--; +	index = current->curr_ret_stack; +	*ret = current->ret_stack[index].ret; +	*func = current->ret_stack[index].func; +	*time = current->ret_stack[index].calltime; +	*overrun = atomic_read(&current->trace_overrun); +	current->curr_ret_stack--;  }  /* diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 1f20608d4ca..b0f61f0dcd0 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -58,7 +58,7 @@ void __cpuinit mxcsr_feature_mask_init(void)  	stts();  } -void __init init_thread_xstate(void) +void __cpuinit init_thread_xstate(void)  {  	if (!HAVE_HWFP) {  		xstate_size = sizeof(struct i387_soft_struct); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index c9513e1ff28..1fec0f9b150 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3608,27 +3608,7 @@ int __init io_apic_get_redir_entries (int ioapic)  int __init probe_nr_irqs(void)  { -	int idx; -	int nr = 0; -#ifndef CONFIG_XEN -	int nr_min = 32; -#else -	int nr_min = NR_IRQS; -#endif - -	for (idx = 0; idx < nr_ioapics; idx++) -		nr += io_apic_get_redir_entries(idx) + 1; - -	/* double it for hotplug and msi and nmi */ -	
nr <<= 1; - -	/* something wrong ? */ -	if (nr < nr_min) -		nr = nr_min; -	if (WARN_ON(nr > NR_IRQS)) -		nr = NR_IRQS; - -	return nr; +	return NR_IRQS;  }  /* -------------------------------------------------------------------------- diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index e1e731d78f3..d28bbdc35e4 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1567,7 +1567,7 @@ static int __init calgary_parse_options(char *p)  				++p;  			if (*p == '\0')  				break; -			bridge = simple_strtol(p, &endp, 0); +			bridge = simple_strtoul(p, &endp, 0);  			if (p == endp)  				break; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0a6d8c12e10..2c8ec1ba75e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,  	size_t bts_index, bts_end;  	int error; -	error = ds_get_bts_end(child, &bts_end); +	error = ds_get_bts_end(child->bts, &bts_end);  	if (error < 0)  		return error;  	if (bts_end <= index)  		return -EINVAL; -	error = ds_get_bts_index(child, &bts_index); +	error = ds_get_bts_index(child->bts, &bts_index);  	if (error < 0)  		return error; @@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,  	if (bts_end <= bts_index)  		bts_index -= bts_end; -	error = ds_access_bts(child, bts_index, &bts_record); +	error = ds_access_bts(child->bts, bts_index, &bts_record);  	if (error < 0)  		return error; @@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child,  	size_t end, i;  	int error; -	error = ds_get_bts_index(child, &end); +	error = ds_get_bts_index(child->bts, &end);  	if (error < 0)  		return error;  	if (size < (end * sizeof(struct bts_struct)))  		return -EIO; -	error = ds_access_bts(child, 0, (const void **)&raw); +	error = ds_access_bts(child->bts, 0, (const void **)&raw);  	if (error < 0)  		return error; @@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child,  			return -EFAULT;  	} -	error = ds_clear_bts(child); +	error = ds_clear_bts(child->bts);  	if (error < 0)  		return error;  	return end;  } -static void ptrace_bts_ovfl(struct task_struct *child) -{ -	send_sig(child->thread.bts_ovfl_signal, child, 0); -} -  static int ptrace_bts_config(struct task_struct *child,  			     long cfg_size,  			     const struct ptrace_bts_config __user *ucfg) @@ -760,23 +755,45 @@ static int ptrace_bts_config(struct task_struct *child,  		goto errout;  	if (cfg.flags & PTRACE_BTS_O_ALLOC) { -		ds_ovfl_callback_t ovfl = NULL; +		bts_ovfl_callback_t ovfl = NULL;  		unsigned int sig = 0; -		/* we ignore the error in case we were not tracing child */ -		(void)ds_release_bts(child); +		error = -EINVAL; +		if (cfg.size < (10 * bts_cfg.sizeof_bts)) +			goto errout;  		if (cfg.flags & PTRACE_BTS_O_SIGNAL) {  			if (!cfg.signal)  				goto errout; +			error = -EOPNOTSUPP; +			goto errout; +  			sig  = cfg.signal; -			ovfl = ptrace_bts_ovfl;  		} -		error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); -		if (error < 0) +		if (child->bts) { +			(void)ds_release_bts(child->bts); +			kfree(child->bts_buffer); + +			child->bts = NULL; +			child->bts_buffer = NULL; +		} + +		error = -ENOMEM; +		child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); +		if (!child->bts_buffer) +			goto errout; + +		child->bts = ds_request_bts(child, child->bts_buffer, cfg.size, +					    ovfl, /* th = */ (size_t)-1); +		if 
(IS_ERR(child->bts)) { +			error = PTR_ERR(child->bts); +			kfree(child->bts_buffer); +			child->bts = NULL; +			child->bts_buffer = NULL;  			goto errout; +		}  		child->thread.bts_ovfl_signal = sig;  	} @@ -823,15 +840,15 @@ static int ptrace_bts_status(struct task_struct *child,  	if (cfg_size < sizeof(cfg))  		return -EIO; -	error = ds_get_bts_end(child, &end); +	error = ds_get_bts_end(child->bts, &end);  	if (error < 0)  		return error; -	error = ds_access_bts(child, /* index = */ 0, &base); +	error = ds_access_bts(child->bts, /* index = */ 0, &base);  	if (error < 0)  		return error; -	error = ds_access_bts(child, /* index = */ end, &max); +	error = ds_access_bts(child->bts, /* index = */ end, &max);  	if (error < 0)  		return error; @@ -884,10 +901,7 @@ static int ptrace_bts_write_record(struct task_struct *child,  		return -EINVAL;  	} -	/* The writing task will be the switched-to task on a context -	 * switch. It needs to write into the switched-from task's BTS -	 * buffer. */ -	return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); +	return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts);  }  void ptrace_bts_take_timestamp(struct task_struct *tsk, @@ -929,17 +943,16 @@ void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)  	switch (c->x86) {  	case 0x6:  		switch (c->x86_model) { +		case 0 ... 0xC: +			/* sorry, don't know about them */ +			break;  		case 0xD:  		case 0xE: /* Pentium M */  			bts_configure(&bts_cfg_pentium_m);  			break; -		case 0xF: /* Core2 */ -        case 0x1C: /* Atom */ +		default: /* Core2, Atom, ... */  			bts_configure(&bts_cfg_core2);  			break; -		default: -			/* sorry, don't know about them */ -			break;  		}  		break;  	case 0xF: @@ -973,13 +986,17 @@ void ptrace_disable(struct task_struct *child)  	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);  #endif  #ifdef CONFIG_X86_PTRACE_BTS -	(void)ds_release_bts(child); +	if (child->bts) { +		(void)ds_release_bts(child->bts); +		kfree(child->bts_buffer); +		child->bts_buffer = NULL; -	child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; -	if (!child->thread.debugctlmsr) -		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); +		child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; +		if (!child->thread.debugctlmsr) +			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); -	clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); +		clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); +	}  #endif /* CONFIG_X86_PTRACE_BTS */  } @@ -1111,9 +1128,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)  			(child, data, (struct ptrace_bts_config __user *)addr);  		break; -	case PTRACE_BTS_SIZE: -		ret = ds_get_bts_index(child, /* pos = */ NULL); +	case PTRACE_BTS_SIZE: { +		size_t size; + +		ret = ds_get_bts_index(child->bts, &size); +		if (ret == 0) { +			BUG_ON(size != (int) size); +			ret = (int) size; +		}  		break; +	}  	case PTRACE_BTS_GET:  		ret = ptrace_bts_read_record @@ -1121,7 +1145,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)  		break;  	case PTRACE_BTS_CLEAR: -		ret = ds_clear_bts(child); +		ret = ds_clear_bts(child->bts);  		break;  	case PTRACE_BTS_DRAIN: diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index a03e7f6d90c..10786af9554 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -6,6 +6,7 @@  #include <linux/sched.h>  #include <linux/stacktrace.h>  #include <linux/module.h> +#include <linux/uaccess.h>  #include <asm/stacktrace.h>  static void save_stack_warning(void 
*data, char *msg) @@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)  		trace->entries[trace->nr_entries++] = ULONG_MAX;  }  EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ + +struct stack_frame { +	const void __user	*next_fp; +	unsigned long		ret_addr; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ +	int ret; + +	if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) +		return 0; + +	ret = 1; +	pagefault_disable(); +	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) +		ret = 0; +	pagefault_enable(); + +	return ret; +} + +static inline void __save_stack_trace_user(struct stack_trace *trace) +{ +	const struct pt_regs *regs = task_pt_regs(current); +	const void __user *fp = (const void __user *)regs->bp; + +	if (trace->nr_entries < trace->max_entries) +		trace->entries[trace->nr_entries++] = regs->ip; + +	while (trace->nr_entries < trace->max_entries) { +		struct stack_frame frame; + +		frame.next_fp = NULL; +		frame.ret_addr = 0; +		if (!copy_stack_frame(fp, &frame)) +			break; +		if ((unsigned long)fp < regs->sp) +			break; +		if (frame.ret_addr) { +			trace->entries[trace->nr_entries++] = +				frame.ret_addr; +		} +		if (fp == frame.next_fp) +			break; +		fp = frame.next_fp; +	} +} + +void save_stack_trace_user(struct stack_trace *trace) +{ +	/* +	 * Trace user stack if we are not a kernel thread +	 */ +	if (current->mm) { +		__save_stack_trace_user(trace); +	} +	if (trace->nr_entries < trace->max_entries) +		trace->entries[trace->nr_entries++] = ULONG_MAX; +} + diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index b13acb75e82..15c3e699918 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -310,7 +310,7 @@ static void __init setup_xstate_init(void)  /*   * Enable and initialize the xsave feature.   */ -void __init xsave_cntxt_init(void) +void __ref xsave_cntxt_init(void)  {  	unsigned int eax, ebx, ecx, edx; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 3f1b81a83e2..716d26f0e5d 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -69,7 +69,7 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)  	int i;  	if (!reset_value) { -		reset_value = kmalloc(sizeof(unsigned) * num_counters, +		reset_value = kmalloc(sizeof(reset_value[0]) * num_counters,  					GFP_ATOMIC);  		if (!reset_value)  			return; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 688936044dc..636ef4caa52 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -661,12 +661,11 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)   * For 64-bit, we must skip the Xen hole in the middle of the address   * space, just after the big x86-64 virtual hole.   
*/ -static int xen_pgd_walk(struct mm_struct *mm, -			int (*func)(struct mm_struct *mm, struct page *, -				    enum pt_level), -			unsigned long limit) +static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, +			  int (*func)(struct mm_struct *mm, struct page *, +				      enum pt_level), +			  unsigned long limit)  { -	pgd_t *pgd = mm->pgd;  	int flush = 0;  	unsigned hole_low, hole_high;  	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; @@ -753,6 +752,14 @@ out:  	return flush;  } +static int xen_pgd_walk(struct mm_struct *mm, +			int (*func)(struct mm_struct *mm, struct page *, +				    enum pt_level), +			unsigned long limit) +{ +	return __xen_pgd_walk(mm, mm->pgd, func, limit); +} +  /* If we're using split pte locks, then take the page's lock and     return a pointer to it.  Otherwise return NULL. */  static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) @@ -854,7 +861,7 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)  	xen_mc_batch(); -	 if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) { +	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {  		/* re-enable interrupts for flushing */  		xen_mc_issue(0); @@ -998,7 +1005,7 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)  		       PT_PMD);  #endif -	xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT); +	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);  	xen_mc_issue(0);  } diff --git a/fs/seq_file.c b/fs/seq_file.c index eba2eabcd2b..f03220d7891 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -357,7 +357,18 @@ int seq_printf(struct seq_file *m, const char *f, ...)  }  EXPORT_SYMBOL(seq_printf); -static char *mangle_path(char *s, char *p, char *esc) +/** + *	mangle_path -	mangle and copy path to buffer beginning + *	@s: buffer start + *	@p: beginning of path in above buffer + *	@esc: set of characters that need escaping + * + *      Copy the path from @p to @s, replacing each occurrence of a character + *      from @esc with the usual octal escape. + *      Returns a pointer past the last written character in @s, or NULL in + *      case of failure. + */ +char *mangle_path(char *s, char *p, char *esc)  {  	while (s <= p) {  		char c = *p++; @@ -376,6 +387,7 @@ static char *mangle_path(char *s, char *p, char *esc)  	}  	return NULL;  } +EXPORT_SYMBOL_GPL(mangle_path);
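/*
 * Illustrative sketch only, not part of the patch: calling the newly
 * exported mangle_path().  d_path() writes the path at the end of the
 * buffer; mangle_path() copies it to the front, octal-escaping any
 * character in @esc (here a newline).  "file" stands in for some
 * hypothetical struct file pointer the caller already holds.
 */
char buf[256];
char *p = d_path(&file->f_path, buf, sizeof(buf));
if (!IS_ERR(p)) {
	char *end = mangle_path(buf, p, "\n");
	/* on success, the escaped path occupies [buf, end) */
}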
 /*   * return the absolute path of 'dentry' residing in mount 'mnt'. diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f7ba4ea5e12..7854d87b97b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -257,6 +257,7 @@ extern int ftrace_dump_on_oops;  extern void tracing_start(void);  extern void tracing_stop(void); +extern void ftrace_off_permanent(void);  extern void  ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); @@ -290,6 +291,7 @@ ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));  static inline void tracing_start(void) { }  static inline void tracing_stop(void) { } +static inline void ftrace_off_permanent(void) { }  static inline int  ftrace_printk(const char *fmt, ...)  { @@ -323,6 +325,8 @@ struct ftrace_retfunc {  };  #ifdef CONFIG_FUNCTION_RET_TRACER +#define FTRACE_RETFUNC_DEPTH 50 +#define FTRACE_RETSTACK_ALLOC_SIZE 32  /* Type of a callback handler of tracing return function */  typedef void (*trace_function_return_t)(struct ftrace_retfunc *); @@ -330,6 +334,12 @@ extern int register_ftrace_return(trace_function_return_t func);  /* The current handler in use */  extern trace_function_return_t ftrace_function_return;  extern void unregister_ftrace_return(void); + +extern void ftrace_retfunc_init_task(struct task_struct *t); +extern void ftrace_retfunc_exit_task(struct task_struct *t); +#else +static inline void ftrace_retfunc_init_task(struct task_struct *t) { } +static inline void ftrace_retfunc_exit_task(struct task_struct *t) { }  #endif  #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index e097c2e6b6d..3bb87a753fa 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -122,6 +122,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);  void tracing_on(void);  void tracing_off(void); +void tracing_off_permanent(void);  enum ring_buffer_flags {  	RB_FL_OVERWRITE		= 1 << 0, diff --git a/include/linux/sched.h b/include/linux/sched.h index c8e0db46420..d02a0ca70ee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -96,6 +96,7 @@ struct exec_domain;  struct futex_pi_state;  struct robust_list_head;  struct bio; +struct bts_tracer;  /*   * List of flags we want to share for kernel threads, @@ -1161,6 +1162,18 @@ struct task_struct {  	struct list_head ptraced;  	struct list_head ptrace_entry; +#ifdef CONFIG_X86_PTRACE_BTS +	/* +	 * This is the tracer handle for the ptrace BTS extension. +	 * This field actually belongs to the ptracer task. +	 */ +	struct bts_tracer *bts; +	/* +	 * The buffer to hold the BTS data. +	 */ +	void *bts_buffer; +#endif /* CONFIG_X86_PTRACE_BTS */ +  	/* PID/PID hash table linkage. */  	struct pid_link pids[PIDTYPE_MAX];  	struct list_head thread_group; @@ -1352,6 +1365,17 @@ struct task_struct {  	unsigned long default_timer_slack_ns;  	struct list_head	*scm_work_list; +#ifdef CONFIG_FUNCTION_RET_TRACER +	/* Index of current stored address in ret_stack */ +	int curr_ret_stack; +	/* Stack of return addresses for return function tracing */ +	struct ftrace_ret_stack	*ret_stack; +	/* +	 * Number of functions that haven't been traced +	 * because of depth overrun. +	 */ +	atomic_t trace_overrun; +#endif  };  /* @@ -2006,18 +2030,6 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct  {  	*task_thread_info(p) = *task_thread_info(org);  	task_thread_info(p)->task = p; - -#ifdef CONFIG_FUNCTION_RET_TRACER -	/* -	 * When fork() creates a child process, this function is called. -	 * But the child task may not inherit the return adresses traced -	 * by the return function tracer because it will directly execute -	 * in userspace and will not return to kernel functions its parent -	 * used. 
-	 */ -	task_thread_info(p)->curr_ret_stack = -1; -	atomic_set(&task_thread_info(p)->trace_overrun, 0); -#endif  }  static inline unsigned long *end_of_stack(struct task_struct *p) diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index dc50bcc282a..b3dfa72f13b 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -34,6 +34,7 @@ struct seq_operations {  #define SEQ_SKIP 1 +char *mangle_path(char *s, char *p, char *esc);  int seq_open(struct file *, const struct seq_operations *);  ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);  loff_t seq_lseek(struct file *, loff_t, int); diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index b106fd8e0d5..1a8cecc4f38 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -15,9 +15,17 @@ extern void save_stack_trace_tsk(struct task_struct *tsk,  				struct stack_trace *trace);  extern void print_stack_trace(struct stack_trace *trace, int spaces); + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +extern void save_stack_trace_user(struct stack_trace *trace); +#else +# define save_stack_trace_user(trace)              do { } while (0) +#endif +  #else  # define save_stack_trace(trace)			do { } while (0)  # define save_stack_trace_tsk(tsk, trace)		do { } while (0) +# define save_stack_trace_user(trace)			do { } while (0)  # define print_stack_trace(trace, spaces)		do { } while (0)  #endif diff --git a/init/main.c b/init/main.c index e810196bf2f..79213c0785d 100644 --- a/init/main.c +++ b/init/main.c @@ -723,7 +723,7 @@ int do_one_initcall(initcall_t fn)  		disable_boot_trace();  		rettime = ktime_get();  		delta = ktime_sub(rettime, calltime); -		ret.duration = (unsigned long long) delta.tv64 >> 10; +		ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10;  		trace_boot_ret(&ret, fn);  		printk("initcall %pF returned %d after %Ld usecs\n", fn,  			ret.result, ret.duration); diff --git a/kernel/exit.c b/kernel/exit.c index 35c8ec2ba03..e5ae36ebe8a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1127,7 +1127,6 @@ NORET_TYPE void do_exit(long code)  	preempt_disable();  	/* causes final put_task_struct in finish_task_switch(). */  	tsk->state = TASK_DEAD; -  	schedule();  	BUG();  	/* Avoid "noreturn function does return".  
*/ diff --git a/kernel/fork.c b/kernel/fork.c index ac62f43ee43..d6e1a3205f6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -47,6 +47,7 @@  #include <linux/mount.h>  #include <linux/audit.h>  #include <linux/memcontrol.h> +#include <linux/ftrace.h>  #include <linux/profile.h>  #include <linux/rmap.h>  #include <linux/acct.h> @@ -139,6 +140,7 @@ void free_task(struct task_struct *tsk)  	prop_local_destroy_single(&tsk->dirties);  	free_thread_info(tsk->stack);  	rt_mutex_debug_task_free(tsk); +	ftrace_retfunc_exit_task(tsk);  	free_task_struct(tsk);  }  EXPORT_SYMBOL(free_task); @@ -1269,6 +1271,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	total_forks++;  	spin_unlock(&current->sighand->siglock);  	write_unlock_irq(&tasklist_lock); +	ftrace_retfunc_init_task(p);  	proc_fork_connector(p);  	cgroup_post_fork(p);  	return p; diff --git a/kernel/power/disk.c b/kernel/power/disk.c index c9d74083746..f77d3819ef5 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,7 +22,6 @@  #include <linux/console.h>  #include <linux/cpu.h>  #include <linux/freezer.h> -#include <linux/ftrace.h>  #include "power.h" @@ -257,7 +256,7 @@ static int create_image(int platform_mode)  int hibernation_snapshot(int platform_mode)  { -	int error, ftrace_save; +	int error;  	/* Free memory before shutting down devices. */  	error = swsusp_shrink_memory(); @@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode)  		goto Close;  	suspend_console(); -	ftrace_save = __ftrace_enabled_save();  	error = device_suspend(PMSG_FREEZE);  	if (error)  		goto Recover_platform; @@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode)   Resume_devices:  	device_resume(in_suspend ?  		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); -	__ftrace_enabled_restore(ftrace_save);  	resume_console();   Close:  	platform_end(platform_mode); @@ -370,11 +367,10 @@ static int resume_target_kernel(void)  int hibernation_restore(int platform_mode)  { -	int error, ftrace_save; +	int error;  	pm_prepare_console();  	suspend_console(); -	ftrace_save = __ftrace_enabled_save();  	error = device_suspend(PMSG_QUIESCE);  	if (error)  		goto Finish; @@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode)  	platform_restore_cleanup(platform_mode);  	device_resume(PMSG_RECOVER);   Finish: -	__ftrace_enabled_restore(ftrace_save);  	resume_console();  	pm_restore_console();  	return error; @@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode)  int hibernation_platform_enter(void)  { -	int error, ftrace_save; +	int error;  	if (!hibernation_ops)  		return -ENOSYS; @@ -417,7 +412,6 @@ int hibernation_platform_enter(void)  		goto Close;  	suspend_console(); -	ftrace_save = __ftrace_enabled_save();  	error = device_suspend(PMSG_HIBERNATE);  	if (error) {  		if (hibernation_ops->recover) @@ -452,7 +446,6 @@ int hibernation_platform_enter(void)  	hibernation_ops->finish();   Resume_devices:  	device_resume(PMSG_RESTORE); -	__ftrace_enabled_restore(ftrace_save);  	resume_console();   Close:  	hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index b8f7ce9473e..613f16941b8 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -22,7 +22,6 @@  #include <linux/freezer.h>  #include <linux/vmstat.h>  #include <linux/syscalls.h> -#include <linux/ftrace.h>  #include "power.h" @@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state)   */  int suspend_devices_and_enter(suspend_state_t state)  { -	int error, ftrace_save; +	int error;  	if (!suspend_ops)  		return 
-ENOSYS; @@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state)  			goto Close;  	}  	suspend_console(); -	ftrace_save = __ftrace_enabled_save();  	suspend_test_start();  	error = device_suspend(PMSG_SUSPEND);  	if (error) { @@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state)  	suspend_test_start();  	device_resume(PMSG_RESUME);  	suspend_test_finish("resume devices"); -	__ftrace_enabled_restore(ftrace_save);  	resume_console();   Close:  	if (suspend_ops->end) diff --git a/kernel/sched.c b/kernel/sched.c index 4de56108c86..388d9db044a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5901,6 +5901,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)  	 * The idle tasks have their own, simple scheduling class:  	 */  	idle->sched_class = &idle_sched_class; +	ftrace_retfunc_init_task(idle);  }  /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 61e8cca6ff4..620feadff67 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -3,6 +3,9 @@  #  select HAVE_FUNCTION_TRACER:  # +config USER_STACKTRACE_SUPPORT +	bool +  config NOP_TRACER  	bool @@ -25,6 +28,9 @@ config HAVE_DYNAMIC_FTRACE  config HAVE_FTRACE_MCOUNT_RECORD  	bool +config HAVE_HW_BRANCH_TRACER +	bool +  config TRACER_MAX_TRACE  	bool @@ -230,6 +236,14 @@ config STACK_TRACER  	  Say N if unsure. +config BTS_TRACER +	depends on HAVE_HW_BRANCH_TRACER +	bool "Trace branches" +	select TRACING +	help +	  This tracer records all branches on the system in a circular +	  buffer giving access to the last N branches for each cpu. +  config DYNAMIC_FTRACE  	bool "enable/disable ftrace tracepoints dynamically"  	depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1a8c9259dc6..cef4bcb4e82 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,5 +31,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o  obj-$(CONFIG_BOOT_TRACER) += trace_boot.o  obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o  obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o +obj-$(CONFIG_BTS_TRACER) += trace_bts.o  libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f212da48668..53042f118f2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1498,10 +1498,77 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,  #ifdef CONFIG_FUNCTION_RET_TRACER +static atomic_t ftrace_retfunc_active; +  /* The callback that hooks the return of a function */  trace_function_return_t ftrace_function_return =  			(trace_function_return_t)ftrace_stub; + +/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. 
*/ +static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +{ +	int i; +	int ret = 0; +	unsigned long flags; +	int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; +	struct task_struct *g, *t; + +	for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { +		ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH +					* sizeof(struct ftrace_ret_stack), +					GFP_KERNEL); +		if (!ret_stack_list[i]) { +			start = 0; +			end = i; +			ret = -ENOMEM; +			goto free; +		} +	} + +	read_lock_irqsave(&tasklist_lock, flags); +	do_each_thread(g, t) { +		if (start == end) { +			ret = -EAGAIN; +			goto unlock; +		} + +		if (t->ret_stack == NULL) { +			t->ret_stack = ret_stack_list[start++]; +			t->curr_ret_stack = -1; +			atomic_set(&t->trace_overrun, 0); +		} +	} while_each_thread(g, t); + +unlock: +	read_unlock_irqrestore(&tasklist_lock, flags); +free: +	for (i = start; i < end; i++) +		kfree(ret_stack_list[i]); +	return ret; +} + +/* Allocate a return stack for each task */ +static int start_return_tracing(void) +{ +	struct ftrace_ret_stack **ret_stack_list; +	int ret; + +	ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * +				sizeof(struct ftrace_ret_stack *), +				GFP_KERNEL); + +	if (!ret_stack_list) +		return -ENOMEM; + +	do { +		ret = alloc_retstack_tasklist(ret_stack_list); +	} while (ret == -EAGAIN); + +	kfree(ret_stack_list); +	return ret; +} +  int register_ftrace_return(trace_function_return_t func)  {  	int ret = 0; @@ -1516,7 +1583,12 @@ int register_ftrace_return(trace_function_return_t func)  		ret = -EBUSY;  		goto out;  	} - +	atomic_inc(&ftrace_retfunc_active); +	ret = start_return_tracing(); +	if (ret) { +		atomic_dec(&ftrace_retfunc_active); +		goto out; +	}  	ftrace_tracing_type = FTRACE_TYPE_RETURN;  	ftrace_function_return = func;  	ftrace_startup(); @@ -1530,6 +1602,7 @@ void unregister_ftrace_return(void)  {  	mutex_lock(&ftrace_sysctl_lock); +	atomic_dec(&ftrace_retfunc_active);  	ftrace_function_return = (trace_function_return_t)ftrace_stub;  	ftrace_shutdown();  	/* Restore normal tracing type */ @@ -1537,6 +1610,32 @@ void unregister_ftrace_return(void)  	mutex_unlock(&ftrace_sysctl_lock);  } + +/* Allocate a return stack for newly created task */ +void ftrace_retfunc_init_task(struct task_struct *t) +{ +	if (atomic_read(&ftrace_retfunc_active)) { +		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH +				* sizeof(struct ftrace_ret_stack), +				GFP_KERNEL); +		if (!t->ret_stack) +			return; +		t->curr_ret_stack = -1; +		atomic_set(&t->trace_overrun, 0); +	} else +		t->ret_stack = NULL; +} + +void ftrace_retfunc_exit_task(struct task_struct *t) +{ +	struct ftrace_ret_stack	*ret_stack = t->ret_stack; + +	t->ret_stack = NULL; +	/* NULL must become visible to IRQs before we free it: */ +	barrier(); + +	kfree(ret_stack); +}  #endif diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 85ced143c2c..e206951603c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -18,8 +18,46 @@  #include "trace.h" -/* Global flag to disable all recording to ring buffers */ -static int ring_buffers_off __read_mostly; +/* + * A fast way to enable or disable all ring buffers is to + * call tracing_on or tracing_off. Turning off the ring buffers + * prevents all ring buffers from being recorded to. + * Turning this switch on makes it OK to write to the + * ring buffer, if the ring buffer is enabled itself. + * + * There are three layers that must be on in order to write + * to the ring buffer. + * + * 1) This global flag must be set. + * 2) The ring buffer must be enabled for recording. + * 3) The per cpu buffer must be enabled for recording. + * + * In case of an anomaly, this global flag has a bit set that + * will permanently disable all ring buffers. + */ + +/* + * Global flag to disable all recording to ring buffers + *  This has two bits: ON, DISABLED + * + *  ON   DISABLED + * ---- ---------- + *   0      0        : ring buffers are off + *   1      0        : ring buffers are on + *   X      1        : ring buffers are permanently disabled + */ + +enum { +	RB_BUFFERS_ON_BIT	= 0, +	RB_BUFFERS_DISABLED_BIT	= 1, +}; + +enum { +	RB_BUFFERS_ON		= 1 << RB_BUFFERS_ON_BIT, +	RB_BUFFERS_DISABLED	= 1 << RB_BUFFERS_DISABLED_BIT, +};  + +static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
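/*
 * Illustrative sketch only, not part of the patch: how the two bits
 * just defined combine into the three states listed in the table
 * above.  rb_state() is a hypothetical helper, shown for clarity.
 */
static const char *rb_state(unsigned long flags)
{
	if (flags & RB_BUFFERS_DISABLED)	/* X 1 */
		return "permanently disabled";
	if (flags & RB_BUFFERS_ON)		/* 1 0 */
		return "on";
	return "off";				/* 0 0 */
}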
 /**   * tracing_on - enable all tracing buffers @@ -29,7 +67,7 @@ static int ring_buffers_off __read_mostly;   */  void tracing_on(void)  { -	ring_buffers_off = 0; +	set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);  }  /** @@ -42,7 +80,18 @@ void tracing_on(void)   */  void tracing_off(void)  { -	ring_buffers_off = 1; +	clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); +} + +/** + * tracing_off_permanent - permanently disable ring buffers + * + * This function, once called, will disable all ring buffers + * permanently. + */ +void tracing_off_permanent(void) +{ +	set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);  }  #include "trace.h" @@ -1185,7 +1234,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,  	struct ring_buffer_event *event;  	int cpu, resched; -	if (ring_buffers_off) +	if (ring_buffer_flags != RB_BUFFERS_ON)  		return NULL;  	if (atomic_read(&buffer->record_disabled)) @@ -1297,7 +1346,7 @@ int ring_buffer_write(struct ring_buffer *buffer,  	int ret = -EBUSY;  	int cpu, resched; -	if (ring_buffers_off) +	if (ring_buffer_flags != RB_BUFFERS_ON)  		return -EBUSY;  	if (atomic_read(&buffer->record_disabled)) @@ -2178,12 +2227,14 @@ static ssize_t  rb_simple_read(struct file *filp, char __user *ubuf,  	       size_t cnt, loff_t *ppos)  { -	int *p = filp->private_data; +	long *p = filp->private_data;  	char buf[64];  	int r; -	/* !ring_buffers_off == tracing_on */ -	r = sprintf(buf, "%d\n", !*p); +	if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) +		r = sprintf(buf, "permanently disabled\n"); +	else +		r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));  	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);  } @@ -2192,7 +2243,7 @@ static ssize_t  rb_simple_write(struct file *filp, const char __user *ubuf,  		size_t cnt, loff_t *ppos)  { -	int *p = filp->private_data; +	long *p = filp->private_data;  	char buf[64];  	long val;  	int ret; @@ -2209,8 +2260,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,  	if (ret < 0)  		return ret; -	/* !ring_buffers_off == tracing_on */ -	*p = !val; +	if (val) +		set_bit(RB_BUFFERS_ON_BIT, p); +	else +		clear_bit(RB_BUFFERS_ON_BIT, p);  	(*ppos)++; @@ -2232,7 +2285,7 @@ static __init int rb_init_debugfs(void)  	d_tracer = tracing_init_dentry();  	entry = debugfs_create_file("tracing_on", 0644, d_tracer, -				    &ring_buffers_off, &rb_simple_fops); +				    &ring_buffer_flags, &rb_simple_fops);  	if (!entry)  		pr_warning("Could not create debugfs 'tracing_on' entry\n"); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4ee6f037522..8df8fdd69c9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -30,6 +30,7 @@  #include <linux/gfp.h>  #include <linux/fs.h>  #include <linux/kprobes.h> +#include <linux/seq_file.h>  #include <linux/writeback.h>  #include <linux/stacktrace.h> @@ -275,6 +276,8 @@ static const char *trace_options[] = {  	"ftrace_preempt",  	"branch",  	"annotate", +	"userstacktrace", +	"sym-userobj",  	NULL  }; @@ -421,6 +424,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)  	return trace_seq_putmem(s, hex, j);  } +static int +trace_seq_path(struct trace_seq *s, struct path *path) +{ +	unsigned char *p; + +	if (s->len >= (PAGE_SIZE - 1)) +		return 0; +	p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); +	if (!IS_ERR(p)) { +		p = mangle_path(s->buffer + s->len, p, "\n"); +		if (p) { +			s->len = p - s->buffer; +			return 1; +		} +	} else { +		s->buffer[s->len++] = '?'; +		return 1; +	} + +	return 0; +} +  static void  trace_seq_reset(struct trace_seq *s)  { @@ -661,6 +686,21 @@ static int trace_stop_count;  static DEFINE_SPINLOCK(tracing_start_lock);  /** + * ftrace_off_permanent - disable all ftrace code permanently + * + * This should only be called when a serious anomaly has + * been detected.  This will turn off the function tracing, + * ring buffers, and other tracing utilities. It takes no + * locks and can be called from any context. + */ +void ftrace_off_permanent(void) +{ +	tracing_disabled = 1; +	ftrace_stop(); +	tracing_off_permanent(); +}
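/*
 * Illustrative sketch only, not part of the patch: a caller shutting
 * tracing down for good after noticing an inconsistency.
 * detected_corruption() is a hypothetical check; the call itself is
 * documented above as lock-free and safe from any context.
 */
if (detected_corruption()) {
	ftrace_off_permanent();
	printk(KERN_ERR "tracing permanently disabled\n");
}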
+ +/**   * tracing_start - quick start of the tracer   *   * If tracing is enabled but was stopped by tracing_stop, @@ -801,6 +841,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,  	entry->preempt_count		= pc & 0xff;  	entry->pid			= (tsk) ? tsk->pid : 0; +	entry->tgid               	= (tsk) ? tsk->tgid : 0;  	entry->flags =  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT  		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -918,6 +959,44 @@ void __trace_stack(struct trace_array *tr,  	ftrace_trace_stack(tr, data, flags, skip, preempt_count());  } +static void ftrace_trace_userstack(struct trace_array *tr, +		   struct trace_array_cpu *data, +		   unsigned long flags, int pc) +{ +	struct ring_buffer_event *event; +	struct userstack_entry *entry; +	struct stack_trace trace; +	unsigned long irq_flags; + +	if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) +		return; + +	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +					 &irq_flags); +	if (!event) +		return; +	entry	= ring_buffer_event_data(event); +	tracing_generic_entry_update(&entry->ent, flags, pc); +	entry->ent.type		= TRACE_USER_STACK; + +	memset(&entry->caller, 0, sizeof(entry->caller)); + +	trace.nr_entries	= 0; +	trace.max_entries	= FTRACE_STACK_ENTRIES; +	trace.skip		= 0; +	trace.entries		= entry->caller; + +	save_stack_trace_user(&trace); +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +} + +void __trace_userstack(struct trace_array *tr, +		   struct trace_array_cpu *data, +		   unsigned long flags) +{ +	ftrace_trace_userstack(tr, data, flags, preempt_count()); +} +  static void  ftrace_trace_special(void *__tr, void *__data,  		     unsigned long arg1, unsigned long arg2, unsigned long arg3, @@ -941,6 +1020,7 @@ ftrace_trace_special(void *__tr, void *__data,  	entry->arg3			= arg3;  	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);  	ftrace_trace_stack(tr, data, irq_flags, 4, pc); +	ftrace_trace_userstack(tr, data, irq_flags, pc);  	trace_wake_up();  } @@ -979,6 +1059,7 @@ tracing_sched_switch_trace(struct trace_array *tr,  	entry->next_cpu	= task_cpu(next);  	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);  	ftrace_trace_stack(tr, data, flags, 5, pc); +	ftrace_trace_userstack(tr, data, 
flags, pc);  }  void @@ -1008,6 +1089,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,  	entry->next_cpu			= task_cpu(wakee);  	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);  	ftrace_trace_stack(tr, data, flags, 6, pc); +	ftrace_trace_userstack(tr, data, flags, pc);  	trace_wake_up();  } @@ -1387,6 +1469,78 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)  	return ret;  } +static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, +				    unsigned long ip, unsigned long sym_flags) +{ +	struct file *file = NULL; +	unsigned long vmstart = 0; +	int ret = 1; + +	if (mm) { +		const struct vm_area_struct *vma; + +		down_read(&mm->mmap_sem); +		vma = find_vma(mm, ip); +		if (vma) { +			file = vma->vm_file; +			vmstart = vma->vm_start; +		} +		if (file) { +			ret = trace_seq_path(s, &file->f_path); +			if (ret) +				ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); +		} +		up_read(&mm->mmap_sem); +	} +	if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) +		ret = trace_seq_printf(s, " <" IP_FMT ">", ip); +	return ret; +} + +static int +seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, +		      unsigned long sym_flags) +{ +	struct mm_struct *mm = NULL; +	int ret = 1; +	unsigned int i; + +	if (trace_flags & TRACE_ITER_SYM_USEROBJ) { +		struct task_struct *task; +		/* +		 * we do the lookup on the thread group leader, +		 * since individual threads might have already quit! +		 */ +		rcu_read_lock(); +		task = find_task_by_vpid(entry->ent.tgid); +		if (task) +			mm = get_task_mm(task); +		rcu_read_unlock(); +	} + +	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { +		unsigned long ip = entry->caller[i]; + +		if (ip == ULONG_MAX || !ret) +			break; +		if (i && ret) +			ret = trace_seq_puts(s, " <- "); +		if (!ip) { +			if (ret) +				ret = trace_seq_puts(s, "??"); +			continue; +		} +		if (!ret) +			break; +		if (ret) +			ret = seq_print_user_ip(s, mm, ip, sym_flags); +	} + +	if (mm) +		mmput(mm); +	return ret; +} +  static void print_lat_help_header(struct seq_file *m)  {  	seq_puts(m, "#                  _------=> CPU#            \n"); @@ -1702,6 +1856,15 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)  				 field->line);  		break;  	} +	case TRACE_USER_STACK: { +		struct userstack_entry *field; + +		trace_assign_type(field, entry); + +		seq_print_userip_objs(field, s, sym_flags); +		trace_seq_putc(s, '\n'); +		break; +	}  	default:  		trace_seq_printf(s, "Unknown type %d\n", entry->type);  	} @@ -1853,6 +2016,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)  				 field->line);  		break;  	} +	case TRACE_USER_STACK: { +		struct userstack_entry *field; + +		trace_assign_type(field, entry); + +		ret = seq_print_userip_objs(field, s, sym_flags); +		if (!ret) +			return TRACE_TYPE_PARTIAL_LINE; +		ret = trace_seq_putc(s, '\n'); +		if (!ret) +			return TRACE_TYPE_PARTIAL_LINE; +		break; +	}  	}  	return TRACE_TYPE_HANDLED;  } @@ -1912,6 +2088,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)  		break;  	}  	case TRACE_SPECIAL: +	case TRACE_USER_STACK:  	case TRACE_STACK: {  		struct special_entry *field; @@ -2000,6 +2177,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)  		break;  	}  	case TRACE_SPECIAL: +	case TRACE_USER_STACK:  	case TRACE_STACK: {  		struct special_entry *field; @@ -2054,6 +2232,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)  		break;  	}  	case 
TRACE_SPECIAL: +	case TRACE_USER_STACK:  	case TRACE_STACK: {  		struct special_entry *field; @@ -2119,7 +2298,9 @@ static int s_show(struct seq_file *m, void *v)  			seq_printf(m, "# tracer: %s\n", iter->trace->name);  			seq_puts(m, "#\n");  		} -		if (iter->iter_flags & TRACE_FILE_LAT_FMT) { +		if (iter->trace && iter->trace->print_header) +			iter->trace->print_header(m); +		else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {  			/* print nothing if the buffers are empty */  			if (trace_empty(iter))  				return 0; @@ -2171,6 +2352,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)  	iter->trace = current_trace;  	iter->pos = -1; +	/* Notify the tracer early; before we stop tracing. */ +	if (iter->trace && iter->trace->open) +			iter->trace->open(iter); +  	/* Annotate start of buffers if we had overruns */  	if (ring_buffer_overruns(iter->tr->buffer))  		iter->iter_flags |= TRACE_FILE_ANNOTATE; @@ -2196,9 +2381,6 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)  	/* stop the trace while dumping */  	tracing_stop(); -	if (iter->trace && iter->trace->open) -			iter->trace->open(iter); -  	mutex_unlock(&trace_types_lock);   out: @@ -3488,6 +3670,9 @@ void ftrace_dump(void)  		atomic_inc(&global_trace.data[cpu]->disabled);  	} +	/* don't look at user memory in panic mode */ +	trace_flags &= ~TRACE_ITER_SYM_USEROBJ; +  	printk(KERN_TRACE "Dumping ftrace buffer:\n");  	iter.tr = &global_trace; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cb12fd98f6..3abd645e8af 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -26,6 +26,8 @@ enum trace_type {  	TRACE_BOOT_CALL,  	TRACE_BOOT_RET,  	TRACE_FN_RET, +	TRACE_USER_STACK, +	TRACE_BTS,  	__TRACE_LAST_TYPE  }; @@ -42,6 +44,7 @@ struct trace_entry {  	unsigned char		flags;  	unsigned char		preempt_count;  	int			pid; +	int			tgid;  };  /* @@ -99,6 +102,11 @@ struct stack_entry {  	unsigned long		caller[FTRACE_STACK_ENTRIES];  }; +struct userstack_entry { +	struct trace_entry	ent; +	unsigned long		caller[FTRACE_STACK_ENTRIES]; +}; +  /*   * ftrace_printk entry:   */ @@ -146,6 +154,12 @@ struct trace_branch {  	char			correct;  }; +struct bts_entry { +	struct trace_entry	ent; +	unsigned long		from; +	unsigned long		to; +}; +  /*   * trace_flag_type is an enumeration that holds different   * states when a trace occurs. 
These are: @@ -240,6 +254,7 @@ extern void __ftrace_bad_type(void);  		IF_ASSIGN(var, ent, struct ctx_switch_entry, 0);	\  		IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \  		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\ +		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\  		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\  		IF_ASSIGN(var, ent, struct special_entry, 0);		\  		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\ @@ -250,6 +265,7 @@ extern void __ftrace_bad_type(void);  		IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\  		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \  		IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ +		IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\  		__ftrace_bad_type();					\  	} while (0) @@ -303,6 +319,7 @@ struct tracer {  	int			(*selftest)(struct tracer *trace,  					    struct trace_array *tr);  #endif +	void			(*print_header)(struct seq_file *m);  	enum print_line_t	(*print_line)(struct trace_iterator *iter);  	/* If you handled the flag setting, return 0 */  	int			(*set_flag)(u32 old_flags, u32 bit, int set); @@ -383,6 +400,10 @@ void trace_function(struct trace_array *tr,  void  trace_function_return(struct ftrace_retfunc *trace); +void trace_bts(struct trace_array *tr, +	       unsigned long from, +	       unsigned long to); +  void tracing_start_cmdline_record(void);  void tracing_stop_cmdline_record(void);  void tracing_sched_switch_assign_trace(struct trace_array *tr); @@ -500,6 +521,8 @@ enum trace_iterator_flags {  	TRACE_ITER_PREEMPTONLY		= 0x800,  	TRACE_ITER_BRANCH		= 0x1000,  	TRACE_ITER_ANNOTATE		= 0x2000, +	TRACE_ITER_USERSTACKTRACE       = 0x4000, +	TRACE_ITER_SYM_USEROBJ          = 0x8000  };  /* diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c new file mode 100644 index 00000000000..23b76e4690e --- /dev/null +++ b/kernel/trace/trace_bts.c @@ -0,0 +1,276 @@ +/* + * BTS tracer + * + * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com> + * + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/kallsyms.h> + +#include <asm/ds.h> + +#include "trace.h" + + +#define SIZEOF_BTS (1 << 13) + +static DEFINE_PER_CPU(struct bts_tracer *, tracer); +static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); + +#define this_tracer per_cpu(tracer, smp_processor_id()) +#define this_buffer per_cpu(buffer, smp_processor_id()) + + +/* + * Information to interpret a BTS record. + * This will go into an in-kernel BTS interface. + */ +static unsigned char sizeof_field; +static unsigned long debugctl_mask; + +#define sizeof_bts (3 * sizeof_field) + +static void bts_trace_cpuinit(struct cpuinfo_x86 *c) +{ +	switch (c->x86) { +	case 0x6: +		switch (c->x86_model) { +		case 0x0 ... 
0xC: +			break; +		case 0xD: +		case 0xE: /* Pentium M */ +			sizeof_field = sizeof(long); +			debugctl_mask = (1<<6)|(1<<7); +			break; +		default: +			sizeof_field = 8; +			debugctl_mask = (1<<6)|(1<<7); +			break; +		} +		break; +	case 0xF: +		switch (c->x86_model) { +		case 0x0: +		case 0x1: +		case 0x2: /* Netburst */ +			sizeof_field = sizeof(long); +			debugctl_mask = (1<<2)|(1<<3); +			break; +		default: +			/* sorry, don't know about them */ +			break; +		} +		break; +	default: +		/* sorry, don't know about them */ +		break; +	} +} + +static inline void bts_enable(void) +{ +	unsigned long debugctl; + +	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask); +} + +static inline void bts_disable(void) +{ +	unsigned long debugctl; + +	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask); +} + +static void bts_trace_reset(struct trace_array *tr) +{ +	int cpu; + +	tr->time_start = ftrace_now(tr->cpu); + +	for_each_online_cpu(cpu) +		tracing_reset(tr, cpu); +} + +static void bts_trace_start_cpu(void *arg) +{ +	this_tracer = +		ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, +			       /* ovfl = */ NULL, /* th = */ (size_t)-1); +	if (IS_ERR(this_tracer)) { +		this_tracer = NULL; +		return; +	} + +	bts_enable(); +} + +static void bts_trace_start(struct trace_array *tr) +{ +	int cpu; + +	bts_trace_reset(tr); + +	for_each_cpu_mask(cpu, cpu_possible_map) +		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); +} + +static void bts_trace_stop_cpu(void *arg) +{ +	if (this_tracer) { +		bts_disable(); + +		ds_release_bts(this_tracer); +		this_tracer = NULL; +	} +} + +static void bts_trace_stop(struct trace_array *tr) +{ +	int cpu; + +	for_each_cpu_mask(cpu, cpu_possible_map) +		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); +} + +static int bts_trace_init(struct trace_array *tr) +{ +	bts_trace_cpuinit(&boot_cpu_data); +	bts_trace_reset(tr); +	bts_trace_start(tr); + +	return 0; +} + +static void bts_trace_print_header(struct seq_file *m) +{ +#ifdef __i386__ +	seq_puts(m, "# CPU#    FROM           TO     FUNCTION\n"); +	seq_puts(m, "#  |       |             |         |\n"); +#else +	seq_puts(m, +		 "# CPU#        FROM                   TO         FUNCTION\n"); +	seq_puts(m, +		 "#  |           |                     |             |\n"); +#endif +} + +static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) +{ +	struct trace_entry *entry = iter->ent; +	struct trace_seq *seq = &iter->seq; +	struct bts_entry *it; + +	trace_assign_type(it, entry); + +	if (entry->type == TRACE_BTS) { +		int ret; +#ifdef CONFIG_KALLSYMS +		char function[KSYM_SYMBOL_LEN]; +		sprint_symbol(function, it->from); +#else +		char *function = "<unknown>"; +#endif + +		ret = trace_seq_printf(seq, "%4d  0x%lx -> 0x%lx [%s]\n", +				       entry->cpu, it->from, it->to, function); +		if (!ret) +			return TRACE_TYPE_PARTIAL_LINE; +		return TRACE_TYPE_HANDLED; +	} +	return TRACE_TYPE_UNHANDLED; +} + +void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to) +{ +	struct ring_buffer_event *event; +	struct bts_entry *entry; +	unsigned long irq; + +	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); +	if (!event) +		return; +	entry	= ring_buffer_event_data(event); +	tracing_generic_entry_update(&entry->ent, 0, from); +	entry->ent.type = TRACE_BTS; +	entry->ent.cpu = smp_processor_id(); +	entry->from = from; +	entry->to   = to; +	ring_buffer_unlock_commit(tr->buffer, 
event, irq); +} + +static void trace_bts_at(struct trace_array *tr, size_t index) +{ +	const void *raw = NULL; +	unsigned long from, to; +	int err; + +	err = ds_access_bts(this_tracer, index, &raw); +	if (err < 0) +		return; + +	from = *(const unsigned long *)raw; +	to = *(const unsigned long *)((const char *)raw + sizeof_field); + +	trace_bts(tr, from, to); +} + +static void trace_bts_cpu(void *arg) +{ +	struct trace_array *tr = (struct trace_array *) arg; +	size_t index = 0, end = 0, i; +	int err; + +	if (!this_tracer) +		return; + +	bts_disable(); + +	err = ds_get_bts_index(this_tracer, &index); +	if (err < 0) +		goto out; + +	err = ds_get_bts_end(this_tracer, &end); +	if (err < 0) +		goto out; + +	for (i = index; i < end; i++) +		trace_bts_at(tr, i); + +	for (i = 0; i < index; i++) +		trace_bts_at(tr, i); + +out: +	bts_enable(); +} + +static void trace_bts_prepare(struct trace_iterator *iter) +{ +	int cpu; + +	for_each_cpu_mask(cpu, cpu_possible_map) +		smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); +} + +struct tracer bts_tracer __read_mostly = +{ +	.name		= "bts", +	.init		= bts_trace_init, +	.reset		= bts_trace_stop, +	.print_header	= bts_trace_print_header, +	.print_line	= bts_trace_print_line, +	.start		= bts_trace_start, +	.stop		= bts_trace_stop, +	.open		= trace_bts_prepare +}; + +__init static int init_bts_trace(void) +{ +	return register_tracer(&bts_tracer); +} +device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 433d650eda9..2a98a206acc 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -18,12 +18,14 @@ struct header_iter {  static struct trace_array *mmio_trace_array;  static bool overrun_detected; +static unsigned long prev_overruns;  static void mmio_reset_data(struct trace_array *tr)  {  	int cpu;  	overrun_detected = false; +	prev_overruns = 0;  	tr->time_start = ftrace_now(tr->cpu);  	for_each_online_cpu(cpu) @@ -123,16 +125,12 @@ static void mmio_close(struct trace_iterator *iter)  static unsigned long count_overruns(struct trace_iterator *iter)  { -	int cpu;  	unsigned long cnt = 0; -/* FIXME: */ -#if 0 -	for_each_online_cpu(cpu) { -		cnt += iter->overrun[cpu]; -		iter->overrun[cpu] = 0; -	} -#endif -	(void)cpu; +	unsigned long over = ring_buffer_overruns(iter->tr->buffer); + +	if (over > prev_overruns) +		cnt = over - prev_overruns; +	prev_overruns = over;  	return cnt;  } diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index eeac71c87c6..0197e2f6b54 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -130,11 +130,13 @@ my %weak;		# List of weak functions  my %convert;		# List of local functions used that needs conversion  my $type; +my $nm_regex;		# Find the local functions (return function)  my $section_regex;	# Find the start of a section  my $function_regex;	# Find the name of a function  			#    (return offset and func name)  my $mcount_regex;	# Find the call site to mcount (return offset) -my $alignment;         # The .align value to use for $mcount_section +my $alignment;		# The .align value to use for $mcount_section +my $section_type;	# Section header plus possible alignment command  if ($arch eq "x86") {      if ($bits == 64) { @@ -144,9 +146,18 @@ if ($arch eq "x86") {      }  } +# +# We base the defaults off of i386; the other archs may +# feel free to change them in the if statements below. 
+# +$nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; +$section_regex = "Disassembly of section\\s+(\\S+):"; +$function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; +$mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; +$section_type = '@progbits'; +$type = ".long"; +  if ($arch eq "x86_64") { -    $section_regex = "Disassembly of section\\s+(\\S+):"; -    $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:";      $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$";      $type = ".quad";      $alignment = 8; @@ -158,10 +169,6 @@ if ($arch eq "x86_64") {      $cc .= " -m64";  } elsif ($arch eq "i386") { -    $section_regex = "Disassembly of section\\s+(\\S+):"; -    $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; -    $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; -    $type = ".long";      $alignment = 4;      # force flags for this arch @@ -170,6 +177,27 @@ if ($arch eq "x86_64") {      $objcopy .= " -O elf32-i386";      $cc .= " -m32"; +} elsif ($arch eq "sh") { +    $alignment = 2; + +    # force flags for this arch +    $ld .= " -m shlelf_linux"; +    $objcopy .= " -O elf32-sh-linux"; +    $cc .= " -m32"; + +} elsif ($arch eq "powerpc") { +    $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)"; +    $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?.*?)>:"; +    $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s\\.?_mcount\$"; + +    if ($bits == 64) { +	$type = ".quad"; +    } + +} elsif ($arch eq "arm") { +    $alignment = 2; +    $section_type = '%progbits'; +  } else {      die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD";  } @@ -239,7 +267,7 @@ if (!$found_version) {  #  open (IN, "$nm $inputfile|") || die "error running $nm";  while (<IN>) { -    if (/^[0-9a-fA-F]+\s+t\s+(\S+)/) { +    if (/$nm_regex/) {  	$locals{$1} = 1;      } elsif (/^[0-9a-fA-F]+\s+([wW])\s+(\S+)/) {  	$weak{$2} = $1; @@ -290,8 +318,8 @@ sub update_funcs  	if (!$opened) {  	    open(FILE, ">$mcount_s") || die "can't create $mcount_s\n";  	    $opened = 1; -	    print FILE "\t.section $mcount_section,\"a\",\@progbits\n"; -	    print FILE "\t.align $alignment\n"; +	    print FILE "\t.section $mcount_section,\"a\",$section_type\n"; +	    print FILE "\t.align $alignment\n" if (defined($alignment));  	}  	printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset;      }
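For reference, a minimal sketch of the handle-based DS/BTS API this series converts callers to, using only calls that appear in the diffs above (ds_request_bts, ds_get_bts_index, ds_access_bts, ds_release_bts); the buffer size and the (size_t)-1 "no threshold" value mirror trace_bts.c and are otherwise illustrative, not prescriptive:

static unsigned char demo_buf[1 << 13];

static void bts_demo(void)
{
	struct bts_tracer *tracer;
	const void *rec = NULL;
	size_t idx = 0;

	/* trace the current cpu: task = NULL, no overflow callback */
	tracer = ds_request_bts(NULL, demo_buf, sizeof(demo_buf),
				NULL, (size_t)-1);
	if (IS_ERR(tracer))
		return;

	/* ... branch records accumulate in demo_buf ... */

	if (ds_get_bts_index(tracer, &idx) >= 0 && idx > 0)
		(void)ds_access_bts(tracer, 0, &rec);	/* record slot 0 */

	ds_release_bts(tracer);
}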