diff options
Diffstat (limited to 'arch/x86/kernel/ftrace.c')
| -rw-r--r-- | arch/x86/kernel/ftrace.c | 500 | 
1 files changed, 332 insertions, 168 deletions
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c9a281f272f..32ff36596ab 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,40 +24,21 @@  #include <trace/syscall.h>  #include <asm/cacheflush.h> +#include <asm/kprobes.h>  #include <asm/ftrace.h>  #include <asm/nops.h> -#include <asm/nmi.h> -  #ifdef CONFIG_DYNAMIC_FTRACE -/* - * modifying_code is set to notify NMIs that they need to use - * memory barriers when entering or exiting. But we don't want - * to burden NMIs with unnecessary memory barriers when code - * modification is not being done (which is most of the time). - * - * A mutex is already held when ftrace_arch_code_modify_prepare - * and post_process are called. No locks need to be taken here. - * - * Stop machine will make sure currently running NMIs are done - * and new NMIs will see the updated variable before we need - * to worry about NMIs doing memory barriers. - */ -static int modifying_code __read_mostly; -static DEFINE_PER_CPU(int, save_modifying_code); -  int ftrace_arch_code_modify_prepare(void)  {  	set_kernel_text_rw();  	set_all_modules_text_rw(); -	modifying_code = 1;  	return 0;  }  int ftrace_arch_code_modify_post_process(void)  { -	modifying_code = 0;  	set_all_modules_text_ro();  	set_kernel_text_ro();  	return 0; @@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)  	return calc.code;  } -/* - * Modifying code must take extra care. On an SMP machine, if - * the code being modified is also being executed on another CPU - * that CPU will have undefined results and possibly take a GPF. - * We use kstop_machine to stop other CPUS from exectuing code. - * But this does not stop NMIs from happening. We still need - * to protect against that. We separate out the modification of - * the code to take care of this. - * - * Two buffers are added: An IP buffer and a "code" buffer. - * - * 1) Put the instruction pointer into the IP buffer - *    and the new code into the "code" buffer. - * 2) Wait for any running NMIs to finish and set a flag that says - *    we are modifying code, it is done in an atomic operation. - * 3) Write the code - * 4) clear the flag. - * 5) Wait for any running NMIs to finish. - * - * If an NMI is executed, the first thing it does is to call - * "ftrace_nmi_enter". This will check if the flag is set to write - * and if it is, it will write what is in the IP and "code" buffers. - * - * The trick is, it does not matter if everyone is writing the same - * content to the code location. Also, if a CPU is executing code - * it is OK to write to that code location if the contents being written - * are the same as what exists. - */ - -#define MOD_CODE_WRITE_FLAG (1 << 31)	/* set when NMI should do the write */ -static atomic_t nmi_running = ATOMIC_INIT(0); -static int mod_code_status;		/* holds return value of text write */ -static void *mod_code_ip;		/* holds the IP to write to */ -static const void *mod_code_newcode;	/* holds the text to write to the IP */ - -static unsigned nmi_wait_count; -static atomic_t nmi_update_count = ATOMIC_INIT(0); - -int ftrace_arch_read_dyn_info(char *buf, int size) -{ -	int r; - -	r = snprintf(buf, size, "%u %u", -		     nmi_wait_count, -		     atomic_read(&nmi_update_count)); -	return r; -} - -static void clear_mod_flag(void) -{ -	int old = atomic_read(&nmi_running); - -	for (;;) { -		int new = old & ~MOD_CODE_WRITE_FLAG; - -		if (old == new) -			break; - -		old = atomic_cmpxchg(&nmi_running, old, new); -	} -} - -static void ftrace_mod_code(void) -{ -	/* -	 * Yes, more than one CPU process can be writing to mod_code_status. -	 *    (and the code itself) -	 * But if one were to fail, then they all should, and if one were -	 * to succeed, then they all should. -	 */ -	mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, -					     MCOUNT_INSN_SIZE); - -	/* if we fail, then kill any new writers */ -	if (mod_code_status) -		clear_mod_flag(); -} - -void ftrace_nmi_enter(void) -{ -	__this_cpu_write(save_modifying_code, modifying_code); - -	if (!__this_cpu_read(save_modifying_code)) -		return; - -	if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { -		smp_rmb(); -		ftrace_mod_code(); -		atomic_inc(&nmi_update_count); -	} -	/* Must have previous changes seen before executions */ -	smp_mb(); -} - -void ftrace_nmi_exit(void) -{ -	if (!__this_cpu_read(save_modifying_code)) -		return; - -	/* Finish all executions before clearing nmi_running */ -	smp_mb(); -	atomic_dec(&nmi_running); -} - -static void wait_for_nmi_and_set_mod_flag(void) -{ -	if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)) -		return; - -	do { -		cpu_relax(); -	} while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)); - -	nmi_wait_count++; -} - -static void wait_for_nmi(void) -{ -	if (!atomic_read(&nmi_running)) -		return; - -	do { -		cpu_relax(); -	} while (atomic_read(&nmi_running)); - -	nmi_wait_count++; -} -  static inline int  within(unsigned long addr, unsigned long start, unsigned long end)  { @@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)  	if (within(ip, (unsigned long)_text, (unsigned long)_etext))  		ip = (unsigned long)__va(__pa(ip)); -	mod_code_ip = (void *)ip; -	mod_code_newcode = new_code; - -	/* The buffers need to be visible before we let NMIs write them */ -	smp_mb(); - -	wait_for_nmi_and_set_mod_flag(); - -	/* Make sure all running NMIs have finished before we write the code */ -	smp_mb(); - -	ftrace_mod_code(); - -	/* Make sure the write happens before clearing the bit */ -	smp_mb(); - -	clear_mod_flag(); -	wait_for_nmi(); - -	return mod_code_status; +	return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);  }  static const unsigned char *ftrace_nop_replace(void) @@ -334,6 +168,336 @@ int ftrace_update_ftrace_func(ftrace_func_t func)  	return ret;  } +int modifying_ftrace_code __read_mostly; + +/* + * A breakpoint was added to the code address we are about to + * modify, and this is the handle that will just skip over it. + * We are either changing a nop into a trace call, or a trace + * call to a nop. While the change is taking place, we treat + * it just like it was a nop. + */ +int ftrace_int3_handler(struct pt_regs *regs) +{ +	if (WARN_ON_ONCE(!regs)) +		return 0; + +	if (!ftrace_location(regs->ip - 1)) +		return 0; + +	regs->ip += MCOUNT_INSN_SIZE - 1; + +	return 1; +} + +static int ftrace_write(unsigned long ip, const char *val, int size) +{ +	/* +	 * On x86_64, kernel text mappings are mapped read-only with +	 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead +	 * of the kernel text mapping to modify the kernel text. +	 * +	 * For 32bit kernels, these mappings are same and we can use +	 * kernel identity mapping to modify code. +	 */ +	if (within(ip, (unsigned long)_text, (unsigned long)_etext)) +		ip = (unsigned long)__va(__pa(ip)); + +	return probe_kernel_write((void *)ip, val, size); +} + +static int add_break(unsigned long ip, const char *old) +{ +	unsigned char replaced[MCOUNT_INSN_SIZE]; +	unsigned char brk = BREAKPOINT_INSTRUCTION; + +	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) +		return -EFAULT; + +	/* Make sure it is what we expect it to be */ +	if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) +		return -EINVAL; + +	if (ftrace_write(ip, &brk, 1)) +		return -EPERM; + +	return 0; +} + +static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned const char *old; +	unsigned long ip = rec->ip; + +	old = ftrace_call_replace(ip, addr); + +	return add_break(rec->ip, old); +} + + +static int add_brk_on_nop(struct dyn_ftrace *rec) +{ +	unsigned const char *old; + +	old = ftrace_nop_replace(); + +	return add_break(rec->ip, old); +} + +static int add_breakpoints(struct dyn_ftrace *rec, int enable) +{ +	unsigned long ftrace_addr; +	int ret; + +	ret = ftrace_test_record(rec, enable); + +	ftrace_addr = (unsigned long)FTRACE_ADDR; + +	switch (ret) { +	case FTRACE_UPDATE_IGNORE: +		return 0; + +	case FTRACE_UPDATE_MAKE_CALL: +		/* converting nop to call */ +		return add_brk_on_nop(rec); + +	case FTRACE_UPDATE_MAKE_NOP: +		/* converting a call to a nop */ +		return add_brk_on_call(rec, ftrace_addr); +	} +	return 0; +} + +/* + * On error, we need to remove breakpoints. This needs to + * be done caefully. If the address does not currently have a + * breakpoint, we know we are done. Otherwise, we look at the + * remaining 4 bytes of the instruction. If it matches a nop + * we replace the breakpoint with the nop. Otherwise we replace + * it with the call instruction. + */ +static int remove_breakpoint(struct dyn_ftrace *rec) +{ +	unsigned char ins[MCOUNT_INSN_SIZE]; +	unsigned char brk = BREAKPOINT_INSTRUCTION; +	const unsigned char *nop; +	unsigned long ftrace_addr; +	unsigned long ip = rec->ip; + +	/* If we fail the read, just give up */ +	if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) +		return -EFAULT; + +	/* If this does not have a breakpoint, we are done */ +	if (ins[0] != brk) +		return -1; + +	nop = ftrace_nop_replace(); + +	/* +	 * If the last 4 bytes of the instruction do not match +	 * a nop, then we assume that this is a call to ftrace_addr. +	 */ +	if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { +		/* +		 * For extra paranoidism, we check if the breakpoint is on +		 * a call that would actually jump to the ftrace_addr. +		 * If not, don't touch the breakpoint, we make just create +		 * a disaster. +		 */ +		ftrace_addr = (unsigned long)FTRACE_ADDR; +		nop = ftrace_call_replace(ip, ftrace_addr); + +		if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) +			return -EINVAL; +	} + +	return probe_kernel_write((void *)ip, &nop[0], 1); +} + +static int add_update_code(unsigned long ip, unsigned const char *new) +{ +	/* skip breakpoint */ +	ip++; +	new++; +	if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) +		return -EPERM; +	return 0; +} + +static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned long ip = rec->ip; +	unsigned const char *new; + +	new = ftrace_call_replace(ip, addr); +	return add_update_code(ip, new); +} + +static int add_update_nop(struct dyn_ftrace *rec) +{ +	unsigned long ip = rec->ip; +	unsigned const char *new; + +	new = ftrace_nop_replace(); +	return add_update_code(ip, new); +} + +static int add_update(struct dyn_ftrace *rec, int enable) +{ +	unsigned long ftrace_addr; +	int ret; + +	ret = ftrace_test_record(rec, enable); + +	ftrace_addr = (unsigned long)FTRACE_ADDR; + +	switch (ret) { +	case FTRACE_UPDATE_IGNORE: +		return 0; + +	case FTRACE_UPDATE_MAKE_CALL: +		/* converting nop to call */ +		return add_update_call(rec, ftrace_addr); + +	case FTRACE_UPDATE_MAKE_NOP: +		/* converting a call to a nop */ +		return add_update_nop(rec); +	} + +	return 0; +} + +static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned long ip = rec->ip; +	unsigned const char *new; + +	new = ftrace_call_replace(ip, addr); + +	if (ftrace_write(ip, new, 1)) +		return -EPERM; + +	return 0; +} + +static int finish_update_nop(struct dyn_ftrace *rec) +{ +	unsigned long ip = rec->ip; +	unsigned const char *new; + +	new = ftrace_nop_replace(); + +	if (ftrace_write(ip, new, 1)) +		return -EPERM; +	return 0; +} + +static int finish_update(struct dyn_ftrace *rec, int enable) +{ +	unsigned long ftrace_addr; +	int ret; + +	ret = ftrace_update_record(rec, enable); + +	ftrace_addr = (unsigned long)FTRACE_ADDR; + +	switch (ret) { +	case FTRACE_UPDATE_IGNORE: +		return 0; + +	case FTRACE_UPDATE_MAKE_CALL: +		/* converting nop to call */ +		return finish_update_call(rec, ftrace_addr); + +	case FTRACE_UPDATE_MAKE_NOP: +		/* converting a call to a nop */ +		return finish_update_nop(rec); +	} + +	return 0; +} + +static void do_sync_core(void *data) +{ +	sync_core(); +} + +static void run_sync(void) +{ +	int enable_irqs = irqs_disabled(); + +	/* We may be called with interrupts disbled (on bootup). */ +	if (enable_irqs) +		local_irq_enable(); +	on_each_cpu(do_sync_core, NULL, 1); +	if (enable_irqs) +		local_irq_disable(); +} + +void ftrace_replace_code(int enable) +{ +	struct ftrace_rec_iter *iter; +	struct dyn_ftrace *rec; +	const char *report = "adding breakpoints"; +	int count = 0; +	int ret; + +	for_ftrace_rec_iter(iter) { +		rec = ftrace_rec_iter_record(iter); + +		ret = add_breakpoints(rec, enable); +		if (ret) +			goto remove_breakpoints; +		count++; +	} + +	run_sync(); + +	report = "updating code"; + +	for_ftrace_rec_iter(iter) { +		rec = ftrace_rec_iter_record(iter); + +		ret = add_update(rec, enable); +		if (ret) +			goto remove_breakpoints; +	} + +	run_sync(); + +	report = "removing breakpoints"; + +	for_ftrace_rec_iter(iter) { +		rec = ftrace_rec_iter_record(iter); + +		ret = finish_update(rec, enable); +		if (ret) +			goto remove_breakpoints; +	} + +	run_sync(); + +	return; + + remove_breakpoints: +	ftrace_bug(ret, rec ? rec->ip : 0); +	printk(KERN_WARNING "Failed on %s (%d):\n", report, count); +	for_ftrace_rec_iter(iter) { +		rec = ftrace_rec_iter_record(iter); +		remove_breakpoint(rec); +	} +} + +void arch_ftrace_update_code(int command) +{ +	modifying_ftrace_code++; + +	ftrace_modify_all_code(command); + +	modifying_ftrace_code--; +} +  int __init ftrace_dyn_arch_init(void *data)  {  	/* The return code is retured via data */  |