Diffstat (limited to 'arch/x86/kernel'): 68 files changed, 2348 insertions, 1105 deletions
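
One of the additions below is a new CPU-matching helper, x86_match_cpu(), in arch/x86/kernel/cpu/match.c. As an illustration only (not part of the patch), here is a minimal sketch of how a driver could consume it. The first table entry is the one given in the new file's kerneldoc; the driver name "foo" and the choice of X86_FEATURE_AES are placeholders, and the empty entry terminates the table as the kerneldoc requires.

	#include <asm/cpu_device_id.h>
	#include <asm/processor.h>
	#include <linux/module.h>

	/* Hypothetical table: match one specific Intel family/model, or any CPU with AES. */
	static const struct x86_cpu_id foo_cpu_ids[] = {
		{ X86_VENDOR_INTEL, 6, 0x12 },		/* vendor, family, model; feature left as wildcard */
		{ X86_FEATURE_MATCH(X86_FEATURE_AES) },	/* wildcard vendor/family/model, require a feature bit */
		{}					/* terminating empty entry */
	};

	static int __init foo_init(void)
	{
		/* x86_match_cpu() returns the matching table entry, or NULL if the boot CPU matches none. */
		if (!x86_match_cpu(foo_cpu_ids))
			return -ENODEV;
		return 0;
	}
	module_init(foo_init);

As the comment in match.c notes, the helper always tests boot_cpu_data, so it assumes models and features are consistent across all CPUs. The combined diff follows.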
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5369059c07a..532d2e090e6 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o  obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o  obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o  obj-$(CONFIG_KPROBES)		+= kprobes.o +obj-$(CONFIG_OPTPROBES)		+= kprobes-opt.o  obj-$(CONFIG_MODULES)		+= module.o  obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o  obj-$(CONFIG_KGDB)		+= kgdb.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ce664f33ea8..406ed77216d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void)  #ifdef CONFIG_ACPI_HOTPLUG_CPU  #include <acpi/processor.h> -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)  {  #ifdef CONFIG_ACPI_NUMA  	int nid; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 8c3cdded6f2..359b6899a36 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -180,6 +180,7 @@ static struct apic apic_flat =  {  	.name				= "flat",  	.probe				= flat_probe,  	.acpi_madt_oem_check		= flat_acpi_madt_oem_check, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= flat_apic_id_registered,  	.irq_delivery_mode		= dest_LowestPrio, @@ -337,6 +338,7 @@ static struct apic apic_physflat =  {  	.name				= "physical flat",  	.probe				= physflat_probe,  	.acpi_madt_oem_check		= physflat_acpi_madt_oem_check, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= flat_apic_id_registered,  	.irq_delivery_mode		= dest_Fixed, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 775b82bc655..634ae6cdd5c 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -124,6 +124,7 @@ struct apic apic_noop = {  	.probe				= noop_probe,  	.acpi_madt_oem_check		= NULL, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= noop_apic_id_registered,  	.irq_delivery_mode		= dest_LowestPrio, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 09d3d8c1cd9..d9ea5f331ac 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -56,6 +56,12 @@ static unsigned int read_xapic_id(void)  	return get_apic_id(apic_read(APIC_ID));  } +static int numachip_apic_id_valid(int apicid) +{ +	/* Trust what bootloader passes in MADT */ +	return 1; +} +  static int numachip_apic_id_registered(void)  {  	return physid_isset(read_xapic_id(), phys_cpu_present_map); @@ -223,10 +229,11 @@ static int __init numachip_system_init(void)  }  early_initcall(numachip_system_init); -static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)  {  	if (!strncmp(oem_id, "NUMASC", 6)) {  		numachip_system = 1; +		setup_force_cpu_cap(X86_FEATURE_X2APIC);  		return 1;  	} @@ -238,6 +245,7 @@ static struct apic apic_numachip __refconst = {  	.name				= "NumaConnect system",  	.probe				= numachip_probe,  	.acpi_madt_oem_check		= numachip_acpi_madt_oem_check, +	.apic_id_valid			= numachip_apic_id_valid,  	.apic_id_registered		= numachip_apic_id_registered,  	.irq_delivery_mode		= dest_Fixed, diff --git 
a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 521bead0113..0cdec7065af 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -198,6 +198,7 @@ static struct apic apic_bigsmp = {  	.name				= "bigsmp",  	.probe				= probe_bigsmp,  	.acpi_madt_oem_check		= NULL, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= bigsmp_apic_id_registered,  	.irq_delivery_mode		= dest_Fixed, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 5d513bc47b6..e42d1d3b913 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -625,6 +625,7 @@ static struct apic __refdata apic_es7000_cluster = {  	.name				= "es7000",  	.probe				= probe_es7000,  	.acpi_madt_oem_check		= es7000_acpi_madt_oem_check_cluster, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= es7000_apic_id_registered,  	.irq_delivery_mode		= dest_LowestPrio, @@ -690,6 +691,7 @@ static struct apic __refdata apic_es7000 = {  	.name				= "es7000",  	.probe				= probe_es7000,  	.acpi_madt_oem_check		= es7000_acpi_madt_oem_check, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= es7000_apic_id_registered,  	.irq_delivery_mode		= dest_Fixed, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fb072754bc1..6d10a66fc5a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3967,18 +3967,36 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)  static __init int bad_ioapic(unsigned long address)  {  	if (nr_ioapics >= MAX_IO_APICS) { -		printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " -		       "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); +		pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n", +			MAX_IO_APICS, nr_ioapics);  		return 1;  	}  	if (!address) { -		printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" -		       " found in table, skipping!\n"); +		pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n");  		return 1;  	}  	return 0;  } +static __init int bad_ioapic_register(int idx) +{ +	union IO_APIC_reg_00 reg_00; +	union IO_APIC_reg_01 reg_01; +	union IO_APIC_reg_02 reg_02; + +	reg_00.raw = io_apic_read(idx, 0); +	reg_01.raw = io_apic_read(idx, 1); +	reg_02.raw = io_apic_read(idx, 2); + +	if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) { +		pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n", +			mpc_ioapic_addr(idx)); +		return 1; +	} + +	return 0; +} +  void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)  {  	int idx = 0; @@ -3995,6 +4013,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)  	ioapics[idx].mp_config.apicaddr = address;  	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + +	if (bad_ioapic_register(idx)) { +		clear_fixmap(FIX_IO_APIC_BASE_0 + idx); +		return; +	} +  	ioapics[idx].mp_config.apicid = io_apic_unique_id(id);  	ioapics[idx].mp_config.apicver = io_apic_get_version(idx); @@ -4015,10 +4039,10 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)  	if (gsi_cfg->gsi_end >= gsi_top)  		gsi_top = gsi_cfg->gsi_end + 1; -	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " -	       "GSI %d-%d\n", idx, mpc_ioapic_id(idx), -	       mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), -	       gsi_cfg->gsi_base, gsi_cfg->gsi_end); +	pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", +		idx, mpc_ioapic_id(idx), +		
mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), +		gsi_cfg->gsi_base, gsi_cfg->gsi_end);  	nr_ioapics++;  } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index c4a61ca1349..00d2422ca7c 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -478,6 +478,7 @@ static struct apic __refdata apic_numaq = {  	.name				= "NUMAQ",  	.probe				= probe_numaq,  	.acpi_madt_oem_check		= NULL, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= numaq_apic_id_registered,  	.irq_delivery_mode		= dest_LowestPrio, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 0787bb3412f..ff2c1b9aac4 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -92,6 +92,7 @@ static struct apic apic_default = {  	.name				= "default",  	.probe				= probe_default,  	.acpi_madt_oem_check		= NULL, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= default_apic_id_registered,  	.irq_delivery_mode		= dest_LowestPrio, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 19114423c58..fea000b27f0 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -496,6 +496,7 @@ static struct apic apic_summit = {  	.name				= "summit",  	.probe				= probe_summit,  	.acpi_madt_oem_check		= summit_acpi_madt_oem_check, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= summit_apic_id_registered,  	.irq_delivery_mode		= dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 50079587582..9193713060a 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -213,6 +213,7 @@ static struct apic apic_x2apic_cluster = {  	.name				= "cluster x2apic",  	.probe				= x2apic_cluster_probe,  	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= x2apic_apic_id_registered,  	.irq_delivery_mode		= dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index f5373dfde21..bcd1db6eaca 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -119,6 +119,7 @@ static struct apic apic_x2apic_phys = {  	.name				= "physical x2apic",  	.probe				= x2apic_phys_probe,  	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= x2apic_apic_id_registered,  	.irq_delivery_mode		= dest_Fixed, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 79b05b88aa1..fc477142585 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -351,6 +351,7 @@ static struct apic __refdata apic_x2apic_uv_x = {  	.name				= "UV large system",  	.probe				= uv_probe,  	.acpi_madt_oem_check		= uv_acpi_madt_oem_check, +	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= uv_apic_id_registered,  	.irq_delivery_mode		= dest_Fixed, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index f76623cbe26..5d56931a15b 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1234,8 +1234,7 @@ static int suspend(int vetoable)  	struct apm_user	*as;  	dpm_suspend_start(PMSG_SUSPEND); - -	dpm_suspend_noirq(PMSG_SUSPEND); +	dpm_suspend_end(PMSG_SUSPEND);  	local_irq_disable();  	syscore_suspend(); @@ -1259,9 +1258,9 @@ static int suspend(int vetoable)  	
syscore_resume();  	local_irq_enable(); -	dpm_resume_noirq(PMSG_RESUME); - +	dpm_resume_start(PMSG_RESUME);  	dpm_resume_end(PMSG_RESUME); +  	queue_event(APM_NORMAL_RESUME, NULL);  	spin_lock(&user_list_lock);  	for (as = user_list; as != NULL; as = as->next) { @@ -1277,7 +1276,7 @@ static void standby(void)  {  	int err; -	dpm_suspend_noirq(PMSG_SUSPEND); +	dpm_suspend_end(PMSG_SUSPEND);  	local_irq_disable();  	syscore_suspend(); @@ -1291,7 +1290,7 @@ static void standby(void)  	syscore_resume();  	local_irq_enable(); -	dpm_resume_noirq(PMSG_RESUME); +	dpm_resume_start(PMSG_RESUME);  }  static apm_event_t get_event(void) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 25f24dccdcf..6ab6aa2fdfd 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -16,6 +16,7 @@ obj-y			:= intel_cacheinfo.o scattered.o topology.o  obj-y			+= proc.o capflags.o powerflags.o common.o  obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o  obj-y			+= rdrand.o +obj-y			+= match.o  obj-$(CONFIG_X86_32)	+= bugs.o  obj-$(CONFIG_X86_64)	+= bugs_64.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f4773f4aae3..0a44b90602b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@  #include <linux/mm.h>  #include <linux/io.h> +#include <linux/sched.h>  #include <asm/processor.h>  #include <asm/apic.h>  #include <asm/cpu.h> @@ -456,6 +457,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)  	if (c->x86_power & (1 << 8)) {  		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);  		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); +		if (!check_tsc_unstable()) +			sched_clock_stable = 1;  	}  #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d43cad74f16..e49477444ff 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,6 +18,7 @@  #include <asm/archrandom.h>  #include <asm/hypervisor.h>  #include <asm/processor.h> +#include <asm/debugreg.h>  #include <asm/sections.h>  #include <linux/topology.h>  #include <linux/cpumask.h> @@ -28,6 +29,7 @@  #include <asm/apic.h>  #include <asm/desc.h>  #include <asm/i387.h> +#include <asm/fpu-internal.h>  #include <asm/mtrr.h>  #include <linux/numa.h>  #include <asm/asm.h> @@ -933,7 +935,7 @@ static const struct msr_range msr_range_array[] __cpuinitconst = {  	{ 0xc0011000, 0xc001103b},  }; -static void __cpuinit print_cpu_msr(void) +static void __cpuinit __print_cpu_msr(void)  {  	unsigned index_min, index_max;  	unsigned index; @@ -997,13 +999,13 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)  	else  		printk(KERN_CONT "\n"); -#ifdef CONFIG_SMP +	__print_cpu_msr(); +} + +void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) +{  	if (c->cpu_index < show_msr) -		print_cpu_msr(); -#else -	if (show_msr) -		print_cpu_msr(); -#endif +		__print_cpu_msr();  }  static __init int setup_disablecpuid(char *arg) @@ -1044,6 +1046,8 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =  DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +  /*   * Special IST stacks which the CPU switches to when it calls   * an IST-marked descriptor entry. 
Up to 7 stacks (hardware @@ -1111,6 +1115,7 @@ void debug_stack_reset(void)  DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;  EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);  #ifdef CONFIG_CC_STACKPROTECTOR  DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6b45e5e7a90..73d08ed98a6 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)  	l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;  } -static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, -					int index) +static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)  {  	int node; @@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);  #define CPUID4_INFO_IDX(x, y)	(&((per_cpu(ici_cpuid4_info, x))[y]))  #ifdef CONFIG_SMP -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) + +static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)  { -	struct _cpuid4_info	*this_leaf, *sibling_leaf; -	unsigned long num_threads_sharing; -	int index_msb, i, sibling; +	struct _cpuid4_info *this_leaf; +	int ret, i, sibling;  	struct cpuinfo_x86 *c = &cpu_data(cpu); -	if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { +	ret = 0; +	if (index == 3) { +		ret = 1;  		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {  			if (!per_cpu(ici_cpuid4_info, i))  				continue; @@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)  				set_bit(sibling, this_leaf->shared_cpu_map);  			}  		} -		return; +	} else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) { +		ret = 1; +		for_each_cpu(i, cpu_sibling_mask(cpu)) { +			if (!per_cpu(ici_cpuid4_info, i)) +				continue; +			this_leaf = CPUID4_INFO_IDX(i, index); +			for_each_cpu(sibling, cpu_sibling_mask(cpu)) { +				if (!cpu_online(sibling)) +					continue; +				set_bit(sibling, this_leaf->shared_cpu_map); +			} +		}  	} + +	return ret; +} + +static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ +	struct _cpuid4_info *this_leaf, *sibling_leaf; +	unsigned long num_threads_sharing; +	int index_msb, i; +	struct cpuinfo_x86 *c = &cpu_data(cpu); + +	if (c->x86_vendor == X86_VENDOR_AMD) { +		if (cache_shared_amd_cpu_map_setup(cpu, index)) +			return; +	} +  	this_leaf = CPUID4_INFO_IDX(cpu, index);  	num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c new file mode 100644 index 00000000000..5502b289341 --- /dev/null +++ b/arch/x86/kernel/cpu/match.c @@ -0,0 +1,91 @@ +#include <asm/cpu_device_id.h> +#include <asm/processor.h> +#include <linux/cpu.h> +#include <linux/module.h> +#include <linux/slab.h> + +/** + * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * @match: Pointer to array of x86_cpu_ids. Last entry terminated with + *         {}. + * + * Return the entry if the current CPU matches the entries in the + * passed x86_cpu_id match table. Otherwise NULL.  The match table + * contains vendor (X86_VENDOR_*), family, model and feature bits or + * respective wildcard entries. 
+ * + * A typical table entry would be to match a specific CPU + * { X86_VENDOR_INTEL, 6, 0x12 } + * or to match a specific CPU feature + * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, + * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * + * Arrays used to match for this should also be declared using + * MODULE_DEVICE_TABLE(x86_cpu, ...) + * + * This always matches against the boot cpu, assuming models and features are + * consistent over all CPUs. + */ +const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) +{ +	const struct x86_cpu_id *m; +	struct cpuinfo_x86 *c = &boot_cpu_data; + +	for (m = match; m->vendor | m->family | m->model | m->feature; m++) { +		if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) +			continue; +		if (m->family != X86_FAMILY_ANY && c->x86 != m->family) +			continue; +		if (m->model != X86_MODEL_ANY && c->x86_model != m->model) +			continue; +		if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) +			continue; +		return m; +	} +	return NULL; +} +EXPORT_SYMBOL(x86_match_cpu); + +ssize_t arch_print_cpu_modalias(struct device *dev, +				struct device_attribute *attr, +				char *bufptr) +{ +	int size = PAGE_SIZE; +	int i, n; +	char *buf = bufptr; + +	n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:" +		     "model:%04X:feature:", +		boot_cpu_data.x86_vendor, +		boot_cpu_data.x86, +		boot_cpu_data.x86_model); +	size -= n; +	buf += n; +	size -= 1; +	for (i = 0; i < NCAPINTS*32; i++) { +		if (boot_cpu_has(i)) { +			n = snprintf(buf, size, ",%04X", i); +			if (n >= size) { +				WARN(1, "x86 features overflow page\n"); +				break; +			} +			size -= n; +			buf += n; +		} +	} +	*buf++ = '\n'; +	return buf - bufptr; +} + +int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env) +{ +	char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); +	if (buf) { +		arch_print_cpu_modalias(NULL, NULL, buf); +		add_uevent_var(env, "MODALIAS=%s", buf); +		kfree(buf); +	} +	return 0; +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 7395d5f4272..0c82091b165 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -54,7 +54,14 @@ static struct severity {  #define  MASK(x, y)	.mask = x, .result = y  #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)  #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define	MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)  #define MCACOD 0xffff +/* Architecturally defined codes from SDM Vol. 
3B Chapter 15 */ +#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */ +#define MCACOD_SCRUBMSK	0xfff0 +#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */ +#define MCACOD_DATA	0x0134	/* Data Load */ +#define MCACOD_INSTR	0x0150	/* Instruction Fetch */  	MCESEV(  		NO, "Invalid", @@ -102,11 +109,24 @@ static struct severity {  		SER, BITCLR(MCI_STATUS_S)  		), -	/* AR add known MCACODs here */  	MCESEV(  		PANIC, "Action required with lost events",  		SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)  		), + +	/* known AR MCACODs: */ +#ifdef	CONFIG_MEMORY_FAILURE +	MCESEV( +		KEEP, "HT thread notices Action required: data load error", +		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), +		MCGMASK(MCG_STATUS_EIPV, 0) +		), +	MCESEV( +		AR, "Action required: data load error", +		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), +		USER +		), +#endif  	MCESEV(  		PANIC, "Action required: unknown MCACOD",  		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) @@ -115,11 +135,11 @@ static struct severity {  	/* known AO MCACODs: */  	MCESEV(  		AO, "Action optional: memory scrubbing error", -		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0) +		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)  		),  	MCESEV(  		AO, "Action optional: last level cache writeback error", -		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a) +		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)  		),  	MCESEV(  		SOME, "Action optional: unknown MCACOD", diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a11ae2e9e9..d086a09c087 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -191,7 +191,7 @@ static void drain_mcelog_buffer(void)  {  	unsigned int next, i, prev = 0; -	next = rcu_dereference_check_mce(mcelog.next); +	next = ACCESS_ONCE(mcelog.next);  	do {  		struct mce *m; @@ -540,6 +540,27 @@ static void mce_report_event(struct pt_regs *regs)  	irq_work_queue(&__get_cpu_var(mce_irq_work));  } +/* + * Read ADDR and MISC registers. + */ +static void mce_read_aux(struct mce *m, int i) +{ +	if (m->status & MCI_STATUS_MISCV) +		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); +	if (m->status & MCI_STATUS_ADDRV) { +		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + +		/* +		 * Mask the reported address by the reported granularity. +		 */ +		if (mce_ser && (m->status & MCI_STATUS_MISCV)) { +			u8 shift = MCI_MISC_ADDR_LSB(m->misc); +			m->addr >>= shift; +			m->addr <<= shift; +		} +	} +} +  DEFINE_PER_CPU(unsigned, mce_poll_count);  /* @@ -590,10 +611,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)  		    (m.status & (mce_ser ? 
MCI_STATUS_S : MCI_STATUS_UC)))  			continue; -		if (m.status & MCI_STATUS_MISCV) -			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); -		if (m.status & MCI_STATUS_ADDRV) -			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); +		mce_read_aux(&m, i);  		if (!(flags & MCP_TIMESTAMP))  			m.tsc = 0; @@ -917,6 +935,49 @@ static void mce_clear_state(unsigned long *toclear)  }  /* + * Need to save faulting physical address associated with a process + * in the machine check handler some place where we can grab it back + * later in mce_notify_process() + */ +#define	MCE_INFO_MAX	16 + +struct mce_info { +	atomic_t		inuse; +	struct task_struct	*t; +	__u64			paddr; +} mce_info[MCE_INFO_MAX]; + +static void mce_save_info(__u64 addr) +{ +	struct mce_info *mi; + +	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { +		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { +			mi->t = current; +			mi->paddr = addr; +			return; +		} +	} + +	mce_panic("Too many concurrent recoverable errors", NULL, NULL); +} + +static struct mce_info *mce_find_info(void) +{ +	struct mce_info *mi; + +	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) +		if (atomic_read(&mi->inuse) && mi->t == current) +			return mi; +	return NULL; +} + +static void mce_clear_info(struct mce_info *mi) +{ +	atomic_set(&mi->inuse, 0); +} + +/*   * The actual machine check handler. This only handles real   * exceptions when something got corrupted coming in through int 18.   * @@ -969,7 +1030,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)  	barrier();  	/* -	 * When no restart IP must always kill or panic. +	 * When no restart IP might need to kill or panic. +	 * Assume the worst for now, but if we find the +	 * severity is MCE_AR_SEVERITY we have other options.  	 */  	if (!(m.mcgstatus & MCG_STATUS_RIPV))  		kill_it = 1; @@ -1023,16 +1086,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)  			continue;  		} -		/* -		 * Kill on action required. -		 */ -		if (severity == MCE_AR_SEVERITY) -			kill_it = 1; - -		if (m.status & MCI_STATUS_MISCV) -			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); -		if (m.status & MCI_STATUS_ADDRV) -			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); +		mce_read_aux(&m, i);  		/*  		 * Action optional error. Queue address for later processing. @@ -1052,6 +1106,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)  		}  	} +	/* mce_clear_state will clear *final, save locally for use later */ +	m = *final; +  	if (!no_way_out)  		mce_clear_state(toclear); @@ -1063,27 +1120,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)  		no_way_out = worst >= MCE_PANIC_SEVERITY;  	/* -	 * If we have decided that we just CAN'T continue, and the user -	 * has not set tolerant to an insane level, give up and die. -	 * -	 * This is mainly used in the case when the system doesn't -	 * support MCE broadcasting or it has been disabled. -	 */ -	if (no_way_out && tolerant < 3) -		mce_panic("Fatal machine check on current CPU", final, msg); - -	/* -	 * If the error seems to be unrecoverable, something should be -	 * done.  Try to kill as little as possible.  If we can kill just -	 * one task, do that.  If the user has set the tolerance very -	 * high, don't try to do anything at all. +	 * At insane "tolerant" levels we take no action. Otherwise +	 * we only die if we have no other choice. For less serious +	 * issues we try to recover, or limit damage to the current +	 * process.  	 
*/ - -	if (kill_it && tolerant < 3) -		force_sig(SIGBUS, current); - -	/* notify userspace ASAP */ -	set_thread_flag(TIF_MCE_NOTIFY); +	if (tolerant < 3) { +		if (no_way_out) +			mce_panic("Fatal machine check on current CPU", &m, msg); +		if (worst == MCE_AR_SEVERITY) { +			/* schedule action before return to userland */ +			mce_save_info(m.addr); +			set_thread_flag(TIF_MCE_NOTIFY); +		} else if (kill_it) { +			force_sig(SIGBUS, current); +		} +	}  	if (worst > 0)  		mce_report_event(regs); @@ -1094,34 +1146,57 @@ out:  }  EXPORT_SYMBOL_GPL(do_machine_check); -/* dummy to break dependency. actual code is in mm/memory-failure.c */ -void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +#ifndef CONFIG_MEMORY_FAILURE +int memory_failure(unsigned long pfn, int vector, int flags)  { -	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); +	/* mce_severity() should not hand us an ACTION_REQUIRED error */ +	BUG_ON(flags & MF_ACTION_REQUIRED); +	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" +		"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); + +	return 0;  } +#endif  /* - * Called after mce notification in process context. This code - * is allowed to sleep. Call the high level VM handler to process - * any corrupted pages. - * Assume that the work queue code only calls this one at a time - * per CPU. - * Note we don't disable preemption, so this code might run on the wrong - * CPU. In this case the event is picked up by the scheduled work queue. - * This is merely a fast path to expedite processing in some common - * cases. + * Called in process context that interrupted by MCE and marked with + * TIF_MCE_NOTIFY, just before returning to erroneous userland. + * This code is allowed to sleep. + * Attempt possible recovery such as calling the high level VM handler to + * process any corrupted pages, and kill/signal current process if required. + * Action required errors are handled here.   */  void mce_notify_process(void)  {  	unsigned long pfn; -	mce_notify_irq(); -	while (mce_ring_get(&pfn)) -		memory_failure(pfn, MCE_VECTOR); +	struct mce_info *mi = mce_find_info(); + +	if (!mi) +		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); +	pfn = mi->paddr >> PAGE_SHIFT; + +	clear_thread_flag(TIF_MCE_NOTIFY); + +	pr_err("Uncorrected hardware memory error in user-access at %llx", +		 mi->paddr); +	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { +		pr_err("Memory error not recovered"); +		force_sig(SIGBUS, current); +	} +	mce_clear_info(mi);  } +/* + * Action optional processing happens here (picking up + * from the list of faulting pages that do_machine_check() + * placed into the "ring"). 
+ */  static void mce_process_work(struct work_struct *dummy)  { -	mce_notify_process(); +	unsigned long pfn; + +	while (mce_ring_get(&pfn)) +		memory_failure(pfn, MCE_VECTOR, 0);  }  #ifdef CONFIG_X86_MCE_INTEL @@ -1211,8 +1286,6 @@ int mce_notify_irq(void)  	/* Not more than two messages every minute */  	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); -	clear_thread_flag(TIF_MCE_NOTIFY); -  	if (test_and_clear_bit(0, &mce_need_notify)) {  		/* wake processes polling /dev/mcelog */  		wake_up_interruptible(&mce_chrdev_wait); @@ -1541,6 +1614,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)  	/* Error or no more MCE record */  	if (rc <= 0) {  		mce_apei_read_done = 1; +		/* +		 * When ERST is disabled, mce_chrdev_read() should return +		 * "no record" instead of "no device." +		 */ +		if (rc == -ENODEV) +			return 0;  		return rc;  	}  	rc = -EFAULT; @@ -1859,7 +1938,7 @@ static struct bus_type mce_subsys = {  	.dev_name	= "machinecheck",  }; -struct device *mce_device[CONFIG_NR_CPUS]; +DEFINE_PER_CPU(struct device *, mce_device);  __cpuinitdata  void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -2038,7 +2117,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)  			goto error2;  	}  	cpumask_set_cpu(cpu, mce_device_initialized); -	mce_device[cpu] = dev; +	per_cpu(mce_device, cpu) = dev;  	return 0;  error2: @@ -2055,7 +2134,7 @@ error:  static __cpuinit void mce_device_remove(unsigned int cpu)  { -	struct device *dev = mce_device[cpu]; +	struct device *dev = per_cpu(mce_device, cpu);  	int i;  	if (!cpumask_test_cpu(cpu, mce_device_initialized)) @@ -2069,7 +2148,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)  	device_unregister(dev);  	cpumask_clear_cpu(cpu, mce_device_initialized); -	mce_device[cpu] = NULL; +	per_cpu(mce_device, cpu) = NULL;  }  /* Make sure there are no machine checks on offlined CPUs. 
*/ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 786e76a8632..99b57179f91 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -523,11 +523,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  {  	int i, err = 0;  	struct threshold_bank *b = NULL; -	struct device *dev = mce_device[cpu]; +	struct device *dev = per_cpu(mce_device, cpu);  	char name[32];  	sprintf(name, "threshold_bank%i", bank); +#ifdef CONFIG_SMP  	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {	/* symlink */  		i = cpumask_first(cpu_llc_shared_mask(cpu)); @@ -553,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		goto out;  	} +#endif  	b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);  	if (!b) { @@ -585,7 +587,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		if (i == cpu)  			continue; -		dev = mce_device[i]; +		dev = per_cpu(mce_device, i);  		if (dev)  			err = sysfs_create_link(&dev->kobj,b->kobj, name);  		if (err) @@ -665,7 +667,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  #ifdef CONFIG_SMP  	/* sibling symlink */  	if (shared_bank[bank] && b->blocks->cpu != cpu) { -		sysfs_remove_link(&mce_device[cpu]->kobj, name); +		dev = per_cpu(mce_device, cpu); +		sysfs_remove_link(&dev->kobj, name);  		per_cpu(threshold_banks, cpu)[bank] = NULL;  		return; @@ -677,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  		if (i == cpu)  			continue; -		dev = mce_device[i]; +		dev = per_cpu(mce_device, i);  		if (dev)  			sysfs_remove_link(&dev->kobj, name);  		per_cpu(threshold_banks, i)[bank] = NULL; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5adce1040b1..fa2900c0e39 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -24,6 +24,7 @@  #include <linux/slab.h>  #include <linux/cpu.h>  #include <linux/bitops.h> +#include <linux/device.h>  #include <asm/apic.h>  #include <asm/stacktrace.h> @@ -31,6 +32,7 @@  #include <asm/compat.h>  #include <asm/smp.h>  #include <asm/alternative.h> +#include <asm/timer.h>  #include "perf_event.h" @@ -351,6 +353,36 @@ int x86_setup_perfctr(struct perf_event *event)  	return 0;  } +/* + * check that branch_sample_type is compatible with + * settings needed for precise_ip > 1 which implies + * using the LBR to capture ALL taken branches at the + * priv levels of the measurement + */ +static inline int precise_br_compat(struct perf_event *event) +{ +	u64 m = event->attr.branch_sample_type; +	u64 b = 0; + +	/* must capture all branches */ +	if (!(m & PERF_SAMPLE_BRANCH_ANY)) +		return 0; + +	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; + +	if (!event->attr.exclude_user) +		b |= PERF_SAMPLE_BRANCH_USER; + +	if (!event->attr.exclude_kernel) +		b |= PERF_SAMPLE_BRANCH_KERNEL; + +	/* +	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 +	 */ + +	return m == b; +} +  int x86_pmu_hw_config(struct perf_event *event)  {  	if (event->attr.precise_ip) { @@ -367,6 +399,36 @@ int x86_pmu_hw_config(struct perf_event *event)  		if (event->attr.precise_ip > precise)  			return -EOPNOTSUPP; +		/* +		 * check that PEBS LBR correction does not conflict with +		 * whatever the user is asking with attr->branch_sample_type +		 */ +		if (event->attr.precise_ip > 1) { +			u64 *br_type = &event->attr.branch_sample_type; + +			if (has_branch_stack(event)) { +				if 
(!precise_br_compat(event)) +					return -EOPNOTSUPP; + +				/* branch_sample_type is compatible */ + +			} else { +				/* +				 * user did not specify  branch_sample_type +				 * +				 * For PEBS fixups, we capture all +				 * the branches at the priv level of the +				 * event. +				 */ +				*br_type = PERF_SAMPLE_BRANCH_ANY; + +				if (!event->attr.exclude_user) +					*br_type |= PERF_SAMPLE_BRANCH_USER; + +				if (!event->attr.exclude_kernel) +					*br_type |= PERF_SAMPLE_BRANCH_KERNEL; +			} +		}  	}  	/* @@ -424,6 +486,10 @@ static int __x86_pmu_event_init(struct perf_event *event)  	/* mark unused */  	event->hw.extra_reg.idx = EXTRA_REG_NONE; +	/* mark not used */ +	event->hw.extra_reg.idx = EXTRA_REG_NONE; +	event->hw.branch_reg.idx = EXTRA_REG_NONE; +  	return x86_pmu.hw_config(event);  } @@ -577,14 +643,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)  	/* Prefer fixed purpose counters */  	if (x86_pmu.num_counters_fixed) {  		idx = X86_PMC_IDX_FIXED; -		for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { +		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {  			if (!__test_and_set_bit(idx, sched->state.used))  				goto done;  		}  	}  	/* Grab the first unused counter starting with idx */  	idx = sched->state.counter; -	for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { +	for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) {  		if (!__test_and_set_bit(idx, sched->state.used))  			goto done;  	} @@ -1210,6 +1276,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)  		break;  	case CPU_STARTING: +		if (x86_pmu.attr_rdpmc) +			set_in_cr4(X86_CR4_PCE);  		if (x86_pmu.cpu_starting)  			x86_pmu.cpu_starting(cpu);  		break; @@ -1319,6 +1387,8 @@ static int __init init_hw_perf_events(void)  		}  	} +	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ +  	pr_info("... version:                %d\n",     x86_pmu.version);  	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);  	pr_info("... 
generic registers:      %d\n",     x86_pmu.num_counters); @@ -1542,23 +1612,106 @@ static int x86_pmu_event_init(struct perf_event *event)  	return err;  } +static int x86_pmu_event_idx(struct perf_event *event) +{ +	int idx = event->hw.idx; + +	if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { +		idx -= X86_PMC_IDX_FIXED; +		idx |= 1 << 30; +	} + +	return idx + 1; +} + +static ssize_t get_attr_rdpmc(struct device *cdev, +			      struct device_attribute *attr, +			      char *buf) +{ +	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} + +static void change_rdpmc(void *info) +{ +	bool enable = !!(unsigned long)info; + +	if (enable) +		set_in_cr4(X86_CR4_PCE); +	else +		clear_in_cr4(X86_CR4_PCE); +} + +static ssize_t set_attr_rdpmc(struct device *cdev, +			      struct device_attribute *attr, +			      const char *buf, size_t count) +{ +	unsigned long val = simple_strtoul(buf, NULL, 0); + +	if (!!val != !!x86_pmu.attr_rdpmc) { +		x86_pmu.attr_rdpmc = !!val; +		smp_call_function(change_rdpmc, (void *)val, 1); +	} + +	return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { +	&dev_attr_rdpmc.attr, +	NULL, +}; + +static struct attribute_group x86_pmu_attr_group = { +	.attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { +	&x86_pmu_attr_group, +	NULL, +}; + +static void x86_pmu_flush_branch_stack(void) +{ +	if (x86_pmu.flush_branch_stack) +		x86_pmu.flush_branch_stack(); +} +  static struct pmu pmu = { -	.pmu_enable	= x86_pmu_enable, -	.pmu_disable	= x86_pmu_disable, +	.pmu_enable		= x86_pmu_enable, +	.pmu_disable		= x86_pmu_disable, + +	.attr_groups	= x86_pmu_attr_groups,  	.event_init	= x86_pmu_event_init, -	.add		= x86_pmu_add, -	.del		= x86_pmu_del, -	.start		= x86_pmu_start, -	.stop		= x86_pmu_stop, -	.read		= x86_pmu_read, +	.add			= x86_pmu_add, +	.del			= x86_pmu_del, +	.start			= x86_pmu_start, +	.stop			= x86_pmu_stop, +	.read			= x86_pmu_read,  	.start_txn	= x86_pmu_start_txn,  	.cancel_txn	= x86_pmu_cancel_txn,  	.commit_txn	= x86_pmu_commit_txn, + +	.event_idx	= x86_pmu_event_idx, +	.flush_branch_stack	= x86_pmu_flush_branch_stack,  }; +void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ +	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) +		return; + +	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) +		return; + +	userpg->time_mult = this_cpu_read(cyc2ns); +	userpg->time_shift = CYC2NS_SCALE_FACTOR; +	userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; +} +  /*   * callchain support   */ diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8944062f46e..8484e77c211 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -33,6 +33,7 @@ enum extra_reg_type {  	EXTRA_REG_RSP_0 = 0,	/* offcore_response_0 */  	EXTRA_REG_RSP_1 = 1,	/* offcore_response_1 */ +	EXTRA_REG_LBR   = 2,	/* lbr_select */  	EXTRA_REG_MAX		/* number of entries needed */  }; @@ -130,6 +131,8 @@ struct cpu_hw_events {  	void				*lbr_context;  	struct perf_branch_stack	lbr_stack;  	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES]; +	struct er_account		*lbr_sel; +	u64				br_sel;  	/*  	 * Intel host/guest exclude bits @@ -147,7 +150,9 @@ struct cpu_hw_events {  	/*  	 * AMD specific bits  	 */ -	struct amd_nb		*amd_nb; +	struct amd_nb			*amd_nb; +	/* Inverted mask of bits to clear in the perf_ctr ctrl registers */ +	u64				perf_ctr_virt_mask;  	void				*kfree_on_online;  }; @@ 
-266,6 +271,29 @@ struct x86_pmu_quirk {  	void (*func)(void);  }; +union x86_pmu_config { +	struct { +		u64 event:8, +		    umask:8, +		    usr:1, +		    os:1, +		    edge:1, +		    pc:1, +		    interrupt:1, +		    __reserved1:1, +		    en:1, +		    inv:1, +		    cmask:8, +		    event2:4, +		    __reserved2:4, +		    go:1, +		    ho:1; +	} bits; +	u64 value; +}; + +#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value +  /*   * struct x86_pmu - generic x86 pmu   */ @@ -307,10 +335,19 @@ struct x86_pmu {  	struct x86_pmu_quirk *quirks;  	int		perfctr_second_write; +	/* +	 * sysfs attrs +	 */ +	int		attr_rdpmc; + +	/* +	 * CPU Hotplug hooks +	 */  	int		(*cpu_prepare)(int cpu);  	void		(*cpu_starting)(int cpu);  	void		(*cpu_dying)(int cpu);  	void		(*cpu_dead)(int cpu); +	void		(*flush_branch_stack)(void);  	/*  	 * Intel Arch Perfmon v2+ @@ -332,6 +369,8 @@ struct x86_pmu {  	 */  	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */  	int		lbr_nr;			   /* hardware stack size */ +	u64		lbr_sel_mask;		   /* LBR_SELECT valid bits */ +	const int	*lbr_sel_map;		   /* lbr_select mappings */  	/*  	 * Extra registers for events @@ -417,9 +456,11 @@ void x86_pmu_disable_all(void);  static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,  					  u64 enable_mask)  { +	u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); +  	if (hwc->extra_reg.reg)  		wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); -	wrmsrl(hwc->config_base, hwc->config | enable_mask); +	wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);  }  void x86_pmu_enable_all(int added); @@ -443,6 +484,15 @@ extern struct event_constraint emptyconstraint;  extern struct event_constraint unconstrained; +static inline bool kernel_ip(unsigned long ip) +{ +#ifdef CONFIG_X86_32 +	return ip > PAGE_OFFSET; +#else +	return (long)ip < 0; +#endif +} +  #ifdef CONFIG_CPU_SUP_AMD  int amd_pmu_init(void); @@ -523,6 +573,10 @@ void intel_pmu_lbr_init_nhm(void);  void intel_pmu_lbr_init_atom(void); +void intel_pmu_lbr_init_snb(void); + +int intel_pmu_setup_lbr_filter(struct perf_event *event); +  int p4_pmu_init(void);  int p6_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 0397b23be8e..dd002faff7a 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,4 +1,5 @@  #include <linux/perf_event.h> +#include <linux/export.h>  #include <linux/types.h>  #include <linux/init.h>  #include <linux/slab.h> @@ -138,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event)  	if (ret)  		return ret; +	if (has_branch_stack(event)) +		return -EOPNOTSUPP; +  	if (event->attr.exclude_host && event->attr.exclude_guest)  		/*  		 * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -357,7 +361,9 @@ static void amd_pmu_cpu_starting(int cpu)  	struct amd_nb *nb;  	int i, nb_id; -	if (boot_cpu_data.x86_max_cores < 2) +	cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + +	if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15)  		return;  	nb_id = amd_get_nb_id(cpu); @@ -587,9 +593,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {  	.put_event_constraints	= amd_put_event_constraints,  	.cpu_prepare		= amd_pmu_cpu_prepare, -	.cpu_starting		= amd_pmu_cpu_starting,  	.cpu_dead		= amd_pmu_cpu_dead,  #endif +	.cpu_starting		= amd_pmu_cpu_starting,  };  __init int amd_pmu_init(void) @@ -621,3 +627,33 @@ __init int amd_pmu_init(void)  	return 0;  
} + +void amd_pmu_enable_virt(void) +{ +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + +	cpuc->perf_ctr_virt_mask = 0; + +	/* Reload all events */ +	x86_pmu_disable_all(); +	x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); + +void amd_pmu_disable_virt(void) +{ +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + +	/* +	 * We only mask out the Host-only bit so that host-only counting works +	 * when SVM is disabled. If someone sets up a guest-only counter when +	 * SVM is disabled the Guest-only bits still gets set and the counter +	 * will not count anything. +	 */ +	cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + +	/* Reload all events */ +	x86_pmu_disable_all(); +	x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3bd37bdf1b8..6a84e7f28f0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -385,14 +385,15 @@ static __initconst const u64 westmere_hw_cache_event_ids  #define NHM_LOCAL_DRAM		(1 << 14)  #define NHM_NON_DRAM		(1 << 15) -#define NHM_ALL_DRAM		(NHM_REMOTE_DRAM|NHM_LOCAL_DRAM) +#define NHM_LOCAL		(NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_REMOTE		(NHM_REMOTE_DRAM)  #define NHM_DMND_READ		(NHM_DMND_DATA_RD)  #define NHM_DMND_WRITE		(NHM_DMND_RFO|NHM_DMND_WB)  #define NHM_DMND_PREFETCH	(NHM_PF_DATA_RD|NHM_PF_DATA_RFO)  #define NHM_L3_HIT	(NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) -#define NHM_L3_MISS	(NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_L3_MISS	(NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)  #define NHM_L3_ACCESS	(NHM_L3_HIT|NHM_L3_MISS)  static __initconst const u64 nehalem_hw_cache_extra_regs @@ -416,16 +417,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs   },   [ C(NODE) ] = {  	[ C(OP_READ) ] = { -		[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM, -		[ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE_DRAM, +		[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, +		[ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE,  	},  	[ C(OP_WRITE) ] = { -		[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM, -		[ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM, +		[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, +		[ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE,  	},  	[ C(OP_PREFETCH) ] = { -		[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM, -		[ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM, +		[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, +		[ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE,  	},   },  }; @@ -727,6 +728,19 @@ static __initconst const u64 atom_hw_cache_event_ids   },  }; +static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) +{ +	/* user explicitly requested branch sampling */ +	if (has_branch_stack(event)) +		return true; + +	/* implicit branch sampling to correct PEBS skid */ +	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) +		return true; + +	return false; +} +  static void intel_pmu_disable_all(void)  {  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -881,6 +895,13 @@ static void intel_pmu_disable_event(struct perf_event *event)  	cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);  	cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); +	/* +	 * must disable before any actual event +	 * because any event may be combined with LBR +	 */ +	if 
(intel_pmu_needs_lbr_smpl(event)) +		intel_pmu_lbr_disable(event); +  	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {  		intel_pmu_disable_fixed(hwc);  		return; @@ -935,6 +956,12 @@ static void intel_pmu_enable_event(struct perf_event *event)  		intel_pmu_enable_bts(hwc->config);  		return;  	} +	/* +	 * must enabled before any actual event +	 * because any event may be combined with LBR +	 */ +	if (intel_pmu_needs_lbr_smpl(event)) +		intel_pmu_lbr_enable(event);  	if (event->attr.exclude_host)  		cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); @@ -1057,6 +1084,9 @@ again:  		data.period = event->hw.last_period; +		if (has_branch_stack(event)) +			data.br_stack = &cpuc->lbr_stack; +  		if (perf_event_overflow(event, &data, regs))  			x86_pmu_stop(event, 0);  	} @@ -1123,17 +1153,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx)   */  static struct event_constraint *  __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, -				   struct perf_event *event) +				   struct perf_event *event, +				   struct hw_perf_event_extra *reg)  {  	struct event_constraint *c = &emptyconstraint; -	struct hw_perf_event_extra *reg = &event->hw.extra_reg;  	struct er_account *era;  	unsigned long flags;  	int orig_idx = reg->idx;  	/* already allocated shared msr */  	if (reg->alloc) -		return &unconstrained; +		return NULL; /* call x86_get_event_constraint() */  again:  	era = &cpuc->shared_regs->regs[reg->idx]; @@ -1156,14 +1186,10 @@ again:  		reg->alloc = 1;  		/* -		 * All events using extra_reg are unconstrained. -		 * Avoids calling x86_get_event_constraints() -		 * -		 * Must revisit if extra_reg controlling events -		 * ever have constraints. Worst case we go through -		 * the regular event constraint table. +		 * need to call x86_get_event_constraint() +		 * to check if associated event has constraints  		 */ -		c = &unconstrained; +		c = NULL;  	} else if (intel_try_alt_er(event, orig_idx)) {  		raw_spin_unlock_irqrestore(&era->lock, flags);  		goto again; @@ -1200,11 +1226,23 @@ static struct event_constraint *  intel_shared_regs_constraints(struct cpu_hw_events *cpuc,  			      struct perf_event *event)  { -	struct event_constraint *c = NULL; - -	if (event->hw.extra_reg.idx != EXTRA_REG_NONE) -		c = __intel_shared_reg_get_constraints(cpuc, event); +	struct event_constraint *c = NULL, *d; +	struct hw_perf_event_extra *xreg, *breg; +	xreg = &event->hw.extra_reg; +	if (xreg->idx != EXTRA_REG_NONE) { +		c = __intel_shared_reg_get_constraints(cpuc, event, xreg); +		if (c == &emptyconstraint) +			return c; +	} +	breg = &event->hw.branch_reg; +	if (breg->idx != EXTRA_REG_NONE) { +		d = __intel_shared_reg_get_constraints(cpuc, event, breg); +		if (d == &emptyconstraint) { +			__intel_shared_reg_put_constraints(cpuc, xreg); +			c = d; +		} +	}  	return c;  } @@ -1252,6 +1290,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,  	reg = &event->hw.extra_reg;  	if (reg->idx != EXTRA_REG_NONE)  		__intel_shared_reg_put_constraints(cpuc, reg); + +	reg = &event->hw.branch_reg; +	if (reg->idx != EXTRA_REG_NONE) +		__intel_shared_reg_put_constraints(cpuc, reg);  }  static void intel_put_event_constraints(struct cpu_hw_events *cpuc, @@ -1287,12 +1329,19 @@ static int intel_pmu_hw_config(struct perf_event *event)  		 *  		 * Thereby we gain a PEBS capable cycle counter.  		 
*/ -		u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */ +		u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); +  		alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);  		event->hw.config = alt_config;  	} +	if (intel_pmu_needs_lbr_smpl(event)) { +		ret = intel_pmu_setup_lbr_filter(event); +		if (ret) +			return ret; +	} +  	if (event->attr.type != PERF_TYPE_RAW)  		return 0; @@ -1431,7 +1480,7 @@ static int intel_pmu_cpu_prepare(int cpu)  {  	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); -	if (!x86_pmu.extra_regs) +	if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))  		return NOTIFY_OK;  	cpuc->shared_regs = allocate_shared_regs(cpu); @@ -1453,22 +1502,28 @@ static void intel_pmu_cpu_starting(int cpu)  	 */  	intel_pmu_lbr_reset(); -	if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING)) +	cpuc->lbr_sel = NULL; + +	if (!cpuc->shared_regs)  		return; -	for_each_cpu(i, topology_thread_cpumask(cpu)) { -		struct intel_shared_regs *pc; +	if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { +		for_each_cpu(i, topology_thread_cpumask(cpu)) { +			struct intel_shared_regs *pc; -		pc = per_cpu(cpu_hw_events, i).shared_regs; -		if (pc && pc->core_id == core_id) { -			cpuc->kfree_on_online = cpuc->shared_regs; -			cpuc->shared_regs = pc; -			break; +			pc = per_cpu(cpu_hw_events, i).shared_regs; +			if (pc && pc->core_id == core_id) { +				cpuc->kfree_on_online = cpuc->shared_regs; +				cpuc->shared_regs = pc; +				break; +			}  		} +		cpuc->shared_regs->core_id = core_id; +		cpuc->shared_regs->refcnt++;  	} -	cpuc->shared_regs->core_id = core_id; -	cpuc->shared_regs->refcnt++; +	if (x86_pmu.lbr_sel_map) +		cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];  }  static void intel_pmu_cpu_dying(int cpu) @@ -1486,6 +1541,18 @@ static void intel_pmu_cpu_dying(int cpu)  	fini_debug_store_on_cpu(cpu);  } +static void intel_pmu_flush_branch_stack(void) +{ +	/* +	 * Intel LBR does not tag entries with the +	 * PID of the current task, then we need to +	 * flush it on ctxsw +	 * For now, we simply reset it +	 */ +	if (x86_pmu.lbr_nr) +		intel_pmu_lbr_reset(); +} +  static __initconst const struct x86_pmu intel_pmu = {  	.name			= "Intel",  	.handle_irq		= intel_pmu_handle_irq, @@ -1513,6 +1580,7 @@ static __initconst const struct x86_pmu intel_pmu = {  	.cpu_starting		= intel_pmu_cpu_starting,  	.cpu_dying		= intel_pmu_cpu_dying,  	.guest_get_msrs		= intel_guest_get_msrs, +	.flush_branch_stack	= intel_pmu_flush_branch_stack,  };  static __init void intel_clovertown_quirk(void) @@ -1689,9 +1757,11 @@ __init int intel_pmu_init(void)  		x86_pmu.extra_regs = intel_nehalem_extra_regs;  		/* UOPS_ISSUED.STALLED_CYCLES */ -		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; +		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = +			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);  		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ -		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; +		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = +			X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);  		x86_add_quirk(intel_nehalem_quirk); @@ -1726,9 +1796,11 @@ __init int intel_pmu_init(void)  		x86_pmu.er_flags |= ERF_HAS_RSP_1;  		/* UOPS_ISSUED.STALLED_CYCLES */ -		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; +		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = +			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);  		/* 
UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ -		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; +		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = +			X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);  		pr_cont("Westmere events, ");  		break; @@ -1739,7 +1811,7 @@ __init int intel_pmu_init(void)  		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,  		       sizeof(hw_cache_event_ids)); -		intel_pmu_lbr_init_nhm(); +		intel_pmu_lbr_init_snb();  		x86_pmu.event_constraints = intel_snb_event_constraints;  		x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; @@ -1749,9 +1821,11 @@ __init int intel_pmu_init(void)  		x86_pmu.er_flags |= ERF_NO_HT_SHARING;  		/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ -		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; +		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = +			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);  		/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ -		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1; +		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = +			X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);  		pr_cont("SandyBridge events, ");  		break; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 73da6b64f5b..7f64df19e7d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -3,6 +3,7 @@  #include <linux/slab.h>  #include <asm/perf_event.h> +#include <asm/insn.h>  #include "perf_event.h" @@ -439,10 +440,6 @@ void intel_pmu_pebs_enable(struct perf_event *event)  	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;  	cpuc->pebs_enabled |= 1ULL << hwc->idx; -	WARN_ON_ONCE(cpuc->enabled); - -	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) -		intel_pmu_lbr_enable(event);  }  void intel_pmu_pebs_disable(struct perf_event *event) @@ -455,9 +452,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)  		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);  	hwc->config |= ARCH_PERFMON_EVENTSEL_INT; - -	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) -		intel_pmu_lbr_disable(event);  }  void intel_pmu_pebs_enable_all(void) @@ -476,17 +470,6 @@ void intel_pmu_pebs_disable_all(void)  		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);  } -#include <asm/insn.h> - -static inline bool kernel_ip(unsigned long ip) -{ -#ifdef CONFIG_X86_32 -	return ip > PAGE_OFFSET; -#else -	return (long)ip < 0; -#endif -} -  static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  {  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -573,6 +556,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,  	 * both formats and we don't use the other fields in this  	 * routine.  	 
*/ +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	struct pebs_record_core *pebs = __pebs;  	struct perf_sample_data data;  	struct pt_regs regs; @@ -603,6 +587,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event,  	else  		regs.flags &= ~PERF_EFLAGS_EXACT; +	if (has_branch_stack(event)) +		data.br_stack = &cpuc->lbr_stack; +  	if (perf_event_overflow(event, &data, ®s))  		x86_pmu_stop(event, 0);  } diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 3fab3de3ce9..520b4265fcd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -3,6 +3,7 @@  #include <asm/perf_event.h>  #include <asm/msr.h> +#include <asm/insn.h>  #include "perf_event.h" @@ -14,6 +15,100 @@ enum {  };  /* + * Intel LBR_SELECT bits + * Intel Vol3a, April 2011, Section 16.7 Table 16-10 + * + * Hardware branch filter (not available on all CPUs) + */ +#define LBR_KERNEL_BIT		0 /* do not capture at ring0 */ +#define LBR_USER_BIT		1 /* do not capture at ring > 0 */ +#define LBR_JCC_BIT		2 /* do not capture conditional branches */ +#define LBR_REL_CALL_BIT	3 /* do not capture relative calls */ +#define LBR_IND_CALL_BIT	4 /* do not capture indirect calls */ +#define LBR_RETURN_BIT		5 /* do not capture near returns */ +#define LBR_IND_JMP_BIT		6 /* do not capture indirect jumps */ +#define LBR_REL_JMP_BIT		7 /* do not capture relative jumps */ +#define LBR_FAR_BIT		8 /* do not capture far branches */ + +#define LBR_KERNEL	(1 << LBR_KERNEL_BIT) +#define LBR_USER	(1 << LBR_USER_BIT) +#define LBR_JCC		(1 << LBR_JCC_BIT) +#define LBR_REL_CALL	(1 << LBR_REL_CALL_BIT) +#define LBR_IND_CALL	(1 << LBR_IND_CALL_BIT) +#define LBR_RETURN	(1 << LBR_RETURN_BIT) +#define LBR_REL_JMP	(1 << LBR_REL_JMP_BIT) +#define LBR_IND_JMP	(1 << LBR_IND_JMP_BIT) +#define LBR_FAR		(1 << LBR_FAR_BIT) + +#define LBR_PLM (LBR_KERNEL | LBR_USER) + +#define LBR_SEL_MASK	0x1ff	/* valid bits in LBR_SELECT */ +#define LBR_NOT_SUPP	-1	/* LBR filter not supported */ +#define LBR_IGN		0	/* ignored */ + +#define LBR_ANY		 \ +	(LBR_JCC	|\ +	 LBR_REL_CALL	|\ +	 LBR_IND_CALL	|\ +	 LBR_RETURN	|\ +	 LBR_REL_JMP	|\ +	 LBR_IND_JMP	|\ +	 LBR_FAR) + +#define LBR_FROM_FLAG_MISPRED  (1ULL << 63) + +#define for_each_branch_sample_type(x) \ +	for ((x) = PERF_SAMPLE_BRANCH_USER; \ +	     (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) + +/* + * x86control flow change classification + * x86control flow changes include branches, interrupts, traps, faults + */ +enum { +	X86_BR_NONE     = 0,      /* unknown */ + +	X86_BR_USER     = 1 << 0, /* branch target is user */ +	X86_BR_KERNEL   = 1 << 1, /* branch target is kernel */ + +	X86_BR_CALL     = 1 << 2, /* call */ +	X86_BR_RET      = 1 << 3, /* return */ +	X86_BR_SYSCALL  = 1 << 4, /* syscall */ +	X86_BR_SYSRET   = 1 << 5, /* syscall return */ +	X86_BR_INT      = 1 << 6, /* sw interrupt */ +	X86_BR_IRET     = 1 << 7, /* return from interrupt */ +	X86_BR_JCC      = 1 << 8, /* conditional */ +	X86_BR_JMP      = 1 << 9, /* jump */ +	X86_BR_IRQ      = 1 << 10,/* hw interrupt or trap or fault */ +	X86_BR_IND_CALL = 1 << 11,/* indirect calls */ +}; + +#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) + +#define X86_BR_ANY       \ +	(X86_BR_CALL    |\ +	 X86_BR_RET     |\ +	 X86_BR_SYSCALL |\ +	 X86_BR_SYSRET  |\ +	 X86_BR_INT     |\ +	 X86_BR_IRET    |\ +	 X86_BR_JCC     |\ +	 X86_BR_JMP	 |\ +	 X86_BR_IRQ	 |\ +	 X86_BR_IND_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL		 \ +	
(X86_BR_CALL		|\ +	 X86_BR_IND_CALL	|\ +	 X86_BR_SYSCALL		|\ +	 X86_BR_IRQ		|\ +	 X86_BR_INT) + +static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); + +/*   * We only support LBR implementations that have FREEZE_LBRS_ON_PMI   * otherwise it becomes near impossible to get a reliable stack.   */ @@ -21,6 +116,10 @@ enum {  static void __intel_pmu_lbr_enable(void)  {  	u64 debugctl; +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + +	if (cpuc->lbr_sel) +		wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);  	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);  	debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); @@ -72,17 +171,15 @@ void intel_pmu_lbr_enable(struct perf_event *event)  	if (!x86_pmu.lbr_nr)  		return; -	WARN_ON_ONCE(cpuc->enabled); -  	/*  	 * Reset the LBR stack if we changed task context to  	 * avoid data leaks.  	 */ -  	if (event->ctx->task && cpuc->lbr_context != event->ctx) {  		intel_pmu_lbr_reset();  		cpuc->lbr_context = event->ctx;  	} +	cpuc->br_sel = event->hw.branch_reg.reg;  	cpuc->lbr_users++;  } @@ -97,8 +194,11 @@ void intel_pmu_lbr_disable(struct perf_event *event)  	cpuc->lbr_users--;  	WARN_ON_ONCE(cpuc->lbr_users < 0); -	if (cpuc->enabled && !cpuc->lbr_users) +	if (cpuc->enabled && !cpuc->lbr_users) {  		__intel_pmu_lbr_disable(); +		/* avoid stale pointer */ +		cpuc->lbr_context = NULL; +	}  }  void intel_pmu_lbr_enable_all(void) @@ -117,6 +217,9 @@ void intel_pmu_lbr_disable_all(void)  		__intel_pmu_lbr_disable();  } +/* + * TOS = most recently recorded branch + */  static inline u64 intel_pmu_lbr_tos(void)  {  	u64 tos; @@ -144,15 +247,15 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)  		rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); -		cpuc->lbr_entries[i].from  = msr_lastbranch.from; -		cpuc->lbr_entries[i].to    = msr_lastbranch.to; -		cpuc->lbr_entries[i].flags = 0; +		cpuc->lbr_entries[i].from	= msr_lastbranch.from; +		cpuc->lbr_entries[i].to		= msr_lastbranch.to; +		cpuc->lbr_entries[i].mispred	= 0; +		cpuc->lbr_entries[i].predicted	= 0; +		cpuc->lbr_entries[i].reserved	= 0;  	}  	cpuc->lbr_stack.nr = i;  } -#define LBR_FROM_FLAG_MISPRED  (1ULL << 63) -  /*   * Due to lack of segmentation in Linux the effective address (offset)   * is the same as the linear address, allowing us to merge the LIP and EIP @@ -167,19 +270,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)  	for (i = 0; i < x86_pmu.lbr_nr; i++) {  		unsigned long lbr_idx = (tos - i) & mask; -		u64 from, to, flags = 0; +		u64 from, to, mis = 0, pred = 0;  		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);  		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);  		if (lbr_format == LBR_FORMAT_EIP_FLAGS) { -			flags = !!(from & LBR_FROM_FLAG_MISPRED); +			mis = !!(from & LBR_FROM_FLAG_MISPRED); +			pred = !mis;  			from = (u64)((((s64)from) << 1) >> 1);  		} -		cpuc->lbr_entries[i].from  = from; -		cpuc->lbr_entries[i].to    = to; -		cpuc->lbr_entries[i].flags = flags; +		cpuc->lbr_entries[i].from	= from; +		cpuc->lbr_entries[i].to		= to; +		cpuc->lbr_entries[i].mispred	= mis; +		cpuc->lbr_entries[i].predicted	= pred; +		cpuc->lbr_entries[i].reserved	= 0;  	}  	cpuc->lbr_stack.nr = i;  } @@ -195,28 +301,404 @@ void intel_pmu_lbr_read(void)  		intel_pmu_lbr_read_32(cpuc);  	else  		intel_pmu_lbr_read_64(cpuc); + +	intel_pmu_lbr_filter(cpuc); +} + +/* + * SW filter is used: + * - in case there is no HW filter + * - in case the HW filter has errata or limitations + */ +static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +{ +	u64 
br_type = event->attr.branch_sample_type; +	int mask = 0; + +	if (br_type & PERF_SAMPLE_BRANCH_USER) +		mask |= X86_BR_USER; + +	if (br_type & PERF_SAMPLE_BRANCH_KERNEL) +		mask |= X86_BR_KERNEL; + +	/* we ignore BRANCH_HV here */ + +	if (br_type & PERF_SAMPLE_BRANCH_ANY) +		mask |= X86_BR_ANY; + +	if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) +		mask |= X86_BR_ANY_CALL; + +	if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) +		mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + +	if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) +		mask |= X86_BR_IND_CALL; +	/* +	 * stash actual user request into reg, it may +	 * be used by fixup code for some CPU +	 */ +	event->hw.branch_reg.reg = mask; +} + +/* + * setup the HW LBR filter + * Used only when available, may not be enough to disambiguate + * all branches, may need the help of the SW filter + */ +static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) +{ +	struct hw_perf_event_extra *reg; +	u64 br_type = event->attr.branch_sample_type; +	u64 mask = 0, m; +	u64 v; + +	for_each_branch_sample_type(m) { +		if (!(br_type & m)) +			continue; + +		v = x86_pmu.lbr_sel_map[m]; +		if (v == LBR_NOT_SUPP) +			return -EOPNOTSUPP; + +		if (v != LBR_IGN) +			mask |= v; +	} +	reg = &event->hw.branch_reg; +	reg->idx = EXTRA_REG_LBR; + +	/* LBR_SELECT operates in suppress mode so invert mask */ +	reg->config = ~mask & x86_pmu.lbr_sel_mask; + +	return 0; +} + +int intel_pmu_setup_lbr_filter(struct perf_event *event) +{ +	int ret = 0; + +	/* +	 * no LBR on this PMU +	 */ +	if (!x86_pmu.lbr_nr) +		return -EOPNOTSUPP; + +	/* +	 * setup SW LBR filter +	 */ +	intel_pmu_setup_sw_lbr_filter(event); + +	/* +	 * setup HW LBR filter, if any +	 */ +	if (x86_pmu.lbr_sel_map) +		ret = intel_pmu_setup_hw_lbr_filter(event); + +	return ret; +} + +/* + * return the type of control flow change at address "from" + * intruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + */ +static int branch_type(unsigned long from, unsigned long to) +{ +	struct insn insn; +	void *addr; +	int bytes, size = MAX_INSN_SIZE; +	int ret = X86_BR_NONE; +	int ext, to_plm, from_plm; +	u8 buf[MAX_INSN_SIZE]; +	int is64 = 0; + +	to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; +	from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + +	/* +	 * maybe zero if lbr did not fill up after a reset by the time +	 * we get a PMU interrupt +	 */ +	if (from == 0 || to == 0) +		return X86_BR_NONE; + +	if (from_plm == X86_BR_USER) { +		/* +		 * can happen if measuring at the user level only +		 * and we interrupt in a kernel thread, e.g., idle. 
+		 */ +		if (!current->mm) +			return X86_BR_NONE; + +		/* may fail if text not present */ +		bytes = copy_from_user_nmi(buf, (void __user *)from, size); +		if (bytes != size) +			return X86_BR_NONE; + +		addr = buf; +	} else +		addr = (void *)from; + +	/* +	 * decoder needs to know the ABI especially +	 * on 64-bit systems running 32-bit apps +	 */ +#ifdef CONFIG_X86_64 +	is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); +#endif +	insn_init(&insn, addr, is64); +	insn_get_opcode(&insn); + +	switch (insn.opcode.bytes[0]) { +	case 0xf: +		switch (insn.opcode.bytes[1]) { +		case 0x05: /* syscall */ +		case 0x34: /* sysenter */ +			ret = X86_BR_SYSCALL; +			break; +		case 0x07: /* sysret */ +		case 0x35: /* sysexit */ +			ret = X86_BR_SYSRET; +			break; +		case 0x80 ... 0x8f: /* conditional */ +			ret = X86_BR_JCC; +			break; +		default: +			ret = X86_BR_NONE; +		} +		break; +	case 0x70 ... 0x7f: /* conditional */ +		ret = X86_BR_JCC; +		break; +	case 0xc2: /* near ret */ +	case 0xc3: /* near ret */ +	case 0xca: /* far ret */ +	case 0xcb: /* far ret */ +		ret = X86_BR_RET; +		break; +	case 0xcf: /* iret */ +		ret = X86_BR_IRET; +		break; +	case 0xcc ... 0xce: /* int */ +		ret = X86_BR_INT; +		break; +	case 0xe8: /* call near rel */ +	case 0x9a: /* call far absolute */ +		ret = X86_BR_CALL; +		break; +	case 0xe0 ... 0xe3: /* loop jmp */ +		ret = X86_BR_JCC; +		break; +	case 0xe9 ... 0xeb: /* jmp */ +		ret = X86_BR_JMP; +		break; +	case 0xff: /* call near absolute, call far absolute ind */ +		insn_get_modrm(&insn); +		ext = (insn.modrm.bytes[0] >> 3) & 0x7; +		switch (ext) { +		case 2: /* near ind call */ +		case 3: /* far ind call */ +			ret = X86_BR_IND_CALL; +			break; +		case 4: +		case 5: +			ret = X86_BR_JMP; +			break; +		} +		break; +	default: +		ret = X86_BR_NONE; +	} +	/* +	 * interrupts, traps, faults (and thus ring transition) may +	 * occur on any instructions. Thus, to classify them correctly, +	 * we need to first look at the from and to priv levels. If they +	 * are different and to is in the kernel, then it indicates +	 * a ring transition. If the from instruction is not a ring +	 * transition instr (syscall, systenter, int), then it means +	 * it was a irq, trap or fault. +	 * +	 * we have no way of detecting kernel to kernel faults. +	 */ +	if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL +	    && ret != X86_BR_SYSCALL && ret != X86_BR_INT) +		ret = X86_BR_IRQ; + +	/* +	 * branch priv level determined by target as +	 * is done by HW when LBR_SELECT is implemented +	 */ +	if (ret != X86_BR_NONE) +		ret |= to_plm; + +	return ret;  } +/* + * implement actual branch filter based on user demand. + * Hardware may not exactly satisfy that request, thus + * we need to inspect opcodes. Mismatched branches are + * discarded. Therefore, the number of branches returned + * in PERF_SAMPLE_BRANCH_STACK sample may vary. 
+ */ +static void +intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) +{ +	u64 from, to; +	int br_sel = cpuc->br_sel; +	int i, j, type; +	bool compress = false; + +	/* if sampling all branches, then nothing to filter */ +	if ((br_sel & X86_BR_ALL) == X86_BR_ALL) +		return; + +	for (i = 0; i < cpuc->lbr_stack.nr; i++) { + +		from = cpuc->lbr_entries[i].from; +		to = cpuc->lbr_entries[i].to; + +		type = branch_type(from, to); + +		/* if type does not correspond, then discard */ +		if (type == X86_BR_NONE || (br_sel & type) != type) { +			cpuc->lbr_entries[i].from = 0; +			compress = true; +		} +	} + +	if (!compress) +		return; + +	/* remove all entries with from=0 */ +	for (i = 0; i < cpuc->lbr_stack.nr; ) { +		if (!cpuc->lbr_entries[i].from) { +			j = i; +			while (++j < cpuc->lbr_stack.nr) +				cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; +			cpuc->lbr_stack.nr--; +			if (!cpuc->lbr_entries[i].from) +				continue; +		} +		i++; +	} +} + +/* + * Map interface branch filters onto LBR filters + */ +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { +	[PERF_SAMPLE_BRANCH_ANY]	= LBR_ANY, +	[PERF_SAMPLE_BRANCH_USER]	= LBR_USER, +	[PERF_SAMPLE_BRANCH_KERNEL]	= LBR_KERNEL, +	[PERF_SAMPLE_BRANCH_HV]		= LBR_IGN, +	[PERF_SAMPLE_BRANCH_ANY_RETURN]	= LBR_RETURN | LBR_REL_JMP +					| LBR_IND_JMP | LBR_FAR, +	/* +	 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches +	 */ +	[PERF_SAMPLE_BRANCH_ANY_CALL] = +	 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, +	/* +	 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL +	 */ +	[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, +}; + +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { +	[PERF_SAMPLE_BRANCH_ANY]	= LBR_ANY, +	[PERF_SAMPLE_BRANCH_USER]	= LBR_USER, +	[PERF_SAMPLE_BRANCH_KERNEL]	= LBR_KERNEL, +	[PERF_SAMPLE_BRANCH_HV]		= LBR_IGN, +	[PERF_SAMPLE_BRANCH_ANY_RETURN]	= LBR_RETURN | LBR_FAR, +	[PERF_SAMPLE_BRANCH_ANY_CALL]	= LBR_REL_CALL | LBR_IND_CALL +					| LBR_FAR, +	[PERF_SAMPLE_BRANCH_IND_CALL]	= LBR_IND_CALL, +}; + +/* core */  void intel_pmu_lbr_init_core(void)  {  	x86_pmu.lbr_nr     = 4; -	x86_pmu.lbr_tos    = 0x01c9; -	x86_pmu.lbr_from   = 0x40; -	x86_pmu.lbr_to     = 0x60; +	x86_pmu.lbr_tos    = MSR_LBR_TOS; +	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM; +	x86_pmu.lbr_to     = MSR_LBR_CORE_TO; + +	/* +	 * SW branch filter usage: +	 * - compensate for lack of HW filter +	 */ +	pr_cont("4-deep LBR, ");  } +/* nehalem/westmere */  void intel_pmu_lbr_init_nhm(void)  {  	x86_pmu.lbr_nr     = 16; -	x86_pmu.lbr_tos    = 0x01c9; -	x86_pmu.lbr_from   = 0x680; -	x86_pmu.lbr_to     = 0x6c0; +	x86_pmu.lbr_tos    = MSR_LBR_TOS; +	x86_pmu.lbr_from   = MSR_LBR_NHM_FROM; +	x86_pmu.lbr_to     = MSR_LBR_NHM_TO; + +	x86_pmu.lbr_sel_mask = LBR_SEL_MASK; +	x86_pmu.lbr_sel_map  = nhm_lbr_sel_map; + +	/* +	 * SW branch filter usage: +	 * - workaround LBR_SEL errata (see above) +	 * - support syscall, sysret capture. +	 *   That requires LBR_FAR but that means far +	 *   jmp need to be filtered out +	 */ +	pr_cont("16-deep LBR, "); +} + +/* sandy bridge */ +void intel_pmu_lbr_init_snb(void) +{ +	x86_pmu.lbr_nr	 = 16; +	x86_pmu.lbr_tos	 = MSR_LBR_TOS; +	x86_pmu.lbr_from = MSR_LBR_NHM_FROM; +	x86_pmu.lbr_to   = MSR_LBR_NHM_TO; + +	x86_pmu.lbr_sel_mask = LBR_SEL_MASK; +	x86_pmu.lbr_sel_map  = snb_lbr_sel_map; + +	/* +	 * SW branch filter usage: +	 * - support syscall, sysret capture. 
+	 *   That requires LBR_FAR but that means far +	 *   jmp need to be filtered out +	 */ +	pr_cont("16-deep LBR, ");  } +/* atom */  void intel_pmu_lbr_init_atom(void)  { +	/* +	 * only models starting at stepping 10 seems +	 * to have an operational LBR which can freeze +	 * on PMU interrupt +	 */ +	if (boot_cpu_data.x86_mask < 10) { +		pr_cont("LBR disabled due to erratum"); +		return; +	} +  	x86_pmu.lbr_nr	   = 8; -	x86_pmu.lbr_tos    = 0x01c9; -	x86_pmu.lbr_from   = 0x40; -	x86_pmu.lbr_to     = 0x60; +	x86_pmu.lbr_tos    = MSR_LBR_TOS; +	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM; +	x86_pmu.lbr_to     = MSR_LBR_CORE_TO; + +	/* +	 * SW branch filter usage: +	 * - compensate for lack of HW filter +	 */ +	pr_cont("8-deep LBR, ");  } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index c7f64e6f537..addf9e82a7f 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,6 +40,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)  		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },  		{ X86_FEATURE_XSAVEOPT,		CR_EAX,	0, 0x0000000d, 1 },  		{ X86_FEATURE_CPB,		CR_EDX, 9, 0x80000007, 0 }, +		{ X86_FEATURE_HW_PSTATE,	CR_EDX, 7, 0x80000007, 0 },  		{ X86_FEATURE_NPT,		CR_EDX, 0, 0x8000000a, 0 },  		{ X86_FEATURE_LBRV,		CR_EDX, 1, 0x8000000a, 0 },  		{ X86_FEATURE_SVML,		CR_EDX, 2, 0x8000000a, 0 }, diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 642f75a68cd..11891ca7b71 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -62,16 +62,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,  	if (!userbuf) {  		memcpy(buf, (vaddr + offset), csize); -		kunmap_atomic(vaddr, KM_PTE0); +		kunmap_atomic(vaddr);  	} else {  		if (!kdump_buf_page) {  			printk(KERN_WARNING "Kdump: Kdump buffer page not"  				" allocated\n"); -			kunmap_atomic(vaddr, KM_PTE0); +			kunmap_atomic(vaddr);  			return -EFAULT;  		}  		copy_page(kdump_buf_page, vaddr); -		kunmap_atomic(vaddr, KM_PTE0); +		kunmap_atomic(vaddr);  		if (copy_to_user(buf, (kdump_buf_page + offset), csize))  			return -EFAULT;  	} diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 52821799a70..3ae2ced4a87 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -4,6 +4,7 @@  #include <linux/bootmem.h>  #include <linux/export.h>  #include <linux/io.h> +#include <linux/irqdomain.h>  #include <linux/interrupt.h>  #include <linux/list.h>  #include <linux/of.h> @@ -17,64 +18,14 @@  #include <linux/initrd.h>  #include <asm/hpet.h> -#include <asm/irq_controller.h>  #include <asm/apic.h>  #include <asm/pci_x86.h>  __initdata u64 initial_dtb;  char __initdata cmd_line[COMMAND_LINE_SIZE]; -static LIST_HEAD(irq_domains); -static DEFINE_RAW_SPINLOCK(big_irq_lock);  int __initdata of_ioapic; -#ifdef CONFIG_X86_IO_APIC -static void add_interrupt_host(struct irq_domain *ih) -{ -	unsigned long flags; - -	raw_spin_lock_irqsave(&big_irq_lock, flags); -	list_add(&ih->l, &irq_domains); -	raw_spin_unlock_irqrestore(&big_irq_lock, flags); -} -#endif - -static struct irq_domain *get_ih_from_node(struct device_node *controller) -{ -	struct irq_domain *ih, *found = NULL; -	unsigned long flags; - -	raw_spin_lock_irqsave(&big_irq_lock, flags); -	list_for_each_entry(ih, &irq_domains, l) { -		if (ih->controller ==  controller) { -			found = ih; -			break; -		} -	} -	raw_spin_unlock_irqrestore(&big_irq_lock, flags); -	return found; -} - -unsigned int irq_create_of_mapping(struct 
device_node *controller, -				   const u32 *intspec, unsigned int intsize) -{ -	struct irq_domain *ih; -	u32 virq, type; -	int ret; - -	ih = get_ih_from_node(controller); -	if (!ih) -		return 0; -	ret = ih->xlate(ih, intspec, intsize, &virq, &type); -	if (ret) -		return 0; -	if (type == IRQ_TYPE_NONE) -		return virq; -	irq_set_irq_type(virq, type); -	return virq; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); -  unsigned long pci_address_to_pio(phys_addr_t address)  {  	/* @@ -354,36 +305,43 @@ static struct of_ioapic_type of_ioapic_type[] =  	},  }; -static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, -			u32 *out_hwirq, u32 *out_type) +static int ioapic_xlate(struct irq_domain *domain, +			struct device_node *controller, +			const u32 *intspec, u32 intsize, +			irq_hw_number_t *out_hwirq, u32 *out_type)  { -	struct mp_ioapic_gsi *gsi_cfg;  	struct io_apic_irq_attr attr;  	struct of_ioapic_type *it; -	u32 line, idx, type; +	u32 line, idx; +	int rc; -	if (intsize < 2) +	if (WARN_ON(intsize < 2))  		return -EINVAL; -	line = *intspec; -	idx = (u32) id->priv; -	gsi_cfg = mp_ioapic_gsi_routing(idx); -	*out_hwirq = line + gsi_cfg->gsi_base; - -	intspec++; -	type = *intspec; +	line = intspec[0]; -	if (type >= ARRAY_SIZE(of_ioapic_type)) +	if (intspec[1] >= ARRAY_SIZE(of_ioapic_type))  		return -EINVAL; -	it = of_ioapic_type + type; -	*out_type = it->out_type; +	it = &of_ioapic_type[intspec[1]]; +	idx = (u32) domain->host_data;  	set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); -	return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); +	rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line), +					cpu_to_node(0), &attr); +	if (rc) +		return rc; + +	*out_hwirq = line; +	*out_type = it->out_type; +	return 0;  } +const struct irq_domain_ops ioapic_irq_domain_ops = { +	.xlate = ioapic_xlate, +}; +  static void __init ioapic_add_ofnode(struct device_node *np)  {  	struct resource r; @@ -399,13 +357,14 @@ static void __init ioapic_add_ofnode(struct device_node *np)  	for (i = 0; i < nr_ioapics; i++) {  		if (r.start == mpc_ioapic_addr(i)) {  			struct irq_domain *id; +			struct mp_ioapic_gsi *gsi_cfg; + +			gsi_cfg = mp_ioapic_gsi_routing(i); -			id = kzalloc(sizeof(*id), GFP_KERNEL); +			id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0, +						   &ioapic_irq_domain_ops, +						   (void*)i);  			BUG_ON(!id); -			id->controller = np; -			id->xlate = ioapic_xlate; -			id->priv = (void *)i; -			add_interrupt_host(id);  			return;  		}  	} diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1aae78f775f..4025fe4f928 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -252,7 +252,8 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)  	unsigned short ss;  	unsigned long sp;  #endif -	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +	printk(KERN_DEFAULT +	       "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);  #ifdef CONFIG_PREEMPT  	printk("PREEMPT ");  #endif diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c99f9ed013d..88ec9129271 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -87,7 +87,7 @@ void show_registers(struct pt_regs *regs)  	int i;  	print_modules(); -	__show_regs(regs, 0); +	__show_regs(regs, !user_mode_vm(regs));  	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",  		TASK_COMM_LEN, current->comm, task_pid_nr(current), diff --git 
a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6d728d9284b..17107bd6e1f 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -129,7 +129,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  	if (!stack) {  		if (regs)  			stack = (unsigned long *)regs->sp; -		else if (task && task != current) +		else if (task != current)  			stack = (unsigned long *)task->thread.sp;  		else  			stack = &dummy; @@ -269,11 +269,11 @@ void show_registers(struct pt_regs *regs)  		unsigned char c;  		u8 *ip; -		printk(KERN_EMERG "Stack:\n"); +		printk(KERN_DEFAULT "Stack:\n");  		show_stack_log_lvl(NULL, regs, (unsigned long *)sp, -				   0, KERN_EMERG); +				   0, KERN_DEFAULT); -		printk(KERN_EMERG "Code: "); +		printk(KERN_DEFAULT "Code: ");  		ip = (u8 *)regs->ip - code_prologue;  		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 79d97e68f04..7b784f4ef1e 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -98,12 +98,6 @@  #endif  .endm -#ifdef CONFIG_VM86 -#define resume_userspace_sig	check_userspace -#else -#define resume_userspace_sig	resume_userspace -#endif -  /*   * User gs save/restore   * @@ -327,10 +321,19 @@ ret_from_exception:  	preempt_stop(CLBR_ANY)  ret_from_intr:  	GET_THREAD_INFO(%ebp) -check_userspace: +resume_userspace_sig: +#ifdef CONFIG_VM86  	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS  	movb PT_CS(%esp), %al  	andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else +	/* +	 * We can be coming here from a syscall done in the kernel space, +	 * e.g. a failed kernel_execve(). +	 */ +	movl PT_CS(%esp), %eax +	andl $SEGMENT_RPL_MASK, %eax +#endif  	cmpl $USER_RPL, %eax  	jb resume_kernel		# not returning to v8086 or userspace diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3fe8239fd8f..734ebd1d3ca 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -320,7 +320,7 @@ ENDPROC(native_usergs_sysret64)  	movq %rsp, %rsi  	leaq -RBP(%rsp),%rdi	/* arg1 for handler */ -	testl $3, CS(%rdi) +	testl $3, CS-RBP(%rsi)  	je 1f  	SWAPGS  	/* @@ -330,11 +330,10 @@ ENDPROC(native_usergs_sysret64)  	 * moving irq_enter into assembly, which would be too much work)  	 */  1:	incl PER_CPU_VAR(irq_count) -	jne 2f -	mov PER_CPU_VAR(irq_stack_ptr),%rsp +	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp  	CFI_DEF_CFA_REGISTER	rsi -2:	/* Store previous stack value */ +	/* Store previous stack value */  	pushq %rsi  	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \  			0x77 /* DW_OP_breg7 */, 0, \ @@ -813,7 +812,7 @@ ret_from_intr:  	/* Restore saved previous stack */  	popq %rsi -	CFI_DEF_CFA_REGISTER	rsi +	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */  	leaq ARGOFFSET-RBP(%rsi), %rsp  	CFI_DEF_CFA_REGISTER	rsp  	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET @@ -1530,12 +1529,20 @@ ENTRY(nmi)  	/* Use %rdx as out temp variable throughout */  	pushq_cfi %rdx +	CFI_REL_OFFSET rdx, 0 + +	/* +	 * If %cs was not the kernel segment, then the NMI triggered in user +	 * space, which means it is definitely not nested. +	 */ +	cmpl $__KERNEL_CS, 16(%rsp) +	jne first_nmi  	/*  	 * Check the special variable on the stack to see if NMIs are  	 * executing.  	 
*/ -	cmp $1, -8(%rsp) +	cmpl $1, -8(%rsp)  	je nested_nmi  	/* @@ -1547,6 +1554,7 @@ ENTRY(nmi)  	 */  	lea 6*8(%rsp), %rdx  	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi +	CFI_REMEMBER_STATE  nested_nmi:  	/* @@ -1578,10 +1586,12 @@ nested_nmi:  nested_nmi_out:  	popq_cfi %rdx +	CFI_RESTORE rdx  	/* No need to check faults here */  	INTERRUPT_RETURN +	CFI_RESTORE_STATE  first_nmi:  	/*  	 * Because nested NMIs will use the pushed location that we @@ -1613,10 +1623,15 @@ first_nmi:  	 * | pt_regs                 |  	 * +-------------------------+  	 * -	 * The saved RIP is used to fix up the copied RIP that a nested -	 * NMI may zero out. The original stack frame and the temp storage +	 * The saved stack frame is used to fix up the copied stack frame +	 * that a nested NMI may change to make the interrupted NMI iret jump +	 * to the repeat_nmi. The original stack frame and the temp storage  	 * is also used by nested NMIs and can not be trusted on exit.  	 */ +	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */ +	movq (%rsp), %rdx +	CFI_RESTORE rdx +  	/* Set the NMI executing variable on the stack. */  	pushq_cfi $1 @@ -1624,22 +1639,39 @@ first_nmi:  	.rept 5  	pushq_cfi 6*8(%rsp)  	.endr +	CFI_DEF_CFA_OFFSET SS+8-RIP + +	/* Everything up to here is safe from nested NMIs */ + +	/* +	 * If there was a nested NMI, the first NMI's iret will return +	 * here. But NMIs are still enabled and we can take another +	 * nested NMI. The nested NMI checks the interrupted RIP to see +	 * if it is between repeat_nmi and end_repeat_nmi, and if so +	 * it will just return, as we are about to repeat an NMI anyway. +	 * This makes it safe to copy to the stack frame that a nested +	 * NMI will update. +	 */ +repeat_nmi: +	/* +	 * Update the stack variable to say we are still in NMI (the update +	 * is benign for the non-repeat case, where 1 was pushed just above +	 * to this very stack slot). +	 */ +	movq $1, 5*8(%rsp)  	/* Make another copy, this one may be modified by nested NMIs */  	.rept 5  	pushq_cfi 4*8(%rsp)  	.endr - -	/* Do not pop rdx, nested NMIs will corrupt it */ -	movq 11*8(%rsp), %rdx +	CFI_DEF_CFA_OFFSET SS+8-RIP +end_repeat_nmi:  	/*  	 * Everything below this point can be preempted by a nested -	 * NMI if the first NMI took an exception. Repeated NMIs -	 * caused by an exception and nested NMI will start here, and -	 * can still be preempted by another NMI. +	 * NMI if the first NMI took an exception and reset our iret stack +	 * so that we repeat another NMI.  	 */ -restart_nmi:  	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */  	subq $ORIG_RAX-R15, %rsp  	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1668,26 +1700,6 @@ nmi_restore:  	CFI_ENDPROC  END(nmi) -	/* -	 * If an NMI hit an iret because of an exception or breakpoint, -	 * it can lose its NMI context, and a nested NMI may come in. -	 * In that case, the nested NMI will change the preempted NMI's -	 * stack to jump to here when it does the final iret. 
-	 */ -repeat_nmi: -	INTR_FRAME -	/* Update the stack variable to say we are still in NMI */ -	movq $1, 5*8(%rsp) - -	/* copy the saved stack back to copy stack */ -	.rept 5 -	pushq_cfi 4*8(%rsp) -	.endr - -	jmp restart_nmi -	CFI_ENDPROC -end_repeat_nmi: -  ENTRY(ignore_sysret)  	CFI_STARTPROC  	mov $-ENOSYS,%eax diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 739d8598f78..7734bcbb5a3 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -16,6 +16,7 @@  #include <asm/uaccess.h>  #include <asm/ptrace.h>  #include <asm/i387.h> +#include <asm/fpu-internal.h>  #include <asm/user.h>  #ifdef CONFIG_X86_64 @@ -32,6 +33,86 @@  # define user32_fxsr_struct	user_fxsr_struct  #endif +/* + * Were we in an interrupt that interrupted kernel mode? + * + * We can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: the thread must not have fpu (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + */ +static inline bool interrupted_kernel_fpu_idle(void) +{ +	return !__thread_has_fpu(current) && +		(read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. + */ +static inline bool interrupted_user_mode(void) +{ +	struct pt_regs *regs = get_irq_regs(); +	return regs && user_mode_vm(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. + */ +bool irq_fpu_usable(void) +{ +	return !in_interrupt() || +		interrupted_user_mode() || +		interrupted_kernel_fpu_idle(); +} +EXPORT_SYMBOL(irq_fpu_usable); + +void kernel_fpu_begin(void) +{ +	struct task_struct *me = current; + +	WARN_ON_ONCE(!irq_fpu_usable()); +	preempt_disable(); +	if (__thread_has_fpu(me)) { +		__save_init_fpu(me); +		__thread_clear_has_fpu(me); +		/* We do 'stts()' in kernel_fpu_end() */ +	} else { +		percpu_write(fpu_owner_task, NULL); +		clts(); +	} +} +EXPORT_SYMBOL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ +	stts(); +	preempt_enable(); +} +EXPORT_SYMBOL(kernel_fpu_end); + +void unlazy_fpu(struct task_struct *tsk) +{ +	preempt_disable(); +	if (__thread_has_fpu(tsk)) { +		__save_init_fpu(tsk); +		__thread_fpu_end(tsk); +	} else +		tsk->fpu_counter = 0; +	preempt_enable(); +} +EXPORT_SYMBOL(unlazy_fpu); +  #ifdef CONFIG_MATH_EMULATION  # define HAVE_HWFP		(boot_cpu_data.hard_math)  #else @@ -44,7 +125,7 @@ EXPORT_SYMBOL_GPL(xstate_size);  unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);  static struct i387_fxsave_struct fx_scratch __cpuinitdata; -void __cpuinit mxcsr_feature_mask_init(void) +static void __cpuinit mxcsr_feature_mask_init(void)  {  	unsigned long mask = 0; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 40fc86161d9..58b7f27cb3e 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -100,13 +100,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)  	irqctx->tinfo.task = curctx->tinfo.task;  	irqctx->tinfo.previous_esp = current_stack_pointer; -	/* -	 * Copy the softirq bits in preempt_count so that the -	 * softirq checks work in the hardirq context. 
-	 */ -	irqctx->tinfo.preempt_count = -		(irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | -		(curctx->tinfo.preempt_count & SOFTIRQ_MASK); +	/* Copy the preempt_count so that the [soft]irq checks work. */ +	irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;  	if (unlikely(overflow))  		call_on_stack(print_stack_overflow, isp); @@ -196,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)  	if (unlikely(!desc))  		return false; -	if (!execute_on_irq_stack(overflow, desc, irq)) { +	if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {  		if (unlikely(overflow))  			print_stack_overflow();  		desc->handle_irq(irq, desc); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 313fb5cddbc..43e2b1cff0a 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -306,10 +306,10 @@ void __init native_init_IRQ(void)  	 * us. (some of these will be overridden and become  	 * 'special' SMP interrupts)  	 */ -	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { +	i = FIRST_EXTERNAL_VECTOR; +	for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {  		/* IA32_SYSCALL_VECTOR could be used in trap_init already. */ -		if (!test_bit(i, used_vectors)) -			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +		set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);  	}  	if (!acpi_ioapic && !of_ioapic) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index faba5771aca..fdc37b3d0ce 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -67,8 +67,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =  	{ "ss", 4, offsetof(struct pt_regs, ss) },  	{ "ds", 4, offsetof(struct pt_regs, ds) },  	{ "es", 4, offsetof(struct pt_regs, es) }, -	{ "fs", 4, -1 }, -	{ "gs", 4, -1 },  #else  	{ "ax", 8, offsetof(struct pt_regs, ax) },  	{ "bx", 8, offsetof(struct pt_regs, bx) }, @@ -90,7 +88,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =  	{ "flags", 4, offsetof(struct pt_regs, flags) },  	{ "cs", 4, offsetof(struct pt_regs, cs) },  	{ "ss", 4, offsetof(struct pt_regs, ss) }, +	{ "ds", 4, -1 }, +	{ "es", 4, -1 },  #endif +	{ "fs", 4, -1 }, +	{ "gs", 4, -1 },  };  int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h new file mode 100644 index 00000000000..3230b68ef29 --- /dev/null +++ b/arch/x86/kernel/kprobes-common.h @@ -0,0 +1,102 @@ +#ifndef __X86_KERNEL_KPROBES_COMMON_H +#define __X86_KERNEL_KPROBES_COMMON_H + +/* Kprobes and Optprobes common header */ + +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING			\ +	/* Skip cs, ip, orig_ax. */		\ +	"	subq $24, %rsp\n"		\ +	"	pushq %rdi\n"			\ +	"	pushq %rsi\n"			\ +	"	pushq %rdx\n"			\ +	"	pushq %rcx\n"			\ +	"	pushq %rax\n"			\ +	"	pushq %r8\n"			\ +	"	pushq %r9\n"			\ +	"	pushq %r10\n"			\ +	"	pushq %r11\n"			\ +	"	pushq %rbx\n"			\ +	"	pushq %rbp\n"			\ +	"	pushq %r12\n"			\ +	"	pushq %r13\n"			\ +	"	pushq %r14\n"			\ +	"	pushq %r15\n" +#define RESTORE_REGS_STRING			\ +	"	popq %r15\n"			\ +	"	popq %r14\n"			\ +	"	popq %r13\n"			\ +	"	popq %r12\n"			\ +	"	popq %rbp\n"			\ +	"	popq %rbx\n"			\ +	"	popq %r11\n"			\ +	"	popq %r10\n"			\ +	"	popq %r9\n"			\ +	"	popq %r8\n"			\ +	"	popq %rax\n"			\ +	"	popq %rcx\n"			\ +	"	popq %rdx\n"			\ +	"	popq %rsi\n"			\ +	"	popq %rdi\n"			\ +	/* Skip orig_ax, ip, cs */		\ +	"	addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING			\ +	/* Skip cs, ip, orig_ax and gs. 
*/	\ +	"	subl $16, %esp\n"		\ +	"	pushl %fs\n"			\ +	"	pushl %es\n"			\ +	"	pushl %ds\n"			\ +	"	pushl %eax\n"			\ +	"	pushl %ebp\n"			\ +	"	pushl %edi\n"			\ +	"	pushl %esi\n"			\ +	"	pushl %edx\n"			\ +	"	pushl %ecx\n"			\ +	"	pushl %ebx\n" +#define RESTORE_REGS_STRING			\ +	"	popl %ebx\n"			\ +	"	popl %ecx\n"			\ +	"	popl %edx\n"			\ +	"	popl %esi\n"			\ +	"	popl %edi\n"			\ +	"	popl %ebp\n"			\ +	"	popl %eax\n"			\ +	/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ +	"	addl $24, %esp\n" +#endif + +/* Ensure if the instruction can be boostable */ +extern int can_boost(kprobe_opcode_t *instruction); +/* Recover instruction if given address is probed */ +extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, +					 unsigned long addr); +/* + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. + */ +extern int __copy_instruction(u8 *dest, u8 *src); + +/* Generate a relative-jump/call instruction */ +extern void synthesize_reljump(void *from, void *to); +extern void synthesize_relcall(void *from, void *to); + +#ifdef	CONFIG_OPTPROBES +extern int arch_init_optprobes(void); +extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); +extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); +#else	/* !CONFIG_OPTPROBES */ +static inline int arch_init_optprobes(void) +{ +	return 0; +} +static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ +	return 0; +} +static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ +	return addr; +} +#endif +#endif diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c new file mode 100644 index 00000000000..c5e410eed40 --- /dev/null +++ b/arch/x86/kernel/kprobes-opt.c @@ -0,0 +1,512 @@ +/* + *  Kernel Probes Jump Optimization (Optprobes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Hitachi Ltd., 2012 + */ +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/hardirq.h> +#include <linux/preempt.h> +#include <linux/module.h> +#include <linux/kdebug.h> +#include <linux/kallsyms.h> +#include <linux/ftrace.h> + +#include <asm/cacheflush.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/alternative.h> +#include <asm/insn.h> +#include <asm/debugreg.h> + +#include "kprobes-common.h" + +unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ +	struct optimized_kprobe *op; +	struct kprobe *kp; +	long offs; +	int i; + +	for (i = 0; i < RELATIVEJUMP_SIZE; i++) { +		kp = get_kprobe((void *)addr - i); +		/* This function only handles jump-optimized kprobe */ +		if (kp && kprobe_optimized(kp)) { +			op = container_of(kp, struct optimized_kprobe, kp); +			/* If op->list is not empty, op is under optimizing */ +			if (list_empty(&op->list)) +				goto found; +		} +	} + +	return addr; +found: +	/* +	 * If the kprobe can be optimized, original bytes which can be +	 * overwritten by jump destination address. In this case, original +	 * bytes must be recovered from op->optinsn.copied_insn buffer. +	 */ +	memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); +	if (addr == (unsigned long)kp->addr) { +		buf[0] = kp->opcode; +		memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); +	} else { +		offs = addr - (unsigned long)kp->addr - 1; +		memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); +	} + +	return (unsigned long)buf; +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) +{ +#ifdef CONFIG_X86_64 +	*addr++ = 0x48; +	*addr++ = 0xbf; +#else +	*addr++ = 0xb8; +#endif +	*(unsigned long *)addr = val; +} + +static void __used __kprobes kprobes_optinsn_template_holder(void) +{ +	asm volatile ( +			".global optprobe_template_entry\n" +			"optprobe_template_entry:\n" +#ifdef CONFIG_X86_64 +			/* We don't bother saving the ss register */ +			"	pushq %rsp\n" +			"	pushfq\n" +			SAVE_REGS_STRING +			"	movq %rsp, %rsi\n" +			".global optprobe_template_val\n" +			"optprobe_template_val:\n" +			ASM_NOP5 +			ASM_NOP5 +			".global optprobe_template_call\n" +			"optprobe_template_call:\n" +			ASM_NOP5 +			/* Move flags to rsp */ +			"	movq 144(%rsp), %rdx\n" +			"	movq %rdx, 152(%rsp)\n" +			RESTORE_REGS_STRING +			/* Skip flags entry */ +			"	addq $8, %rsp\n" +			"	popfq\n" +#else /* CONFIG_X86_32 */ +			"	pushf\n" +			SAVE_REGS_STRING +			"	movl %esp, %edx\n" +			".global optprobe_template_val\n" +			"optprobe_template_val:\n" +			ASM_NOP5 +			".global optprobe_template_call\n" +			"optprobe_template_call:\n" +			ASM_NOP5 +			RESTORE_REGS_STRING +			"	addl $4, %esp\n"	/* skip cs */ +			"	popf\n" +#endif +			".global optprobe_template_end\n" +			"optprobe_template_end:\n"); +} + +#define TMPL_MOVE_IDX \ +	((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ +	((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ +	((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) 
+{ +	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); +	unsigned long flags; + +	/* This is possible if op is under delayed unoptimizing */ +	if (kprobe_disabled(&op->kp)) +		return; + +	local_irq_save(flags); +	if (kprobe_running()) { +		kprobes_inc_nmissed_count(&op->kp); +	} else { +		/* Save skipped registers */ +#ifdef CONFIG_X86_64 +		regs->cs = __KERNEL_CS; +#else +		regs->cs = __KERNEL_CS | get_kernel_rpl(); +		regs->gs = 0; +#endif +		regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; +		regs->orig_ax = ~0UL; + +		__this_cpu_write(current_kprobe, &op->kp); +		kcb->kprobe_status = KPROBE_HIT_ACTIVE; +		opt_pre_handler(&op->kp, regs); +		__this_cpu_write(current_kprobe, NULL); +	} +	local_irq_restore(flags); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ +	int len = 0, ret; + +	while (len < RELATIVEJUMP_SIZE) { +		ret = __copy_instruction(dest + len, src + len); +		if (!ret || !can_boost(dest + len)) +			return -EINVAL; +		len += ret; +	} +	/* Check whether the address range is reserved */ +	if (ftrace_text_reserved(src, src + len - 1) || +	    alternatives_text_reserved(src, src + len - 1) || +	    jump_label_text_reserved(src, src + len - 1)) +		return -EBUSY; + +	return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ +	return ((insn->opcode.bytes[0] == 0xff && +		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ +		insn->opcode.bytes[0] == 0xea);	/* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ +	unsigned long target = 0; + +	switch (insn->opcode.bytes[0]) { +	case 0xe0:	/* loopne */ +	case 0xe1:	/* loope */ +	case 0xe2:	/* loop */ +	case 0xe3:	/* jcxz */ +	case 0xe9:	/* near relative jump */ +	case 0xeb:	/* short relative jump */ +		break; +	case 0x0f: +		if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ +			break; +		return 0; +	default: +		if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ +			break; +		return 0; +	} +	target = (unsigned long)insn->next_byte + insn->immediate.value; + +	return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int __kprobes can_optimize(unsigned long paddr) +{ +	unsigned long addr, size = 0, offset = 0; +	struct insn insn; +	kprobe_opcode_t buf[MAX_INSN_SIZE]; + +	/* Lookup symbol including addr */ +	if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) +		return 0; + +	/* +	 * Do not optimize in the entry code due to the unstable +	 * stack handling. +	 */ +	if ((paddr >= (unsigned long)__entry_text_start) && +	    (paddr <  (unsigned long)__entry_text_end)) +		return 0; + +	/* Check there is enough space for a relative jump. */ +	if (size - offset < RELATIVEJUMP_SIZE) +		return 0; + +	/* Decode instructions */ +	addr = paddr - offset; +	while (addr < paddr - offset + size) { /* Decode until function end */ +		if (search_exception_tables(addr)) +			/* +			 * Since some fixup code will jumps into this function, +			 * we can't optimize kprobe in this function. 
+			 */ +			return 0; +		kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); +		insn_get_length(&insn); +		/* Another subsystem puts a breakpoint */ +		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) +			return 0; +		/* Recover address */ +		insn.kaddr = (void *)addr; +		insn.next_byte = (void *)(addr + insn.length); +		/* Check any instructions don't jump into target */ +		if (insn_is_indirect_jump(&insn) || +		    insn_jump_into_range(&insn, paddr + INT3_SIZE, +					 RELATIVE_ADDR_SIZE)) +			return 0; +		addr += insn.length; +	} + +	return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ +	int i; +	struct kprobe *p; + +	for (i = 1; i < op->optinsn.size; i++) { +		p = get_kprobe(op->kp.addr + i); +		if (p && !kprobe_disabled(p)) +			return -EEXIST; +	} + +	return 0; +} + +/* Check the addr is within the optimized instructions. */ +int __kprobes +arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr) +{ +	return ((unsigned long)op->kp.addr <= addr && +		(unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ +	if (op->optinsn.insn) { +		free_optinsn_slot(op->optinsn.insn, dirty); +		op->optinsn.insn = NULL; +		op->optinsn.size = 0; +	} +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ +	__arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + * This is called when new aggr(opt)probe is allocated or reused. + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ +	u8 *buf; +	int ret; +	long rel; + +	if (!can_optimize((unsigned long)op->kp.addr)) +		return -EILSEQ; + +	op->optinsn.insn = get_optinsn_slot(); +	if (!op->optinsn.insn) +		return -ENOMEM; + +	/* +	 * Verify if the address gap is in 2GB range, because this uses +	 * a relative jump. 
+	 */ +	rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; +	if (abs(rel) > 0x7fffffff) +		return -ERANGE; + +	buf = (u8 *)op->optinsn.insn; + +	/* Copy instructions into the out-of-line buffer */ +	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); +	if (ret < 0) { +		__arch_remove_optimized_kprobe(op, 0); +		return ret; +	} +	op->optinsn.size = ret; + +	/* Copy arch-dep-instance from template */ +	memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + +	/* Set probe information */ +	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + +	/* Set probe function call */ +	synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + +	/* Set returning jmp instruction at the tail of out-of-line buffer */ +	synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, +			   (u8 *)op->kp.addr + op->optinsn.size); + +	flush_icache_range((unsigned long) buf, +			   (unsigned long) buf + TMPL_END_IDX + +			   op->optinsn.size + RELATIVEJUMP_SIZE); +	return 0; +} + +#define MAX_OPTIMIZE_PROBES 256 +static struct text_poke_param *jump_poke_params; +static struct jump_poke_buffer { +	u8 buf[RELATIVEJUMP_SIZE]; +} *jump_poke_bufs; + +static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, +					    u8 *insn_buf, +					    struct optimized_kprobe *op) +{ +	s32 rel = (s32)((long)op->optinsn.insn - +			((long)op->kp.addr + RELATIVEJUMP_SIZE)); + +	/* Backup instructions which will be replaced by jump address */ +	memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, +	       RELATIVE_ADDR_SIZE); + +	insn_buf[0] = RELATIVEJUMP_OPCODE; +	*(s32 *)(&insn_buf[1]) = rel; + +	tprm->addr = op->kp.addr; +	tprm->opcode = insn_buf; +	tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Replace breakpoints (int3) with relative jumps. + * Caller must call with locking kprobe_mutex and text_mutex. + */ +void __kprobes arch_optimize_kprobes(struct list_head *oplist) +{ +	struct optimized_kprobe *op, *tmp; +	int c = 0; + +	list_for_each_entry_safe(op, tmp, oplist, list) { +		WARN_ON(kprobe_disabled(&op->kp)); +		/* Setup param */ +		setup_optimize_kprobe(&jump_poke_params[c], +				      jump_poke_bufs[c].buf, op); +		list_del_init(&op->list); +		if (++c >= MAX_OPTIMIZE_PROBES) +			break; +	} + +	/* +	 * text_poke_smp doesn't support NMI/MCE code modifying. +	 * However, since kprobes itself also doesn't support NMI/MCE +	 * code probing, it's not a problem. +	 */ +	text_poke_smp_batch(jump_poke_params, c); +} + +static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, +					      u8 *insn_buf, +					      struct optimized_kprobe *op) +{ +	/* Set int3 to first byte for kprobes */ +	insn_buf[0] = BREAKPOINT_INSTRUCTION; +	memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + +	tprm->addr = op->kp.addr; +	tprm->opcode = insn_buf; +	tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Recover original instructions and breakpoints from relative jumps. + * Caller must call with locking kprobe_mutex. + */ +extern void arch_unoptimize_kprobes(struct list_head *oplist, +				    struct list_head *done_list) +{ +	struct optimized_kprobe *op, *tmp; +	int c = 0; + +	list_for_each_entry_safe(op, tmp, oplist, list) { +		/* Setup param */ +		setup_unoptimize_kprobe(&jump_poke_params[c], +					jump_poke_bufs[c].buf, op); +		list_move(&op->list, done_list); +		if (++c >= MAX_OPTIMIZE_PROBES) +			break; +	} + +	/* +	 * text_poke_smp doesn't support NMI/MCE code modifying. 
+	 * However, since kprobes itself also doesn't support NMI/MCE +	 * code probing, it's not a problem. +	 */ +	text_poke_smp_batch(jump_poke_params, c); +} + +/* Replace a relative jump with a breakpoint (int3).  */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ +	u8 buf[RELATIVEJUMP_SIZE]; + +	/* Set int3 to first byte for kprobes */ +	buf[0] = BREAKPOINT_INSTRUCTION; +	memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); +	text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +int  __kprobes +setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ +	struct optimized_kprobe *op; + +	if (p->flags & KPROBE_FLAG_OPTIMIZED) { +		/* This kprobe is really able to run optimized path. */ +		op = container_of(p, struct optimized_kprobe, kp); +		/* Detour through copied instructions */ +		regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; +		if (!reenter) +			reset_current_kprobe(); +		preempt_enable_no_resched(); +		return 1; +	} +	return 0; +} + +int __kprobes arch_init_optprobes(void) +{ +	/* Allocate code buffer and parameter array */ +	jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * +				 MAX_OPTIMIZE_PROBES, GFP_KERNEL); +	if (!jump_poke_bufs) +		return -ENOMEM; + +	jump_poke_params = kmalloc(sizeof(struct text_poke_param) * +				   MAX_OPTIMIZE_PROBES, GFP_KERNEL); +	if (!jump_poke_params) { +		kfree(jump_poke_bufs); +		jump_poke_bufs = NULL; +		return -ENOMEM; +	} + +	return 0; +} diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7da647d8b64..e213fc8408d 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -30,16 +30,15 @@   *		<jkenisto@us.ibm.com> and Prasanna S Panchamukhi   *		<prasanna@in.ibm.com> added function-return probes.   * 2005-May	Rusty Lynch <rusty.lynch@intel.com> - * 		Added function return probes functionality + *		Added function return probes functionality   * 2006-Feb	Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added - * 		kprobe-booster and kretprobe-booster for i386. + *		kprobe-booster and kretprobe-booster for i386.   * 2007-Dec	Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster - * 		and kretprobe-booster for x86-64 + *		and kretprobe-booster for x86-64   * 2007-Dec	Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven - * 		<arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> - * 		unified x86 kprobes code. + *		<arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> + *		unified x86 kprobes code.   
*/ -  #include <linux/kprobes.h>  #include <linux/ptrace.h>  #include <linux/string.h> @@ -59,6 +58,8 @@  #include <asm/insn.h>  #include <asm/debugreg.h> +#include "kprobes-common.h" +  void jprobe_return_end(void);  DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; @@ -108,6 +109,7 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {  			      doesn't switch kernel stack.*/  	{NULL, NULL}	/* Terminator */  }; +  const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);  static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) @@ -123,11 +125,17 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)  }  /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes synthesize_reljump(void *from, void *to) +void __kprobes synthesize_reljump(void *from, void *to)  {  	__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);  } +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +void __kprobes synthesize_relcall(void *from, void *to) +{ +	__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} +  /*   * Skip the prefixes of the instruction.   */ @@ -151,7 +159,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)   * Returns non-zero if opcode is boostable.   * RIP relative instructions are adjusted at copying time in 64 bits mode   */ -static int __kprobes can_boost(kprobe_opcode_t *opcodes) +int __kprobes can_boost(kprobe_opcode_t *opcodes)  {  	kprobe_opcode_t opcode;  	kprobe_opcode_t *orig_opcodes = opcodes; @@ -207,13 +215,15 @@ retry:  	}  } -/* Recover the probed instruction at addr for further analysis. */ -static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +static unsigned long +__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)  {  	struct kprobe *kp; +  	kp = get_kprobe((void *)addr); +	/* There is no probe, return original address */  	if (!kp) -		return -EINVAL; +		return addr;  	/*  	 *  Basically, kp->ainsn.insn has an original instruction. @@ -230,14 +240,29 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)  	 */  	memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));  	buf[0] = kp->opcode; -	return 0; +	return (unsigned long)buf; +} + +/* + * Recover the probed instruction at addr for further analysis. + * Caller must lock kprobes by kprobe_mutex, or disable preemption + * for preventing to release referencing kprobes. + */ +unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ +	unsigned long __addr; + +	__addr = __recover_optprobed_insn(buf, addr); +	if (__addr != addr) +		return __addr; + +	return __recover_probed_insn(buf, addr);  }  /* Check if paddr is at an instruction boundary */  static int __kprobes can_probe(unsigned long paddr)  { -	int ret; -	unsigned long addr, offset = 0; +	unsigned long addr, __addr, offset = 0;  	struct insn insn;  	kprobe_opcode_t buf[MAX_INSN_SIZE]; @@ -247,26 +272,24 @@ static int __kprobes can_probe(unsigned long paddr)  	/* Decode instructions */  	addr = paddr - offset;  	while (addr < paddr) { -		kernel_insn_init(&insn, (void *)addr); -		insn_get_opcode(&insn); -  		/*  		 * Check if the instruction has been modified by another  		 * kprobe, in which case we replace the breakpoint by the  		 * original instruction in our buffer. +		 * Also, jump optimization will change the breakpoint to +		 * relative-jump. 
Since the relative-jump itself is +		 * normally used, we just go through if there is no kprobe.  		 */ -		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { -			ret = recover_probed_instruction(buf, addr); -			if (ret) -				/* -				 * Another debugging subsystem might insert -				 * this breakpoint. In that case, we can't -				 * recover it. -				 */ -				return 0; -			kernel_insn_init(&insn, buf); -		} +		__addr = recover_probed_instruction(buf, addr); +		kernel_insn_init(&insn, (void *)__addr);  		insn_get_length(&insn); + +		/* +		 * Another debugging subsystem might insert this breakpoint. +		 * In that case, we can't recover it. +		 */ +		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) +			return 0;  		addr += insn.length;  	} @@ -299,24 +322,16 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)   * If not, return null.   * Only applicable to 64-bit x86.   */ -static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) +int __kprobes __copy_instruction(u8 *dest, u8 *src)  {  	struct insn insn; -	int ret;  	kprobe_opcode_t buf[MAX_INSN_SIZE]; -	kernel_insn_init(&insn, src); -	if (recover) { -		insn_get_opcode(&insn); -		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { -			ret = recover_probed_instruction(buf, -							 (unsigned long)src); -			if (ret) -				return 0; -			kernel_insn_init(&insn, buf); -		} -	} +	kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src));  	insn_get_length(&insn); +	/* Another subsystem puts a breakpoint, failed to recover */ +	if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) +		return 0;  	memcpy(dest, insn.kaddr, insn.length);  #ifdef CONFIG_X86_64 @@ -337,8 +352,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)  		 * extension of the original signed 32-bit displacement would  		 * have given.  		 */ -		newdisp = (u8 *) src + (s64) insn.displacement.value - -			  (u8 *) dest; +		newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;  		BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check.  */  		disp = (u8 *) dest + insn_offset_displacement(&insn);  		*(s32 *) disp = (s32) newdisp; @@ -349,18 +363,20 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)  static void __kprobes arch_copy_kprobe(struct kprobe *p)  { +	/* Copy an instruction with recovering if other optprobe modifies it.*/ +	__copy_instruction(p->ainsn.insn, p->addr); +  	/* -	 * Copy an instruction without recovering int3, because it will be -	 * put by another subsystem. +	 * __copy_instruction can modify the displacement of the instruction, +	 * but it doesn't affect boostable check.  	 
*/ -	__copy_instruction(p->ainsn.insn, p->addr, 0); - -	if (can_boost(p->addr)) +	if (can_boost(p->ainsn.insn))  		p->ainsn.boostable = 0;  	else  		p->ainsn.boostable = -1; -	p->opcode = *p->addr; +	/* Also, displacement change doesn't affect the first byte */ +	p->opcode = p->ainsn.insn[0];  }  int __kprobes arch_prepare_kprobe(struct kprobe *p) @@ -442,8 +458,8 @@ static void __kprobes restore_btf(void)  	}  } -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, -				      struct pt_regs *regs) +void __kprobes +arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)  {  	unsigned long *sara = stack_addr(regs); @@ -453,16 +469,8 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,  	*sara = (unsigned long) &kretprobe_trampoline;  } -#ifdef CONFIG_OPTPROBES -static int  __kprobes setup_detour_execution(struct kprobe *p, -					     struct pt_regs *regs, -					     int reenter); -#else -#define setup_detour_execution(p, regs, reenter) (0) -#endif - -static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, -				       struct kprobe_ctlblk *kcb, int reenter) +static void __kprobes +setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)  {  	if (setup_detour_execution(p, regs, reenter))  		return; @@ -504,8 +512,8 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,   * within the handler. We save the original kprobes variables and just single   * step on the instruction of the new probe without calling any user handlers.   */ -static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, -				    struct kprobe_ctlblk *kcb) +static int __kprobes +reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)  {  	switch (kcb->kprobe_status) {  	case KPROBE_HIT_SSDONE: @@ -600,69 +608,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)  	return 0;  } -#ifdef CONFIG_X86_64 -#define SAVE_REGS_STRING		\ -	/* Skip cs, ip, orig_ax. */	\ -	"	subq $24, %rsp\n"	\ -	"	pushq %rdi\n"		\ -	"	pushq %rsi\n"		\ -	"	pushq %rdx\n"		\ -	"	pushq %rcx\n"		\ -	"	pushq %rax\n"		\ -	"	pushq %r8\n"		\ -	"	pushq %r9\n"		\ -	"	pushq %r10\n"		\ -	"	pushq %r11\n"		\ -	"	pushq %rbx\n"		\ -	"	pushq %rbp\n"		\ -	"	pushq %r12\n"		\ -	"	pushq %r13\n"		\ -	"	pushq %r14\n"		\ -	"	pushq %r15\n" -#define RESTORE_REGS_STRING		\ -	"	popq %r15\n"		\ -	"	popq %r14\n"		\ -	"	popq %r13\n"		\ -	"	popq %r12\n"		\ -	"	popq %rbp\n"		\ -	"	popq %rbx\n"		\ -	"	popq %r11\n"		\ -	"	popq %r10\n"		\ -	"	popq %r9\n"		\ -	"	popq %r8\n"		\ -	"	popq %rax\n"		\ -	"	popq %rcx\n"		\ -	"	popq %rdx\n"		\ -	"	popq %rsi\n"		\ -	"	popq %rdi\n"		\ -	/* Skip orig_ax, ip, cs */	\ -	"	addq $24, %rsp\n" -#else -#define SAVE_REGS_STRING		\ -	/* Skip cs, ip, orig_ax and gs. */	\ -	"	subl $16, %esp\n"	\ -	"	pushl %fs\n"		\ -	"	pushl %es\n"		\ -	"	pushl %ds\n"		\ -	"	pushl %eax\n"		\ -	"	pushl %ebp\n"		\ -	"	pushl %edi\n"		\ -	"	pushl %esi\n"		\ -	"	pushl %edx\n"		\ -	"	pushl %ecx\n"		\ -	"	pushl %ebx\n" -#define RESTORE_REGS_STRING		\ -	"	popl %ebx\n"		\ -	"	popl %ecx\n"		\ -	"	popl %edx\n"		\ -	"	popl %esi\n"		\ -	"	popl %edi\n"		\ -	"	popl %ebp\n"		\ -	"	popl %eax\n"		\ -	/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ -	"	addl $24, %esp\n" -#endif -  /*   * When a retprobed function returns, this code saves registers and   * calls trampoline_handler() runs, which calls the kretprobe's handler. 
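As background for the synthesize_reljump()/synthesize_relcall() helpers made non-static above and for the optprobe patching in kprobes-opt.c: the rel32 immediate of a 5-byte jmp/call is measured from the end of the instruction, which is why setup_optimize_kprobe() computes the displacement as op->optinsn.insn - (op->kp.addr + RELATIVEJUMP_SIZE) and why arch_prepare_optimized_kprobe() rejects gaps larger than +/-2GB. A minimal user-space sketch of that encoding, for illustration only (encode_reljump and its parameters are invented names, not kernel interfaces):

#include <stdint.h>
#include <string.h>

/*
 * Illustrative sketch, not kernel code: encode a 5-byte x86 relative jump
 * (opcode 0xe9 followed by a little-endian rel32) from address 'from' to
 * address 'to'.  The displacement is taken from the end of the 5-byte
 * instruction, mirroring the arithmetic the optprobe code uses when it
 * replaces an int3 with a jump to its out-of-line buffer.
 */
int encode_reljump(uint8_t buf[5], unsigned long from, unsigned long to)
{
	int64_t rel = (int64_t)to - (int64_t)(from + 5);
	int32_t rel32;

	if (rel > INT32_MAX || rel < INT32_MIN)
		return -1;		/* target beyond +/-2GB reach */

	rel32 = (int32_t)rel;
	buf[0] = 0xe9;			/* jmp rel32 */
	memcpy(&buf[1], &rel32, sizeof(rel32));
	return 0;
}

Replacing the 0xe9 opcode with 0xe8 (call rel32) gives the call form that synthesize_relcall() emits when wiring the out-of-line buffer to optimized_callback().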
@@ -816,8 +761,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)   * jump instruction after the copied instruction, that jumps to the next   * instruction after the probepoint.   */ -static void __kprobes resume_execution(struct kprobe *p, -		struct pt_regs *regs, struct kprobe_ctlblk *kcb) +static void __kprobes +resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)  {  	unsigned long *tos = stack_addr(regs);  	unsigned long copy_ip = (unsigned long)p->ainsn.insn; @@ -996,8 +941,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)  /*   * Wrapper routine for handling exceptions.   */ -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, -				       unsigned long val, void *data) +int __kprobes +kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data)  {  	struct die_args *args = data;  	int ret = NOTIFY_DONE; @@ -1107,466 +1052,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)  	return 0;  } - -#ifdef CONFIG_OPTPROBES - -/* Insert a call instruction at address 'from', which calls address 'to'.*/ -static void __kprobes synthesize_relcall(void *from, void *to) -{ -	__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); -} - -/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ -static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, -					  unsigned long val) -{ -#ifdef CONFIG_X86_64 -	*addr++ = 0x48; -	*addr++ = 0xbf; -#else -	*addr++ = 0xb8; -#endif -	*(unsigned long *)addr = val; -} - -static void __used __kprobes kprobes_optinsn_template_holder(void) -{ -	asm volatile ( -			".global optprobe_template_entry\n" -			"optprobe_template_entry: \n" -#ifdef CONFIG_X86_64 -			/* We don't bother saving the ss register */ -			"	pushq %rsp\n" -			"	pushfq\n" -			SAVE_REGS_STRING -			"	movq %rsp, %rsi\n" -			".global optprobe_template_val\n" -			"optprobe_template_val: \n" -			ASM_NOP5 -			ASM_NOP5 -			".global optprobe_template_call\n" -			"optprobe_template_call: \n" -			ASM_NOP5 -			/* Move flags to rsp */ -			"	movq 144(%rsp), %rdx\n" -			"	movq %rdx, 152(%rsp)\n" -			RESTORE_REGS_STRING -			/* Skip flags entry */ -			"	addq $8, %rsp\n" -			"	popfq\n" -#else /* CONFIG_X86_32 */ -			"	pushf\n" -			SAVE_REGS_STRING -			"	movl %esp, %edx\n" -			".global optprobe_template_val\n" -			"optprobe_template_val: \n" -			ASM_NOP5 -			".global optprobe_template_call\n" -			"optprobe_template_call: \n" -			ASM_NOP5 -			RESTORE_REGS_STRING -			"	addl $4, %esp\n"	/* skip cs */ -			"	popf\n" -#endif -			".global optprobe_template_end\n" -			"optprobe_template_end: \n"); -} - -#define TMPL_MOVE_IDX \ -	((long)&optprobe_template_val - (long)&optprobe_template_entry) -#define TMPL_CALL_IDX \ -	((long)&optprobe_template_call - (long)&optprobe_template_entry) -#define TMPL_END_IDX \ -	((long)&optprobe_template_end - (long)&optprobe_template_entry) - -#define INT3_SIZE sizeof(kprobe_opcode_t) - -/* Optimized kprobe call back function: called from optinsn */ -static void __kprobes optimized_callback(struct optimized_kprobe *op, -					 struct pt_regs *regs) -{ -	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); -	unsigned long flags; - -	/* This is possible if op is under delayed unoptimizing */ -	if (kprobe_disabled(&op->kp)) -		return; - -	local_irq_save(flags); -	if (kprobe_running()) { -		kprobes_inc_nmissed_count(&op->kp); -	} else { -		/* Save skipped registers */ -#ifdef CONFIG_X86_64 -		regs->cs = __KERNEL_CS; -#else -		regs->cs 
= __KERNEL_CS | get_kernel_rpl(); -		regs->gs = 0; -#endif -		regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; -		regs->orig_ax = ~0UL; - -		__this_cpu_write(current_kprobe, &op->kp); -		kcb->kprobe_status = KPROBE_HIT_ACTIVE; -		opt_pre_handler(&op->kp, regs); -		__this_cpu_write(current_kprobe, NULL); -	} -	local_irq_restore(flags); -} - -static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) -{ -	int len = 0, ret; - -	while (len < RELATIVEJUMP_SIZE) { -		ret = __copy_instruction(dest + len, src + len, 1); -		if (!ret || !can_boost(dest + len)) -			return -EINVAL; -		len += ret; -	} -	/* Check whether the address range is reserved */ -	if (ftrace_text_reserved(src, src + len - 1) || -	    alternatives_text_reserved(src, src + len - 1) || -	    jump_label_text_reserved(src, src + len - 1)) -		return -EBUSY; - -	return len; -} - -/* Check whether insn is indirect jump */ -static int __kprobes insn_is_indirect_jump(struct insn *insn) -{ -	return ((insn->opcode.bytes[0] == 0xff && -		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ -		insn->opcode.bytes[0] == 0xea);	/* Segment based jump */ -} - -/* Check whether insn jumps into specified address range */ -static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) -{ -	unsigned long target = 0; - -	switch (insn->opcode.bytes[0]) { -	case 0xe0:	/* loopne */ -	case 0xe1:	/* loope */ -	case 0xe2:	/* loop */ -	case 0xe3:	/* jcxz */ -	case 0xe9:	/* near relative jump */ -	case 0xeb:	/* short relative jump */ -		break; -	case 0x0f: -		if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ -			break; -		return 0; -	default: -		if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ -			break; -		return 0; -	} -	target = (unsigned long)insn->next_byte + insn->immediate.value; - -	return (start <= target && target <= start + len); -} - -/* Decode whole function to ensure any instructions don't jump into target */ -static int __kprobes can_optimize(unsigned long paddr) -{ -	int ret; -	unsigned long addr, size = 0, offset = 0; -	struct insn insn; -	kprobe_opcode_t buf[MAX_INSN_SIZE]; - -	/* Lookup symbol including addr */ -	if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) -		return 0; - -	/* -	 * Do not optimize in the entry code due to the unstable -	 * stack handling. -	 */ -	if ((paddr >= (unsigned long )__entry_text_start) && -	    (paddr <  (unsigned long )__entry_text_end)) -		return 0; - -	/* Check there is enough space for a relative jump. */ -	if (size - offset < RELATIVEJUMP_SIZE) -		return 0; - -	/* Decode instructions */ -	addr = paddr - offset; -	while (addr < paddr - offset + size) { /* Decode until function end */ -		if (search_exception_tables(addr)) -			/* -			 * Since some fixup code will jumps into this function, -			 * we can't optimize kprobe in this function. -			 */ -			return 0; -		kernel_insn_init(&insn, (void *)addr); -		insn_get_opcode(&insn); -		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { -			ret = recover_probed_instruction(buf, addr); -			if (ret) -				return 0; -			kernel_insn_init(&insn, buf); -		} -		insn_get_length(&insn); -		/* Recover address */ -		insn.kaddr = (void *)addr; -		insn.next_byte = (void *)(addr + insn.length); -		/* Check any instructions don't jump into target */ -		if (insn_is_indirect_jump(&insn) || -		    insn_jump_into_range(&insn, paddr + INT3_SIZE, -					 RELATIVE_ADDR_SIZE)) -			return 0; -		addr += insn.length; -	} - -	return 1; -} - -/* Check optimized_kprobe can actually be optimized. 
*/ -int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) -{ -	int i; -	struct kprobe *p; - -	for (i = 1; i < op->optinsn.size; i++) { -		p = get_kprobe(op->kp.addr + i); -		if (p && !kprobe_disabled(p)) -			return -EEXIST; -	} - -	return 0; -} - -/* Check the addr is within the optimized instructions. */ -int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, -					   unsigned long addr) -{ -	return ((unsigned long)op->kp.addr <= addr && -		(unsigned long)op->kp.addr + op->optinsn.size > addr); -} - -/* Free optimized instruction slot */ -static __kprobes -void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) -{ -	if (op->optinsn.insn) { -		free_optinsn_slot(op->optinsn.insn, dirty); -		op->optinsn.insn = NULL; -		op->optinsn.size = 0; -	} -} - -void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) -{ -	__arch_remove_optimized_kprobe(op, 1); -} - -/* - * Copy replacing target instructions - * Target instructions MUST be relocatable (checked inside) - */ -int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) -{ -	u8 *buf; -	int ret; -	long rel; - -	if (!can_optimize((unsigned long)op->kp.addr)) -		return -EILSEQ; - -	op->optinsn.insn = get_optinsn_slot(); -	if (!op->optinsn.insn) -		return -ENOMEM; - -	/* -	 * Verify if the address gap is in 2GB range, because this uses -	 * a relative jump. -	 */ -	rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; -	if (abs(rel) > 0x7fffffff) -		return -ERANGE; - -	buf = (u8 *)op->optinsn.insn; - -	/* Copy instructions into the out-of-line buffer */ -	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); -	if (ret < 0) { -		__arch_remove_optimized_kprobe(op, 0); -		return ret; -	} -	op->optinsn.size = ret; - -	/* Copy arch-dep-instance from template */ -	memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); - -	/* Set probe information */ -	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); - -	/* Set probe function call */ -	synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); - -	/* Set returning jmp instruction at the tail of out-of-line buffer */ -	synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, -			   (u8 *)op->kp.addr + op->optinsn.size); - -	flush_icache_range((unsigned long) buf, -			   (unsigned long) buf + TMPL_END_IDX + -			   op->optinsn.size + RELATIVEJUMP_SIZE); -	return 0; -} - -#define MAX_OPTIMIZE_PROBES 256 -static struct text_poke_param *jump_poke_params; -static struct jump_poke_buffer { -	u8 buf[RELATIVEJUMP_SIZE]; -} *jump_poke_bufs; - -static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, -					    u8 *insn_buf, -					    struct optimized_kprobe *op) -{ -	s32 rel = (s32)((long)op->optinsn.insn - -			((long)op->kp.addr + RELATIVEJUMP_SIZE)); - -	/* Backup instructions which will be replaced by jump address */ -	memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, -	       RELATIVE_ADDR_SIZE); - -	insn_buf[0] = RELATIVEJUMP_OPCODE; -	*(s32 *)(&insn_buf[1]) = rel; - -	tprm->addr = op->kp.addr; -	tprm->opcode = insn_buf; -	tprm->len = RELATIVEJUMP_SIZE; -} - -/* - * Replace breakpoints (int3) with relative jumps. - * Caller must call with locking kprobe_mutex and text_mutex. 
- */ -void __kprobes arch_optimize_kprobes(struct list_head *oplist) -{ -	struct optimized_kprobe *op, *tmp; -	int c = 0; - -	list_for_each_entry_safe(op, tmp, oplist, list) { -		WARN_ON(kprobe_disabled(&op->kp)); -		/* Setup param */ -		setup_optimize_kprobe(&jump_poke_params[c], -				      jump_poke_bufs[c].buf, op); -		list_del_init(&op->list); -		if (++c >= MAX_OPTIMIZE_PROBES) -			break; -	} - -	/* -	 * text_poke_smp doesn't support NMI/MCE code modifying. -	 * However, since kprobes itself also doesn't support NMI/MCE -	 * code probing, it's not a problem. -	 */ -	text_poke_smp_batch(jump_poke_params, c); -} - -static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, -					      u8 *insn_buf, -					      struct optimized_kprobe *op) -{ -	/* Set int3 to first byte for kprobes */ -	insn_buf[0] = BREAKPOINT_INSTRUCTION; -	memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - -	tprm->addr = op->kp.addr; -	tprm->opcode = insn_buf; -	tprm->len = RELATIVEJUMP_SIZE; -} - -/* - * Recover original instructions and breakpoints from relative jumps. - * Caller must call with locking kprobe_mutex. - */ -extern void arch_unoptimize_kprobes(struct list_head *oplist, -				    struct list_head *done_list) -{ -	struct optimized_kprobe *op, *tmp; -	int c = 0; - -	list_for_each_entry_safe(op, tmp, oplist, list) { -		/* Setup param */ -		setup_unoptimize_kprobe(&jump_poke_params[c], -					jump_poke_bufs[c].buf, op); -		list_move(&op->list, done_list); -		if (++c >= MAX_OPTIMIZE_PROBES) -			break; -	} - -	/* -	 * text_poke_smp doesn't support NMI/MCE code modifying. -	 * However, since kprobes itself also doesn't support NMI/MCE -	 * code probing, it's not a problem. -	 */ -	text_poke_smp_batch(jump_poke_params, c); -} - -/* Replace a relative jump with a breakpoint (int3).  */ -void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) -{ -	u8 buf[RELATIVEJUMP_SIZE]; - -	/* Set int3 to first byte for kprobes */ -	buf[0] = BREAKPOINT_INSTRUCTION; -	memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); -	text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); -} - -static int  __kprobes setup_detour_execution(struct kprobe *p, -					     struct pt_regs *regs, -					     int reenter) -{ -	struct optimized_kprobe *op; - -	if (p->flags & KPROBE_FLAG_OPTIMIZED) { -		/* This kprobe is really able to run optimized path. 
*/ -		op = container_of(p, struct optimized_kprobe, kp); -		/* Detour through copied instructions */ -		regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; -		if (!reenter) -			reset_current_kprobe(); -		preempt_enable_no_resched(); -		return 1; -	} -	return 0; -} - -static int __kprobes init_poke_params(void) -{ -	/* Allocate code buffer and parameter array */ -	jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * -				 MAX_OPTIMIZE_PROBES, GFP_KERNEL); -	if (!jump_poke_bufs) -		return -ENOMEM; - -	jump_poke_params = kmalloc(sizeof(struct text_poke_param) * -				   MAX_OPTIMIZE_PROBES, GFP_KERNEL); -	if (!jump_poke_params) { -		kfree(jump_poke_bufs); -		jump_poke_bufs = NULL; -		return -ENOMEM; -	} - -	return 0; -} -#else	/* !CONFIG_OPTPROBES */ -static int __kprobes init_poke_params(void) -{ -	return 0; -} -#endif -  int __init arch_init_kprobes(void)  { -	return init_poke_params(); +	return arch_init_optprobes();  }  int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f0c6fd6f176..694d801bf60 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -438,9 +438,9 @@ void __init kvm_guest_init(void)  static __init int activate_jump_labels(void)  {  	if (has_steal_clock) { -		jump_label_inc(¶virt_steal_enabled); +		static_key_slow_inc(¶virt_steal_enabled);  		if (steal_acc) -			jump_label_inc(¶virt_steal_rq_enabled); +			static_key_slow_inc(¶virt_steal_rq_enabled);  	}  	return 0; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 44842d756b2..f8492da65bf 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -136,6 +136,15 @@ int kvm_register_clock(char *txt)  	return ret;  } +static void kvm_save_sched_clock_state(void) +{ +} + +static void kvm_restore_sched_clock_state(void) +{ +	kvm_register_clock("primary cpu clock, resume"); +} +  #ifdef CONFIG_X86_LOCAL_APIC  static void __cpuinit kvm_setup_secondary_clock(void)  { @@ -144,8 +153,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)  	 * we shouldn't fail.  	 */  	WARN_ON(kvm_register_clock("secondary cpu clock")); -	/* ok, done with our trickery, call native */ -	setup_secondary_APIC_clock();  }  #endif @@ -194,9 +201,11 @@ void __init kvmclock_init(void)  	x86_platform.get_wallclock = kvm_get_wallclock;  	x86_platform.set_wallclock = kvm_set_wallclock;  #ifdef CONFIG_X86_LOCAL_APIC -	x86_cpuinit.setup_percpu_clockev = +	x86_cpuinit.early_percpu_clock_init =  		kvm_setup_secondary_clock;  #endif +	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; +	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;  	machine_ops.shutdown  = kvm_shutdown;  #ifdef CONFIG_KEXEC  	machine_ops.crash_shutdown  = kvm_crash_shutdown; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index fe86493f3ed..73465aab28f 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -311,13 +311,33 @@ out:  	return state;  } +/* + * AMD microcode firmware naming convention, up to family 15h they are in + * the legacy file: + * + *    amd-ucode/microcode_amd.bin + * + * This legacy file is always smaller than 2K in size. + * + * Starting at family 15h they are in family specific firmware files: + * + *    amd-ucode/microcode_amd_fam15h.bin + *    amd-ucode/microcode_amd_fam16h.bin + *    ... + * + * These might be larger than 2K. 
+ */  static enum ucode_state request_microcode_amd(int cpu, struct device *device)  { -	const char *fw_name = "amd-ucode/microcode_amd.bin"; +	char fw_name[36] = "amd-ucode/microcode_amd.bin";  	const struct firmware *fw;  	enum ucode_state ret = UCODE_NFOUND; +	struct cpuinfo_x86 *c = &cpu_data(cpu); + +	if (c->x86 >= 0x15) +		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); -	if (request_firmware(&fw, fw_name, device)) { +	if (request_firmware(&fw, (const char *)fw_name, device)) {  		pr_err("failed to load file %s\n", fw_name);  		goto out;  	} @@ -340,7 +360,6 @@ out:  static enum ucode_state  request_microcode_user(int cpu, const void __user *buf, size_t size)  { -	pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");  	return UCODE_ERROR;  } diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fda91c30710..87a0f868830 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -86,6 +86,7 @@  #include <asm/microcode.h>  #include <asm/processor.h> +#include <asm/cpu_device_id.h>  MODULE_DESCRIPTION("Microcode Update Driver");  MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); @@ -504,6 +505,20 @@ static struct notifier_block __refdata mc_cpu_notifier = {  	.notifier_call	= mc_cpu_callback,  }; +#ifdef MODULE +/* Autoload on Intel and AMD systems */ +static const struct x86_cpu_id microcode_id[] = { +#ifdef CONFIG_MICROCODE_INTEL +	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif +#ifdef CONFIG_MICROCODE_AMD +	{ X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif +	{} +}; +MODULE_DEVICE_TABLE(x86cpu, microcode_id); +#endif +  static int __init microcode_init(void)  {  	struct cpuinfo_x86 *c = &cpu_data(0); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 0d01a8ea4e1..2c39dcd510f 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -12,6 +12,7 @@  #include <linux/smp.h>  #include <linux/cpumask.h>  #include <linux/delay.h> +#include <linux/init.h>  #include <asm/apic.h>  #include <asm/nmi.h> @@ -20,35 +21,35 @@  #define FAILURE		1  #define TIMEOUT		2 -static int nmi_fail; +static int __initdata nmi_fail;  /* check to see if NMI IPIs work on this machine */ -static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata; -static int testcase_total; -static int testcase_successes; -static int expected_testcase_failures; -static int unexpected_testcase_failures; -static int unexpected_testcase_unknowns; +static int __initdata testcase_total; +static int __initdata testcase_successes; +static int __initdata expected_testcase_failures; +static int __initdata unexpected_testcase_failures; +static int __initdata unexpected_testcase_unknowns; -static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) +static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)  {  	unexpected_testcase_unknowns++;  	return NMI_HANDLED;  } -static void init_nmi_testsuite(void) +static void __init init_nmi_testsuite(void)  {  	/* trap all the unknown NMIs we may generate */  	register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");  } -static void cleanup_nmi_testsuite(void) +static void __init cleanup_nmi_testsuite(void)  {  	unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");  } -static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +static int __init test_nmi_ipi_callback(unsigned int val, struct 
pt_regs *regs)  {          int cpu = raw_smp_processor_id(); @@ -58,7 +59,7 @@ static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)          return NMI_DONE;  } -static void test_nmi_ipi(struct cpumask *mask) +static void __init test_nmi_ipi(struct cpumask *mask)  {  	unsigned long timeout; @@ -86,7 +87,7 @@ static void test_nmi_ipi(struct cpumask *mask)  	return;  } -static void remote_ipi(void) +static void __init remote_ipi(void)  {  	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);  	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); @@ -94,19 +95,19 @@ static void remote_ipi(void)  		test_nmi_ipi(to_cpumask(nmi_ipi_mask));  } -static void local_ipi(void) +static void __init local_ipi(void)  {  	cpumask_clear(to_cpumask(nmi_ipi_mask));  	cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));  	test_nmi_ipi(to_cpumask(nmi_ipi_mask));  } -static void reset_nmi(void) +static void __init reset_nmi(void)  {  	nmi_fail = 0;  } -static void dotest(void (*testcase_fn)(void), int expected) +static void __init dotest(void (*testcase_fn)(void), int expected)  {  	testcase_fn();  	/* @@ -131,12 +132,12 @@ static void dotest(void (*testcase_fn)(void), int expected)  	reset_nmi();  } -static inline void print_testname(const char *testname) +static inline void __init print_testname(const char *testname)  {  	printk("%12s:", testname);  } -void nmi_selftest(void) +void __init nmi_selftest(void)  {  	init_nmi_testsuite(); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d90272e6bc4..9c57c02e54f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -26,6 +26,7 @@  #include <asm/bug.h>  #include <asm/paravirt.h> +#include <asm/debugreg.h>  #include <asm/desc.h>  #include <asm/setup.h>  #include <asm/pgtable.h> @@ -202,8 +203,8 @@ static void native_flush_tlb_single(unsigned long addr)  	__native_flush_tlb_single(addr);  } -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled;  static u64 native_steal_clock(int cpu)  { diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1c4d769e21e..28e5e06fcba 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -262,10 +262,11 @@ rootfs_initcall(pci_iommu_init);  static __devinit void via_no_dac(struct pci_dev *dev)  { -	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { +	if (forbid_dac == 0) {  		dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");  		forbid_dac = 1;  	}  } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, +				PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);  #endif diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 34e06e84ce3..0bc72e2069e 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -12,6 +12,7 @@  #include <linux/pci.h>  #include <linux/export.h> +#include <asm/probe_roms.h>  #include <asm/pci-direct.h>  #include <asm/e820.h>  #include <asm/mmzone.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 15763af7bfe..14baf78d5a1 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -21,6 +21,7 @@  #include <asm/idle.h>  #include <asm/uaccess.h>  #include <asm/i387.h> +#include <asm/fpu-internal.h>  #include <asm/debugreg.h>  struct kmem_cache *task_xstate_cachep; @@ -377,8 +378,8 @@ static 
inline int hlt_use_halt(void)  void default_idle(void)  {  	if (hlt_use_halt()) { -		trace_power_start(POWER_CSTATE, 1, smp_processor_id()); -		trace_cpu_idle(1, smp_processor_id()); +		trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); +		trace_cpu_idle_rcuidle(1, smp_processor_id());  		current_thread_info()->status &= ~TS_POLLING;  		/*  		 * TS_POLLING-cleared state must be visible before we @@ -391,8 +392,8 @@ void default_idle(void)  		else  			local_irq_enable();  		current_thread_info()->status |= TS_POLLING; -		trace_power_end(smp_processor_id()); -		trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); +		trace_power_end_rcuidle(smp_processor_id()); +		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());  	} else {  		local_irq_enable();  		/* loop is done by the caller */ @@ -450,8 +451,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);  static void mwait_idle(void)  {  	if (!need_resched()) { -		trace_power_start(POWER_CSTATE, 1, smp_processor_id()); -		trace_cpu_idle(1, smp_processor_id()); +		trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); +		trace_cpu_idle_rcuidle(1, smp_processor_id());  		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))  			clflush((void *)¤t_thread_info()->flags); @@ -461,8 +462,8 @@ static void mwait_idle(void)  			__sti_mwait(0, 0);  		else  			local_irq_enable(); -		trace_power_end(smp_processor_id()); -		trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); +		trace_power_end_rcuidle(smp_processor_id()); +		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());  	} else  		local_irq_enable();  } @@ -474,13 +475,13 @@ static void mwait_idle(void)   */  static void poll_idle(void)  { -	trace_power_start(POWER_CSTATE, 0, smp_processor_id()); -	trace_cpu_idle(0, smp_processor_id()); +	trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id()); +	trace_cpu_idle_rcuidle(0, smp_processor_id());  	local_irq_enable();  	while (!need_resched())  		cpu_relax(); -	trace_power_end(smp_processor_id()); -	trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); +	trace_power_end_rcuidle(smp_processor_id()); +	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());  }  /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 485204f58cd..9d7d4842bfa 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -45,6 +45,7 @@  #include <asm/ldt.h>  #include <asm/processor.h>  #include <asm/i387.h> +#include <asm/fpu-internal.h>  #include <asm/desc.h>  #ifdef CONFIG_MATH_EMULATION  #include <asm/math_emu.h> @@ -119,9 +120,7 @@ void cpu_idle(void)  		}  		rcu_idle_exit();  		tick_nohz_idle_exit(); -		preempt_enable_no_resched(); -		schedule(); -		preempt_disable(); +		schedule_preempt_disabled();  	}  } @@ -214,6 +213,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,  	task_user_gs(p) = get_user_gs(regs); +	p->fpu_counter = 0;  	p->thread.io_bitmap_ptr = NULL;  	tsk = current;  	err = -ENOMEM; @@ -299,22 +299,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  				 *next = &next_p->thread;  	int cpu = smp_processor_id();  	struct tss_struct *tss = &per_cpu(init_tss, cpu); -	bool preload_fpu; +	fpu_switch_t fpu;  	/* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ -	/* -	 * If the task has used fpu the last 5 timeslices, just do a full -	 * restore of the math state immediately to avoid the trap; the -	 * chances of needing FPU soon are obviously high now -	 */ -	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; - -	__unlazy_fpu(prev_p); - -	/* we're going to use this soon, after a few expensive things */ -	if (preload_fpu) -		prefetch(next->fpu.state); +	fpu = switch_fpu_prepare(prev_p, next_p, cpu);  	/*  	 * Reload esp0. @@ -354,11 +343,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))  		__switch_to_xtra(prev_p, next_p, tss); -	/* If we're going to preload the fpu context, make sure clts -	   is run while we're batching the cpu state updates. */ -	if (preload_fpu) -		clts(); -  	/*  	 * Leave lazy mode, flushing any hypercalls made here.  	 * This must be done before restoring TLS segments so @@ -368,15 +352,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	 */  	arch_end_context_switch(next_p); -	if (preload_fpu) -		__math_state_restore(); -  	/*  	 * Restore %gs if needed (which is common)  	 */  	if (prev->gs | next->gs)  		lazy_load_gs(next->gs); +	switch_fpu_finish(next_p, fpu); +  	percpu_write(current_task, next_p);  	return prev_p; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9b9fe4a85c8..292da13fc5a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -43,6 +43,7 @@  #include <asm/system.h>  #include <asm/processor.h>  #include <asm/i387.h> +#include <asm/fpu-internal.h>  #include <asm/mmu_context.h>  #include <asm/prctl.h>  #include <asm/desc.h> @@ -156,9 +157,7 @@ void cpu_idle(void)  		}  		tick_nohz_idle_exit(); -		preempt_enable_no_resched(); -		schedule(); -		preempt_disable(); +		schedule_preempt_disabled();  	}  } @@ -286,6 +285,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,  	set_tsk_thread_flag(p, TIF_FORK); +	p->fpu_counter = 0;  	p->thread.io_bitmap_ptr = NULL;  	savesegment(gs, p->thread.gsindex); @@ -341,6 +341,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,  	loadsegment(es, _ds);  	loadsegment(ds, _ds);  	load_gs_index(0); +	current->thread.usersp	= new_sp;  	regs->ip		= new_ip;  	regs->sp		= new_sp;  	percpu_write(old_rsp, new_sp); @@ -386,18 +387,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	int cpu = smp_processor_id();  	struct tss_struct *tss = &per_cpu(init_tss, cpu);  	unsigned fsindex, gsindex; -	bool preload_fpu; +	fpu_switch_t fpu; -	/* -	 * If the task has used fpu the last 5 timeslices, just do a full -	 * restore of the math state immediately to avoid the trap; the -	 * chances of needing FPU soon are obviously high now -	 */ -	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; - -	/* we're going to use this soon, after a few expensive things */ -	if (preload_fpu) -		prefetch(next->fpu.state); +	fpu = switch_fpu_prepare(prev_p, next_p, cpu);  	/*  	 * Reload esp0, LDT and the page table pointer: @@ -427,13 +419,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	load_TLS(next, cpu); -	/* Must be after DS reload */ -	__unlazy_fpu(prev_p); - -	/* Make sure cpu is ready for new context */ -	if (preload_fpu) -		clts(); -  	/*  	 * Leave lazy mode, flushing any hypercalls made here.  	 
* This must be done before restoring TLS segments so @@ -474,6 +459,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);  	prev->gsindex = gsindex; +	switch_fpu_finish(next_p, fpu); +  	/*  	 * Switch the PDA and FPU contexts.  	 */ @@ -492,13 +479,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))  		__switch_to_xtra(prev_p, next_p, tss); -	/* -	 * Preload the FPU context, now that we've determined that the -	 * task is likely to be using it.  -	 */ -	if (preload_fpu) -		__math_state_restore(); -  	return prev_p;  } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 50267386b76..78f05e438be 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -27,6 +27,7 @@  #include <asm/system.h>  #include <asm/processor.h>  #include <asm/i387.h> +#include <asm/fpu-internal.h>  #include <asm/debugreg.h>  #include <asm/ldt.h>  #include <asm/desc.h> diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 37a458b521a..d840e69a853 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -39,6 +39,14 @@ static int reboot_mode;  enum reboot_type reboot_type = BOOT_ACPI;  int reboot_force; +/* This variable is used privately to keep track of whether or not + * reboot_type is still set to its default value (i.e., reboot= hasn't + * been set on the command line).  This is needed so that we can + * suppress DMI scanning for reboot quirks.  Without it, it's + * impossible to override a faulty reboot quirk without recompiling. + */ +static int reboot_default = 1; +  #if defined(CONFIG_X86_32) && defined(CONFIG_SMP)  static int reboot_cpu = -1;  #endif @@ -67,6 +75,12 @@ bool port_cf9_safe = false;  static int __init reboot_setup(char *str)  {  	for (;;) { +		/* Having anything passed on the command line via +		 * reboot= will cause us to disable DMI checking +		 * below. 
+		 */ +		reboot_default = 0; +  		switch (*str) {  		case 'w':  			reboot_mode = 0x1234; @@ -295,14 +309,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_BOARD_NAME, "P4S800"),  		},  	}, -	{	/* Handle problems with rebooting on VersaLogic Menlow boards */ -		.callback = set_bios_reboot, -		.ident = "VersaLogic Menlow based board", -		.matches = { -			DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"), -			DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"), -		}, -	},  	{ /* Handle reboot issue on Acer Aspire one */  		.callback = set_kbd_reboot,  		.ident = "Acer Aspire One A110", @@ -316,7 +322,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  static int __init reboot_init(void)  { -	dmi_check_system(reboot_dmi_table); +	/* Only do the DMI check if reboot_type hasn't been overridden +	 * on the command line +	 */ +	if (reboot_default) { +		dmi_check_system(reboot_dmi_table); +	}  	return 0;  }  core_initcall(reboot_init); @@ -465,7 +476,12 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {  static int __init pci_reboot_init(void)  { -	dmi_check_system(pci_reboot_dmi_table); +	/* Only do the DMI check if reboot_type hasn't been overridden +	 * on the command line +	 */ +	if (reboot_default) { +		dmi_check_system(pci_reboot_dmi_table); +	}  	return 0;  }  core_initcall(pci_reboot_init); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d7d5099fe87..88638883176 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -749,10 +749,16 @@ void __init setup_arch(char **cmdline_p)  #endif  #ifdef CONFIG_EFI  	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, -		     EFI_LOADER_SIGNATURE, 4)) { +		     "EL32", 4)) {  		efi_enabled = 1; -		efi_memblock_x86_reserve_range(); +		efi_64bit = false; +	} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, +		     "EL64", 4)) { +		efi_enabled = 1; +		efi_64bit = true;  	} +	if (efi_enabled && efi_memblock_x86_reserve_range()) +		efi_enabled = 0;  #endif  	x86_init.oem.arch_setup(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 46a01bdc27e..25edcfc9ba5 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -24,6 +24,7 @@  #include <asm/processor.h>  #include <asm/ucontext.h>  #include <asm/i387.h> +#include <asm/fpu-internal.h>  #include <asm/vdso.h>  #include <asm/mce.h> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d1..5104a2b685c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused)  	 * most necessary things.  	 */  	cpu_init(); +	x86_cpuinit.early_percpu_clock_init();  	preempt_disable();  	smp_callin(); @@ -291,19 +292,6 @@ notrace static void __cpuinit start_secondary(void *unused)  	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;  	x86_platform.nmi_init(); -	/* -	 * Wait until the cpu which brought this one up marked it -	 * online before enabling interrupts. If we don't do that then -	 * we can end up waking up the softirq thread before this cpu -	 * reached the active state, which makes the scheduler unhappy -	 * and schedule the softirq thread on the wrong cpu. This is -	 * only observable with forced threaded interrupts, but in -	 * theory it could also happen w/o them. It's just way harder -	 * to achieve. 
-	 */ -	while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) -		cpu_relax(); -  	/* enable local interrupts */  	local_irq_enable(); @@ -740,8 +728,6 @@ do_rest:  	 * the targeted processor.  	 */ -	printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip); -  	atomic_set(&init_deasserted, 0);  	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { @@ -791,9 +777,10 @@ do_rest:  			schedule();  		} -		if (cpumask_test_cpu(cpu, cpu_callin_mask)) +		if (cpumask_test_cpu(cpu, cpu_callin_mask)) { +			print_cpu_msr(&cpu_data(cpu));  			pr_debug("CPU%d: has booted.\n", cpu); -		else { +		} else {  			boot_error = 1;  			if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)  			    == 0xA5A5A5A5) @@ -847,7 +834,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)  	if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||  	    !physid_isset(apicid, phys_cpu_present_map) || -	    (!x2apic_mode && apicid >= 255)) { +	    !apic->apic_id_valid(apicid)) {  		printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);  		return -EINVAL;  	} diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 051489082d5..ef59642ff1b 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,  {  	struct vm_area_struct *vma;  	struct mm_struct *mm = current->mm; -	unsigned long addr = addr0; +	unsigned long addr = addr0, start_addr;  	/* requested length too big for entire address space */  	if (len > TASK_SIZE) @@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,  		mm->free_area_cache = mm->mmap_base;  	} +try_again:  	/* either no address requested or can't fit in requested address hole */ -	addr = mm->free_area_cache; - -	/* make sure it can fit in the remaining address space */ -	if (addr > len) { -		unsigned long tmp_addr = align_addr(addr - len, filp, -						    ALIGN_TOPDOWN); - -		vma = find_vma(mm, tmp_addr); -		if (!vma || tmp_addr + len <= vma->vm_start) -			/* remember the address as a hint for next time */ -			return mm->free_area_cache = tmp_addr; -	} - -	if (mm->mmap_base < len) -		goto bottomup; +	start_addr = addr = mm->free_area_cache; -	addr = mm->mmap_base-len; +	if (addr < len) +		goto fail; +	addr -= len;  	do {  		addr = align_addr(addr, filp, ALIGN_TOPDOWN); @@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,  		addr = vma->vm_start-len;  	} while (len < vma->vm_start); +fail: +	/* +	 * if hint left us with no space for the requested +	 * mapping then try again: +	 */ +	if (start_addr != mm->mmap_base) { +		mm->free_area_cache = mm->mmap_base; +		mm->cached_hole_size = 0; +		goto try_again; +	} +  bottomup:  	/*  	 * A failed mmap() very likely causes application failure, diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index dd5fbf4101f..c6eba2b4267 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -57,9 +57,6 @@ EXPORT_SYMBOL(profile_pc);   */  static irqreturn_t timer_interrupt(int irq, void *dev_id)  { -	/* Keep nmi watchdog up to date */ -	inc_irq_stat(irq0_irqs); -  	global_clock_event->event_handler(global_clock_event);  	/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 482ec3af206..ec61d4c1b93 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -54,6 +54,7 @@  #include <asm/traps.h>  #include <asm/desc.h>  
#include <asm/i387.h> +#include <asm/fpu-internal.h>  #include <asm/mce.h>  #include <asm/mach_traps.h> @@ -571,41 +572,18 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)  }  /* - * __math_state_restore assumes that cr0.TS is already clear and the - * fpu state is all ready for use.  Used during context switch. - */ -void __math_state_restore(void) -{ -	struct thread_info *thread = current_thread_info(); -	struct task_struct *tsk = thread->task; - -	/* -	 * Paranoid restore. send a SIGSEGV if we fail to restore the state. -	 */ -	if (unlikely(restore_fpu_checking(tsk))) { -		stts(); -		force_sig(SIGSEGV, tsk); -		return; -	} - -	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */ -	tsk->fpu_counter++; -} - -/*   * 'math_state_restore()' saves the current math information in the   * old math state array, and gets the new ones from the current task   *   * Careful.. There are problems with IBM-designed IRQ13 behaviour.   * Don't touch unless you *really* know how it works.   * - * Must be called with kernel preemption disabled (in this case, - * local interrupts are disabled at the call-site in entry.S). + * Must be called with kernel preemption disabled (eg with local + * local interrupts as in the case of do_device_not_available).   */ -asmlinkage void math_state_restore(void) +void math_state_restore(void)  { -	struct thread_info *thread = current_thread_info(); -	struct task_struct *tsk = thread->task; +	struct task_struct *tsk = current;  	if (!tsk_used_math(tsk)) {  		local_irq_enable(); @@ -622,9 +600,17 @@ asmlinkage void math_state_restore(void)  		local_irq_disable();  	} -	clts();				/* Allow maths ops (or we recurse) */ +	__thread_fpu_begin(tsk); +	/* +	 * Paranoid restore. send a SIGSEGV if we fail to restore the state. +	 */ +	if (unlikely(restore_fpu_checking(tsk))) { +		__thread_fpu_end(tsk); +		force_sig(SIGSEGV, tsk); +		return; +	} -	__math_state_restore(); +	tsk->fpu_counter++;  }  EXPORT_SYMBOL_GPL(math_state_restore); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97e..899a03f2d18 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)  	if (cpu_khz) {  		*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; -		*offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); +		*offset = ns_now - mult_frac(tsc_now, *scale, +					     (1UL << CYC2NS_SCALE_FACTOR));  	}  	sched_clock_idle_wakeup_event(0); @@ -629,7 +630,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)  static unsigned long long cyc2ns_suspend; -void save_sched_clock_state(void) +void tsc_save_sched_clock_state(void)  {  	if (!sched_clock_stable)  		return; @@ -645,7 +646,7 @@ void save_sched_clock_state(void)   * that sched_clock() continues from the point where it was left off during   * suspend.   
 */
-void restore_sched_clock_state(void)
+void tsc_restore_sched_clock_state(void)
 {
 	unsigned long long offset;
 	unsigned long flags;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9eba29b46cb..fc25e60a588 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -42,7 +42,7 @@ static __cpuinitdata int nr_warps;
 /*
  * TSC-warp measurement loop running on both CPUs:
  */
-static __cpuinit void check_tsc_warp(void)
+static __cpuinit void check_tsc_warp(unsigned int timeout)
 {
 	cycles_t start, now, prev, end;
 	int i;
@@ -51,9 +51,9 @@ static __cpuinit void check_tsc_warp(void)
 	start = get_cycles();
 	rdtsc_barrier();
 	/*
-	 * The measurement runs for 20 msecs:
+	 * The measurement runs for 'timeout' msecs:
 	 */
-	end = start + tsc_khz * 20ULL;
+	end = start + (cycles_t) tsc_khz * timeout;
 	now = start;
 	for (i = 0; ; i++) {
@@ -99,6 +99,25 @@ static __cpuinit void check_tsc_warp(void)
 }
 /*
+ * If the target CPU coming online doesn't have any of its core-siblings
+ * online, a timeout of 20msec will be used for the TSC-warp measurement
+ * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
+ * information about this socket already (and this information grows as we
+ * have more and more logical-siblings in that socket).
+ *
+ * Ideally we should be able to skip the TSC sync check on the other
+ * core-siblings, if the first logical CPU in a socket passed the sync test.
+ * But as the TSC is per-logical CPU and can potentially be modified wrongly
+ * by the BIOS, a TSC sync test of shorter duration should still be able
+ * to catch such errors. It will also catch the case where not all cores
+ * in the socket are reset at the same time.
+ */
+static inline unsigned int loop_timeout(int cpu)
+{
+	return (cpumask_weight(cpu_core_mask(cpu)) > 1) ?
2 : 20; +} + +/*   * Source CPU calls into this - it waits for the freshly booted   * target CPU to arrive and then starts the measurement:   */ @@ -135,7 +154,7 @@ void __cpuinit check_tsc_sync_source(int cpu)  	 */  	atomic_inc(&start_count); -	check_tsc_warp(); +	check_tsc_warp(loop_timeout(cpu));  	while (atomic_read(&stop_count) != cpus-1)  		cpu_relax(); @@ -183,7 +202,7 @@ void __cpuinit check_tsc_sync_target(void)  	while (atomic_read(&start_count) != cpus)  		cpu_relax(); -	check_tsc_warp(); +	check_tsc_warp(loop_timeout(smp_processor_id()));  	/*  	 * Ok, we are done: diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index b466cab5ba1..328cb37bb82 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)  	spinlock_t *ptl;  	int i; +	down_write(&mm->mmap_sem);  	pgd = pgd_offset(mm, 0xA0000);  	if (pgd_none_or_clear_bad(pgd))  		goto out; @@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)  	}  	pte_unmap_unlock(pte, ptl);  out: +	up_write(&mm->mmap_sem);  	flush_tlb();  } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 947a06ccc67..e9f265fd79a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = {  };  struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { +	.early_percpu_clock_init	= x86_init_noop,  	.setup_percpu_clockev		= setup_secondary_APIC_clock,  	.fixup_cpu_id			= x86_default_fixup_cpu_id,  }; @@ -107,7 +108,9 @@ struct x86_platform_ops x86_platform = {  	.is_untracked_pat_range		= is_ISA_range,  	.nmi_init			= default_nmi_init,  	.get_nmi_reason			= default_get_nmi_reason, -	.i8042_detect			= default_i8042_detect +	.i8042_detect			= default_i8042_detect, +	.save_sched_clock_state 	= tsc_save_sched_clock_state, +	.restore_sched_clock_state 	= tsc_restore_sched_clock_state,  };  EXPORT_SYMBOL_GPL(x86_platform); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index a3911343976..e62728e30b0 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -6,6 +6,7 @@  #include <linux/bootmem.h>  #include <linux/compat.h>  #include <asm/i387.h> +#include <asm/fpu-internal.h>  #ifdef CONFIG_IA32_EMULATION  #include <asm/sigcontext32.h>  #endif @@ -47,7 +48,7 @@ void __sanitize_i387_state(struct task_struct *tsk)  	if (!fx)  		return; -	BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); +	BUG_ON(__thread_has_fpu(tsk));  	xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; @@ -168,7 +169,7 @@ int save_i387_xstate(void __user *buf)  	if (!used_math())  		return 0; -	if (task_thread_info(tsk)->status & TS_USEDFPU) { +	if (user_has_fpu()) {  		if (use_xsave())  			err = xsave_user(buf);  		else @@ -176,8 +177,7 @@ int save_i387_xstate(void __user *buf)  		if (err)  			return err; -		task_thread_info(tsk)->status &= ~TS_USEDFPU; -		stts(); +		user_fpu_end();  	} else {  		sanitize_i387_state(tsk);  		if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, @@ -292,10 +292,7 @@ int restore_i387_xstate(void __user *buf)  			return err;  	} -	if (!(task_thread_info(current)->status & TS_USEDFPU)) { -		clts(); -		task_thread_info(current)->status |= TS_USEDFPU; -	} +	user_fpu_begin();  	if (use_xsave())  		err = restore_user_xstate(buf);  	else  |
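The xsave.c hunk above replaces the open-coded TS_USEDFPU / clts() / stts() bookkeeping with the new FPU-ownership helpers. A minimal sketch of what a save path looks like with those helpers follows, using only names visible in the hunk plus two flagged assumptions: the function name example_save_user_fpu is illustrative, and fxsave_user() is assumed to be the pre-existing legacy-path helper called by the truncated else branch.

/* Write the current user FPU state to buf, dropping FPU ownership afterwards. */
static int example_save_user_fpu(void __user *buf)
{
	int err;

	/* No live register state: whatever is already saved in memory is current. */
	if (!user_has_fpu())
		return 0;

	if (use_xsave())
		err = xsave_user(buf);
	else
		err = fxsave_user(buf);		/* assumed legacy FXSAVE path */
	if (err)
		return err;

	/* Drop ownership; CR0.TS is set again so the next FPU use traps. */
	user_fpu_end();
	return 0;
}

The restore side above is the mirror image: user_fpu_begin() takes ownership (clearing CR0.TS) before restore_user_xstate() or the legacy copy-in runs.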