Diffstat (limited to 'arch/x86/kernel/cpu')
39 files changed, 4977 insertions, 2339 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4e242f9a06e..3efcb2b96a1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -1,5 +1,5 @@  # -# Makefile for x86-compatible CPU details and quirks +# Makefile for x86-compatible CPU details, features and quirks  #  # Don't trace early stages of a secondary CPU boot @@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o  obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o  obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o -obj-$(CONFIG_X86_MCE)	+= mcheck/ -obj-$(CONFIG_MTRR)	+= mtrr/ -obj-$(CONFIG_CPU_FREQ)	+= cpufreq/ +obj-$(CONFIG_PERF_COUNTERS)		+= perf_counter.o -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_MCE)			+= mcheck/ +obj-$(CONFIG_MTRR)			+= mtrr/ +obj-$(CONFIG_CPU_FREQ)			+= cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o  quiet_cmd_mkcapflags = MKCAP   $@        cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459daa6..e5b27d8f1b4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -6,6 +6,7 @@  #include <asm/processor.h>  #include <asm/apic.h>  #include <asm/cpu.h> +#include <asm/pci-direct.h>  #ifdef CONFIG_X86_64  # include <asm/numa_64.h> @@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)  #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)  	int cpu = smp_processor_id();  	int node; -	unsigned apicid = hard_smp_processor_id(); +	unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;  	node = c->phys_proc_id;  	if (apicid_to_node[apicid] != NUMA_NO_NODE) @@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)  		    (c->x86_model == 8 && c->x86_mask >= 8))  			set_cpu_cap(c, X86_FEATURE_K6_MTRR);  #endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) +	/* check CPU config space for extended APIC ID */ +	if (c->x86 >= 0xf) { +		unsigned int val; +		val = read_pci_config(0, 24, 0, 0x68); +		if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) +			set_cpu_cap(c, X86_FEATURE_EXTD_APICID); +	} +#endif  }  static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 77848d9fca6..6b26d4deada 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@  #include <linux/io.h>  #include <asm/stackprotector.h> +#include <asm/perf_counter.h>  #include <asm/mmu_context.h>  #include <asm/hypervisor.h>  #include <asm/processor.h> @@ -107,7 +108,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {  	/* data */  	[GDT_ENTRY_APMBIOS_BASE+2]	= { { { 0x0000ffff, 0x00409200 } } }, -	[GDT_ENTRY_ESPFIX_SS]		= { { { 0x00000000, 0x00c09200 } } }, +	[GDT_ENTRY_ESPFIX_SS]		= { { { 0x0000ffff, 0x00cf9200 } } },  	[GDT_ENTRY_PERCPU]		= { { { 0x0000ffff, 0x00cf9200 } } },  	GDT_STACK_CANARY_INIT  #endif @@ -299,7 +300,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)  	return NULL;		/* Not found */  } -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;  void load_percpu_segment(int cpu)  { @@ -485,7 +487,6 @@ out:  static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)  {  	char *v = c->x86_vendor_id; -	static int printed;  	int i;  	for (i = 0; i < X86_VENDOR_NUM; i++) { @@ -502,13 +503,9 @@ static void __cpuinit 
get_cpu_vendor(struct cpuinfo_x86 *c)  		}  	} -	if (!printed) { -		printed++; -		printk(KERN_ERR -		    "CPU: vendor_id '%s' unknown, using generic init.\n", v); - -		printk(KERN_ERR "CPU: Your system may be unstable.\n"); -	} +	printk_once(KERN_ERR +			"CPU: vendor_id '%s' unknown, using generic init.\n" \ +			"CPU: Your system may be unstable.\n", v);  	c->x86_vendor = X86_VENDOR_UNKNOWN;  	this_cpu = &default_cpu; @@ -768,6 +765,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)  	if (this_cpu->c_identify)  		this_cpu->c_identify(c); +	/* Clear/Set all flags overriden by options, after probe */ +	for (i = 0; i < NCAPINTS; i++) { +		c->x86_capability[i] &= ~cpu_caps_cleared[i]; +		c->x86_capability[i] |= cpu_caps_set[i]; +	} +  #ifdef CONFIG_X86_64  	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);  #endif @@ -813,6 +816,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)  #endif  	init_hypervisor(c); + +	/* +	 * Clear/Set all flags overriden by options, need do it +	 * before following smp all cpus cap AND. +	 */ +	for (i = 0; i < NCAPINTS; i++) { +		c->x86_capability[i] &= ~cpu_caps_cleared[i]; +		c->x86_capability[i] |= cpu_caps_set[i]; +	} +  	/*  	 * On SMP, boot_cpu_data holds the common feature set between  	 * all CPUs; so make sure that we indicate which features are @@ -825,10 +838,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)  			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];  	} -	/* Clear all flags overriden by options */ -	for (i = 0; i < NCAPINTS; i++) -		c->x86_capability[i] &= ~cleared_cpu_caps[i]; -  #ifdef CONFIG_X86_MCE  	/* Init Machine Check Exception if available. */  	mcheck_init(c); @@ -839,6 +848,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)  #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)  	numa_add_cpu(smp_processor_id());  #endif + +	/* Cap the iomem address space to what is addressable on all CPUs */ +	iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1;  }  #ifdef CONFIG_X86_64 @@ -861,6 +873,7 @@ void __init identify_boot_cpu(void)  #else  	vgetcpu_set_mode();  #endif +	init_hw_perf_counters();  }  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab96c6..6b2a52dd040 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -32,9 +32,7 @@  static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);  static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); -static DEFINE_PER_CPU(unsigned, cpu_modelflag);  static DEFINE_PER_CPU(int, cpu_priv_count); -static DEFINE_PER_CPU(unsigned, cpu_model);  static DEFINE_MUTEX(cpu_debug_lock); @@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {  	{ "value",	CPU_REG_ALL,	1	},  }; -/* Intel Registers Range */ -static struct cpu_debug_range cpu_intel_range[] = { -	{ 0x00000000, 0x00000001, CPU_MC,	CPU_INTEL_ALL		}, -	{ 0x00000006, 0x00000007, CPU_MONITOR,	CPU_CX_AT_XE		}, -	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_INTEL_ALL		}, -	{ 0x00000011, 0x00000013, CPU_PMC,	CPU_INTEL_PENTIUM	}, -	{ 0x00000017, 0x00000017, CPU_PLATFORM,	CPU_PX_CX_AT_XE		}, -	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_P6_CX_AT_XE		}, +/* CPU Registers Range */ +static struct cpu_debug_range cpu_reg_range[] = { +	{ 0x00000000, 0x00000001, CPU_MC,	}, +	{ 0x00000006, 0x00000007, CPU_MONITOR,	}, +	{ 0x00000010, 0x00000010, CPU_TIME,	}, +	{ 0x00000011, 0x00000013, CPU_PMC,	}, +	{ 0x00000017, 0x00000017, CPU_PLATFORM,	}, +	
{ 0x0000001B, 0x0000001B, CPU_APIC,	}, +	{ 0x0000002A, 0x0000002B, CPU_POWERON,	}, +	{ 0x0000002C, 0x0000002C, CPU_FREQ,	}, +	{ 0x0000003A, 0x0000003A, CPU_CONTROL,	}, +	{ 0x00000040, 0x00000047, CPU_LBRANCH,	}, +	{ 0x00000060, 0x00000067, CPU_LBRANCH,	}, +	{ 0x00000079, 0x00000079, CPU_BIOS,	}, +	{ 0x00000088, 0x0000008A, CPU_CACHE,	}, +	{ 0x0000008B, 0x0000008B, CPU_BIOS,	}, +	{ 0x0000009B, 0x0000009B, CPU_MONITOR,	}, +	{ 0x000000C1, 0x000000C4, CPU_PMC,	}, +	{ 0x000000CD, 0x000000CD, CPU_FREQ,	}, +	{ 0x000000E7, 0x000000E8, CPU_PERF,	}, +	{ 0x000000FE, 0x000000FE, CPU_MTRR,	}, -	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_PX_CX_AT_XE		}, -	{ 0x0000002B, 0x0000002B, CPU_POWERON,	CPU_INTEL_XEON		}, -	{ 0x0000002C, 0x0000002C, CPU_FREQ,	CPU_INTEL_XEON		}, -	{ 0x0000003A, 0x0000003A, CPU_CONTROL,	CPU_CX_AT_XE		}, +	{ 0x00000116, 0x0000011E, CPU_CACHE,	}, +	{ 0x00000174, 0x00000176, CPU_SYSENTER,	}, +	{ 0x00000179, 0x0000017B, CPU_MC,	}, +	{ 0x00000186, 0x00000189, CPU_PMC,	}, +	{ 0x00000198, 0x00000199, CPU_PERF,	}, +	{ 0x0000019A, 0x0000019A, CPU_TIME,	}, +	{ 0x0000019B, 0x0000019D, CPU_THERM,	}, +	{ 0x000001A0, 0x000001A0, CPU_MISC,	}, +	{ 0x000001C9, 0x000001C9, CPU_LBRANCH,	}, +	{ 0x000001D7, 0x000001D8, CPU_LBRANCH,	}, +	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	}, +	{ 0x000001DA, 0x000001E0, CPU_LBRANCH,	}, -	{ 0x00000040, 0x00000043, CPU_LBRANCH,	CPU_PM_CX_AT_XE		}, -	{ 0x00000044, 0x00000047, CPU_LBRANCH,	CPU_PM_CO_AT		}, -	{ 0x00000060, 0x00000063, CPU_LBRANCH,	CPU_C2_AT		}, -	{ 0x00000064, 0x00000067, CPU_LBRANCH,	CPU_INTEL_ATOM		}, +	{ 0x00000200, 0x0000020F, CPU_MTRR,	}, +	{ 0x00000250, 0x00000250, CPU_MTRR,	}, +	{ 0x00000258, 0x00000259, CPU_MTRR,	}, +	{ 0x00000268, 0x0000026F, CPU_MTRR,	}, +	{ 0x00000277, 0x00000277, CPU_PAT,	}, +	{ 0x000002FF, 0x000002FF, CPU_MTRR,	}, -	{ 0x00000079, 0x00000079, CPU_BIOS,	CPU_P6_CX_AT_XE		}, -	{ 0x00000088, 0x0000008A, CPU_CACHE,	CPU_INTEL_P6		}, -	{ 0x0000008B, 0x0000008B, CPU_BIOS,	CPU_P6_CX_AT_XE		}, -	{ 0x0000009B, 0x0000009B, CPU_MONITOR,	CPU_INTEL_XEON		}, +	{ 0x00000300, 0x00000311, CPU_PMC,	}, +	{ 0x00000345, 0x00000345, CPU_PMC,	}, +	{ 0x00000360, 0x00000371, CPU_PMC,	}, +	{ 0x0000038D, 0x00000390, CPU_PMC,	}, +	{ 0x000003A0, 0x000003BE, CPU_PMC,	}, +	{ 0x000003C0, 0x000003CD, CPU_PMC,	}, +	{ 0x000003E0, 0x000003E1, CPU_PMC,	}, +	{ 0x000003F0, 0x000003F2, CPU_PMC,	}, -	{ 0x000000C1, 0x000000C2, CPU_PMC,	CPU_P6_CX_AT		}, -	{ 0x000000CD, 0x000000CD, CPU_FREQ,	CPU_CX_AT		}, -	{ 0x000000E7, 0x000000E8, CPU_PERF,	CPU_CX_AT		}, -	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_P6_CX_XE		}, +	{ 0x00000400, 0x00000417, CPU_MC,	}, +	{ 0x00000480, 0x0000048B, CPU_VMX,	}, -	{ 0x00000116, 0x00000116, CPU_CACHE,	CPU_INTEL_P6		}, -	{ 0x00000118, 0x00000118, CPU_CACHE,	CPU_INTEL_P6		}, -	{ 0x00000119, 0x00000119, CPU_CACHE,	CPU_INTEL_PX		}, -	{ 0x0000011A, 0x0000011B, CPU_CACHE,	CPU_INTEL_P6		}, -	{ 0x0000011E, 0x0000011E, CPU_CACHE,	CPU_PX_CX_AT		}, +	{ 0x00000600, 0x00000600, CPU_DEBUG,	}, +	{ 0x00000680, 0x0000068F, CPU_LBRANCH,	}, +	{ 0x000006C0, 0x000006CF, CPU_LBRANCH,	}, -	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_P6_CX_AT_XE		}, -	{ 0x00000179, 0x0000017A, CPU_MC,	CPU_PX_CX_AT_XE		}, -	{ 0x0000017B, 0x0000017B, CPU_MC,	CPU_P6_XE		}, -	{ 0x00000186, 0x00000187, CPU_PMC,	CPU_P6_CX_AT		}, -	{ 0x00000198, 0x00000199, CPU_PERF,	CPU_PM_CX_AT_XE		}, -	{ 0x0000019A, 0x0000019A, CPU_TIME,	CPU_PM_CX_AT_XE		}, -	{ 0x0000019B, 0x0000019D, CPU_THERM,	CPU_PM_CX_AT_XE		}, -	{ 0x000001A0, 0x000001A0, CPU_MISC,	CPU_PM_CX_AT_XE		}, +	{ 0x000107CC, 0x000107D3, CPU_PMC,	
}, -	{ 0x000001C9, 0x000001C9, CPU_LBRANCH,	CPU_PM_CX_AT		}, -	{ 0x000001D7, 0x000001D8, CPU_LBRANCH,	CPU_INTEL_XEON		}, -	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_CX_AT_XE		}, -	{ 0x000001DA, 0x000001DA, CPU_LBRANCH,	CPU_INTEL_XEON		}, -	{ 0x000001DB, 0x000001DB, CPU_LBRANCH,	CPU_P6_XE		}, -	{ 0x000001DC, 0x000001DC, CPU_LBRANCH,	CPU_INTEL_P6		}, -	{ 0x000001DD, 0x000001DE, CPU_LBRANCH,	CPU_PX_CX_AT_XE		}, -	{ 0x000001E0, 0x000001E0, CPU_LBRANCH,	CPU_INTEL_P6		}, +	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	}, +	{ 0xC0000081, 0xC0000084, CPU_CALL,	}, +	{ 0xC0000100, 0xC0000102, CPU_BASE,	}, +	{ 0xC0000103, 0xC0000103, CPU_TIME,	}, -	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_P6_CX_XE		}, -	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_P6_CX_XE		}, -	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_P6_CX_XE		}, -	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_P6_CX_XE		}, -	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_C2_AT_XE		}, -	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_P6_CX_XE		}, - -	{ 0x00000300, 0x00000308, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x00000309, 0x0000030B, CPU_PMC,	CPU_C2_AT_XE		}, -	{ 0x0000030C, 0x00000311, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x00000345, 0x00000345, CPU_PMC,	CPU_C2_AT		}, -	{ 0x00000360, 0x00000371, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x0000038D, 0x00000390, CPU_PMC,	CPU_C2_AT		}, -	{ 0x000003A0, 0x000003BE, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x000003C0, 0x000003CD, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x000003E0, 0x000003E1, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x000003F0, 0x000003F0, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x000003F1, 0x000003F1, CPU_PMC,	CPU_C2_AT_XE		}, -	{ 0x000003F2, 0x000003F2, CPU_PMC,	CPU_INTEL_XEON		}, - -	{ 0x00000400, 0x00000402, CPU_MC,	CPU_PM_CX_AT_XE		}, -	{ 0x00000403, 0x00000403, CPU_MC,	CPU_INTEL_XEON		}, -	{ 0x00000404, 0x00000406, CPU_MC,	CPU_PM_CX_AT_XE		}, -	{ 0x00000407, 0x00000407, CPU_MC,	CPU_INTEL_XEON		}, -	{ 0x00000408, 0x0000040A, CPU_MC,	CPU_PM_CX_AT_XE		}, -	{ 0x0000040B, 0x0000040B, CPU_MC,	CPU_INTEL_XEON		}, -	{ 0x0000040C, 0x0000040E, CPU_MC,	CPU_PM_CX_XE		}, -	{ 0x0000040F, 0x0000040F, CPU_MC,	CPU_INTEL_XEON		}, -	{ 0x00000410, 0x00000412, CPU_MC,	CPU_PM_CX_AT_XE		}, -	{ 0x00000413, 0x00000417, CPU_MC,	CPU_CX_AT_XE		}, -	{ 0x00000480, 0x0000048B, CPU_VMX,	CPU_CX_AT_XE		}, - -	{ 0x00000600, 0x00000600, CPU_DEBUG,	CPU_PM_CX_AT_XE		}, -	{ 0x00000680, 0x0000068F, CPU_LBRANCH,	CPU_INTEL_XEON		}, -	{ 0x000006C0, 0x000006CF, CPU_LBRANCH,	CPU_INTEL_XEON		}, - -	{ 0x000107CC, 0x000107D3, CPU_PMC,	CPU_INTEL_XEON_MP	}, - -	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_INTEL_XEON		}, -	{ 0xC0000081, 0xC0000082, CPU_CALL,	CPU_INTEL_XEON		}, -	{ 0xC0000084, 0xC0000084, CPU_CALL,	CPU_INTEL_XEON		}, -	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_INTEL_XEON		}, +	{ 0xC0010000, 0xC0010007, CPU_PMC,	}, +	{ 0xC0010010, 0xC0010010, CPU_CONF,	}, +	{ 0xC0010015, 0xC0010015, CPU_CONF,	}, +	{ 0xC0010016, 0xC001001A, CPU_MTRR,	}, +	{ 0xC001001D, 0xC001001D, CPU_MTRR,	}, +	{ 0xC001001F, 0xC001001F, CPU_CONF,	}, +	{ 0xC0010030, 0xC0010035, CPU_BIOS,	}, +	{ 0xC0010044, 0xC0010048, CPU_MC,	}, +	{ 0xC0010050, 0xC0010056, CPU_SMM,	}, +	{ 0xC0010058, 0xC0010058, CPU_CONF,	}, +	{ 0xC0010060, 0xC0010060, CPU_CACHE,	}, +	{ 0xC0010061, 0xC0010068, CPU_SMM,	}, +	{ 0xC0010069, 0xC001006B, CPU_SMM,	}, +	{ 0xC0010070, 0xC0010071, CPU_SMM,	}, +	{ 0xC0010111, 0xC0010113, CPU_SMM,	}, +	{ 0xC0010114, 0xC0010118, CPU_SVM,	}, +	{ 0xC0010140, 0xC0010141, CPU_OSVM,	}, +	{ 0xC0011022, 0xC0011023, CPU_CONF,	},  }; -/* AMD Registers Range */ -static struct cpu_debug_range cpu_amd_range[] = { -	{ 0x00000000, 0x00000001, CPU_MC,	
CPU_K10_PLUS,		}, -	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_K8_PLUS,		}, -	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_K8_PLUS,		}, -	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_K7_PLUS		}, -	{ 0x0000008B, 0x0000008B, CPU_VER,	CPU_K8_PLUS		}, -	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_K8_PLUS,		}, - -	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_K8_PLUS,		}, -	{ 0x00000179, 0x0000017B, CPU_MC,	CPU_K8_PLUS,		}, -	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_K8_PLUS,		}, -	{ 0x000001DB, 0x000001DE, CPU_LBRANCH,	CPU_K8_PLUS,		}, - -	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_K8_PLUS,		}, -	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_K8_PLUS,		}, - -	{ 0x00000400, 0x00000413, CPU_MC,	CPU_K8_PLUS,		}, - -	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_AMD_ALL,		}, -	{ 0xC0000081, 0xC0000084, CPU_CALL,	CPU_K8_PLUS,		}, -	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_K8_PLUS,		}, -	{ 0xC0000103, 0xC0000103, CPU_TIME,	CPU_K10_PLUS,		}, - -	{ 0xC0010000, 0xC0010007, CPU_PMC,	CPU_K8_PLUS,		}, -	{ 0xC0010010, 0xC0010010, CPU_CONF,	CPU_K7_PLUS,		}, -	{ 0xC0010015, 0xC0010015, CPU_CONF,	CPU_K7_PLUS,		}, -	{ 0xC0010016, 0xC001001A, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0xC001001D, 0xC001001D, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0xC001001F, 0xC001001F, CPU_CONF,	CPU_K8_PLUS,		}, -	{ 0xC0010030, 0xC0010035, CPU_BIOS,	CPU_K8_PLUS,		}, -	{ 0xC0010044, 0xC0010048, CPU_MC,	CPU_K8_PLUS,		}, -	{ 0xC0010050, 0xC0010056, CPU_SMM,	CPU_K0F_PLUS,		}, -	{ 0xC0010058, 0xC0010058, CPU_CONF,	CPU_K10_PLUS,		}, -	{ 0xC0010060, 0xC0010060, CPU_CACHE,	CPU_AMD_11,		}, -	{ 0xC0010061, 0xC0010068, CPU_SMM,	CPU_K10_PLUS,		}, -	{ 0xC0010069, 0xC001006B, CPU_SMM,	CPU_AMD_11,		}, -	{ 0xC0010070, 0xC0010071, CPU_SMM,	CPU_K10_PLUS,		}, -	{ 0xC0010111, 0xC0010113, CPU_SMM,	CPU_K8_PLUS,		}, -	{ 0xC0010114, 0xC0010118, CPU_SVM,	CPU_K10_PLUS,		}, -	{ 0xC0010140, 0xC0010141, CPU_OSVM,	CPU_K10_PLUS,		}, -	{ 0xC0011022, 0xC0011023, CPU_CONF,	CPU_K10_PLUS,		}, -}; - - -/* Intel */ -static int get_intel_modelflag(unsigned model) -{ -	int flag; - -	switch (model) { -	case 0x0501: -	case 0x0502: -	case 0x0504: -		flag = CPU_INTEL_PENTIUM; -		break; -	case 0x0601: -	case 0x0603: -	case 0x0605: -	case 0x0607: -	case 0x0608: -	case 0x060A: -	case 0x060B: -		flag = CPU_INTEL_P6; -		break; -	case 0x0609: -	case 0x060D: -		flag = CPU_INTEL_PENTIUM_M; -		break; -	case 0x060E: -		flag = CPU_INTEL_CORE; -		break; -	case 0x060F: -	case 0x0617: -		flag = CPU_INTEL_CORE2; -		break; -	case 0x061C: -		flag = CPU_INTEL_ATOM; -		break; -	case 0x0F00: -	case 0x0F01: -	case 0x0F02: -	case 0x0F03: -	case 0x0F04: -		flag = CPU_INTEL_XEON_P4; -		break; -	case 0x0F06: -		flag = CPU_INTEL_XEON_MP; -		break; -	default: -		flag = CPU_NONE; -		break; -	} - -	return flag; -} - -/* AMD */ -static int get_amd_modelflag(unsigned model) -{ -	int flag; - -	switch (model >> 8) { -	case 0x6: -		flag = CPU_AMD_K6; -		break; -	case 0x7: -		flag = CPU_AMD_K7; -		break; -	case 0x8: -		flag = CPU_AMD_K8; -		break; -	case 0xf: -		flag = CPU_AMD_0F; -		break; -	case 0x10: -		flag = CPU_AMD_10; -		break; -	case 0x11: -		flag = CPU_AMD_11; -		break; -	default: -		flag = CPU_NONE; -		break; -	} - -	return flag; -} - -static int get_cpu_modelflag(unsigned cpu) -{ -	int flag; - -	flag = per_cpu(cpu_model, cpu); - -	switch (flag >> 16) { -	case X86_VENDOR_INTEL: -		flag = get_intel_modelflag(flag); -		break; -	
case X86_VENDOR_AMD: -		flag = get_amd_modelflag(flag & 0xffff); -		break; -	default: -		flag = CPU_NONE; -		break; -	} - -	return flag; -} - -static int get_cpu_range_count(unsigned cpu) -{ -	int index; - -	switch (per_cpu(cpu_model, cpu) >> 16) { -	case X86_VENDOR_INTEL: -		index = ARRAY_SIZE(cpu_intel_range); -		break; -	case X86_VENDOR_AMD: -		index = ARRAY_SIZE(cpu_amd_range); -		break; -	default: -		index = 0; -		break; -	} - -	return index; -} -  static int is_typeflag_valid(unsigned cpu, unsigned flag)  { -	unsigned vendor, modelflag; -	int i, index; +	int i;  	/* Standard Registers should be always valid */  	if (flag >= CPU_TSS)  		return 1; -	modelflag = per_cpu(cpu_modelflag, cpu); -	vendor = per_cpu(cpu_model, cpu) >> 16; -	index = get_cpu_range_count(cpu); - -	for (i = 0; i < index; i++) { -		switch (vendor) { -		case X86_VENDOR_INTEL: -			if ((cpu_intel_range[i].model & modelflag) && -			    (cpu_intel_range[i].flag & flag)) -				return 1; -			break; -		case X86_VENDOR_AMD: -			if ((cpu_amd_range[i].model & modelflag) && -			    (cpu_amd_range[i].flag & flag)) -				return 1; -			break; -		} +	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { +		if (cpu_reg_range[i].flag == flag) +			return 1;  	}  	/* Invalid */ @@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)  static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,  			      int index, unsigned flag)  { -	unsigned modelflag; - -	modelflag = per_cpu(cpu_modelflag, cpu); -	*max = 0; -	switch (per_cpu(cpu_model, cpu) >> 16) { -	case X86_VENDOR_INTEL: -		if ((cpu_intel_range[index].model & modelflag) && -		    (cpu_intel_range[index].flag & flag)) { -			*min = cpu_intel_range[index].min; -			*max = cpu_intel_range[index].max; -		} -		break; -	case X86_VENDOR_AMD: -		if ((cpu_amd_range[index].model & modelflag) && -		    (cpu_amd_range[index].flag & flag)) { -			*min = cpu_amd_range[index].min; -			*max = cpu_amd_range[index].max; -		} -		break; -	} +	if (cpu_reg_range[index].flag == flag) { +		*min = cpu_reg_range[index].min; +		*max = cpu_reg_range[index].max; +	} else +		*max = 0;  	return *max;  } @@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)  	unsigned msr, msr_min, msr_max;  	struct cpu_private *priv;  	u32 low, high; -	int i, range; +	int i;  	if (seq) {  		priv = seq->private; @@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)  		}  	} -	range = get_cpu_range_count(cpu); - -	for (i = 0; i < range; i++) { +	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {  		if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))  			continue; @@ -588,8 +369,20 @@ static void print_apic(void *arg)  	seq_printf(seq, " TMICT\t\t: %08x\n",  apic_read(APIC_TMICT));  	seq_printf(seq, " TMCCT\t\t: %08x\n",  apic_read(APIC_TMCCT));  	seq_printf(seq, " TDCR\t\t: %08x\n",  apic_read(APIC_TDCR)); -#endif /* CONFIG_X86_LOCAL_APIC */ +	if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { +		unsigned int i, v, maxeilvt; +		v = apic_read(APIC_EFEAT); +		maxeilvt = (v >> 16) & 0xff; +		seq_printf(seq, " EFEAT\t\t: %08x\n", v); +		seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); + +		for (i = 0; i < maxeilvt; i++) { +			v = apic_read(APIC_EILVTn(i)); +			seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); +		} +	} +#endif /* CONFIG_X86_LOCAL_APIC */  	seq_printf(seq, "\n MSR\t:\n");  } @@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)  {  	struct dentry *cpu_dentry = NULL;  	
unsigned reg, reg_min, reg_max; -	int i, range, err = 0; +	int i, err = 0;  	char reg_dir[12];  	u32 low, high; -	range = get_cpu_range_count(cpu); - -	for (i = 0; i < range; i++) { +	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {  		if (!get_cpu_range(cpu, &reg_min, &reg_max, i,  				   cpu_base[type].flag))  			continue; @@ -850,10 +641,6 @@ static int cpu_init_cpu(void)  		cpui = &cpu_data(cpu);  		if (!cpu_has(cpui, X86_FEATURE_MSR))  			continue; -		per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | -					   (cpui->x86 << 8) | -					   (cpui->x86_model)); -		per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);  		sprintf(cpu_dir, "cpu%d", cpu);  		cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 52c83987547..f138c6c389b 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -220,11 +220,14 @@ config X86_LONGHAUL  	  If in doubt, say N.  config X86_E_POWERSAVER -	tristate "VIA C7 Enhanced PowerSaver" +	tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"  	select CPU_FREQ_TABLE -	depends on X86_32 +	depends on X86_32 && EXPERIMENTAL  	help -	  This adds the CPUFreq driver for VIA C7 processors. +	  This adds the CPUFreq driver for VIA C7 processors.  However, this driver +	  does not have any safeguards to prevent operating the CPU out of spec +	  and is thus considered dangerous.  Please use the regular ACPI cpufreq +	  driver, enabled by CONFIG_X86_ACPI_CPUFREQ.  	  If in doubt, say N. diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 208ecf6643d..ae9b503220c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)  {  	struct cpuinfo_x86 *cpu = &cpu_data(cpuid); -	if (cpu->x86_vendor != X86_VENDOR_INTEL || -	    !cpu_has(cpu, X86_FEATURE_EST)) -		return 0; - -	return 1; +	return cpu_has(cpu, X86_FEATURE_EST);  }  static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) @@ -550,7 +546,7 @@ static int __init acpi_cpufreq_early_init(void)  		return -ENOMEM;  	}  	for_each_possible_cpu(i) { -		if (!alloc_cpumask_var_node( +		if (!zalloc_cpumask_var_node(  			&per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,  			GFP_KERNEL, cpu_to_node(i))) { @@ -693,8 +689,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)  	if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&  	    policy->cpuinfo.transition_latency > 20 * 1000) {  		policy->cpuinfo.transition_latency = 20 * 1000; -			printk_once(KERN_INFO "Capping off P-state tranision" -				    " latency at 20 uS\n"); +		printk_once(KERN_INFO +			    "P-state transition latency capped at 20 uS\n");  	}  	/* table init */ diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index a8363e5be4e..d47c775eb0a 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -322,7 +322,7 @@ static int powernow_acpi_init(void)  		goto err0;  	} -	if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, +	if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,  								GFP_KERNEL)) {  		retval = -ENOMEM;  		goto err05; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index f6b32d11235..81cbe64ed6b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ 
b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1,3 +1,4 @@ +  /*   *   (c) 2003-2006 Advanced Micro Devices, Inc.   *  Your use of this code is subject to the terms and conditions of the @@ -117,20 +118,17 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data)  	u32 i = 0;  	if (cpu_family == CPU_HW_PSTATE) { -		if (data->currpstate == HW_PSTATE_INVALID) { -			/* read (initial) hw pstate if not yet set */ -			rdmsr(MSR_PSTATE_STATUS, lo, hi); -			i = lo & HW_PSTATE_MASK; +		rdmsr(MSR_PSTATE_STATUS, lo, hi); +		i = lo & HW_PSTATE_MASK; +		data->currpstate = i; + +		/* +		 * a workaround for family 11h erratum 311 might cause +		 * an "out-of-range Pstate if the core is in Pstate-0 +		 */ +		if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) +			data->currpstate = HW_PSTATE_0; -			/* -			 * a workaround for family 11h erratum 311 might cause -			 * an "out-of-range Pstate if the core is in Pstate-0 -			 */ -			if (i >= data->numps) -				data->currpstate = HW_PSTATE_0; -			else -				data->currpstate = i; -		}  		return 0;  	}  	do { @@ -510,41 +508,34 @@ static int core_voltage_post_transition(struct powernow_k8_data *data,  	return 0;  } -static int check_supported_cpu(unsigned int cpu) +static void check_supported_cpu(void *_rc)  { -	cpumask_t oldmask;  	u32 eax, ebx, ecx, edx; -	unsigned int rc = 0; - -	oldmask = current->cpus_allowed; -	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); +	int *rc = _rc; -	if (smp_processor_id() != cpu) { -		printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); -		goto out; -	} +	*rc = -ENODEV;  	if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) -		goto out; +		return;  	eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);  	if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&  	    ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) -		goto out; +		return;  	if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {  		if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||  		    ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {  			printk(KERN_INFO PFX  				"Processor cpuid %x not supported\n", eax); -			goto out; +			return;  		}  		eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);  		if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {  			printk(KERN_INFO PFX  			       "No frequency change capabilities detected\n"); -			goto out; +			return;  		}  		cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); @@ -552,21 +543,17 @@ static int check_supported_cpu(unsigned int cpu)  			!= P_STATE_TRANSITION_CAPABLE) {  			printk(KERN_INFO PFX  				"Power state transitions not supported\n"); -			goto out; +			return;  		}  	} else { /* must be a HW Pstate capable processor */  		cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);  		if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)  			cpu_family = CPU_HW_PSTATE;  		else -			goto out; +			return;  	} -	rc = 1; - -out: -	set_cpus_allowed_ptr(current, &oldmask); -	return rc; +	*rc = 0;  }  static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, @@ -823,19 +810,20 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,  	if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))  		return; -	control = data->acpi_data.states[index].control; data->irt = (control -			>> IRT_SHIFT) & IRT_MASK; data->rvo = (control >> -				RVO_SHIFT) & RVO_MASK; data->exttype = (control -					>> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; -	data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1 -		<< ((control >> MVS_SHIFT) & MVS_MASK); data->vstable = -		(control >> VST_SHIFT) & VST_MASK; } +	
control = data->acpi_data.states[index].control; +	data->irt = (control >> IRT_SHIFT) & IRT_MASK; +	data->rvo = (control >> RVO_SHIFT) & RVO_MASK; +	data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; +	data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; +	data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); +	data->vstable = (control >> VST_SHIFT) & VST_MASK; +}  static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)  {  	struct cpufreq_frequency_table *powernow_table;  	int ret_val = -ENODEV; -	acpi_integer space_id; +	acpi_integer control, status;  	if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {  		dprintk("register performance failed: bad ACPI data\n"); @@ -848,12 +836,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)  		goto err_out;  	} -	space_id = data->acpi_data.control_register.space_id; -	if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || -		(space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { +	control = data->acpi_data.control_register.space_id; +	status = data->acpi_data.status_register.space_id; + +	if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || +	    (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {  		dprintk("Invalid control/status registers (%x - %x)\n", -			data->acpi_data.control_register.space_id, -			space_id); +			control, status);  		goto err_out;  	} @@ -886,7 +875,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)  	/* notify BIOS that we exist */  	acpi_processor_notify_smm(THIS_MODULE); -	if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { +	if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {  		printk(KERN_ERR PFX  				"unable to alloc powernow_k8_data cpumask\n");  		ret_val = -ENOMEM; @@ -1045,6 +1034,19 @@ static int get_transition_latency(struct powernow_k8_data *data)  		if (cur_latency > max_latency)  			max_latency = cur_latency;  	} +	if (max_latency == 0) { +		/* +		 * Fam 11h always returns 0 as transition latency. +		 * This is intended and means "very fast". While cpufreq core +		 * and governors currently can handle that gracefully, better +		 * set it to 1 to avoid problems in the future. +		 * For all others it's a BIOS bug. 
+		 */ +		if (!boot_cpu_data.x86 == 0x11) +			printk(KERN_ERR FW_WARN PFX "Invalid zero transition " +				"latency\n"); +		max_latency = 1; +	}  	/* value in usecs, needs to be in nanoseconds */  	return 1000 * max_latency;  } @@ -1092,7 +1094,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,  	freqs.old = find_khz_freq_from_fid(data->currfid);  	freqs.new = find_khz_freq_from_fid(fid); -	for_each_cpu_mask_nr(i, *(data->available_cores)) { +	for_each_cpu(i, data->available_cores) {  		freqs.cpu = i;  		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);  	} @@ -1100,7 +1102,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,  	res = transition_fid_vid(data, fid, vid);  	freqs.new = find_khz_freq_from_fid(data->currfid); -	for_each_cpu_mask_nr(i, *(data->available_cores)) { +	for_each_cpu(i, data->available_cores) {  		freqs.cpu = i;  		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);  	} @@ -1125,7 +1127,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,  			data->currpstate);  	freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); -	for_each_cpu_mask_nr(i, *(data->available_cores)) { +	for_each_cpu(i, data->available_cores) {  		freqs.cpu = i;  		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);  	} @@ -1133,7 +1135,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,  	res = transition_pstate(data, pstate);  	freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); -	for_each_cpu_mask_nr(i, *(data->available_cores)) { +	for_each_cpu(i, data->available_cores) {  		freqs.cpu = i;  		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);  	} @@ -1234,21 +1236,47 @@ static int powernowk8_verify(struct cpufreq_policy *pol)  	return cpufreq_frequency_table_verify(pol, data->powernow_table);  } -static const char ACPI_PSS_BIOS_BUG_MSG[] = -	KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" -	KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; +struct init_on_cpu { +	struct powernow_k8_data *data; +	int rc; +}; + +static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) +{ +	struct init_on_cpu *init_on_cpu = _init_on_cpu; + +	if (pending_bit_stuck()) { +		printk(KERN_ERR PFX "failing init, change pending bit set\n"); +		init_on_cpu->rc = -ENODEV; +		return; +	} + +	if (query_current_values_with_pending_wait(init_on_cpu->data)) { +		init_on_cpu->rc = -ENODEV; +		return; +	} + +	if (cpu_family == CPU_OPTERON) +		fidvid_msr_init(); + +	init_on_cpu->rc = 0; +}  /* per CPU init entry point to the driver */  static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)  { +	static const char ACPI_PSS_BIOS_BUG_MSG[] = +		KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" +		KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n";  	struct powernow_k8_data *data; -	cpumask_t oldmask; +	struct init_on_cpu init_on_cpu;  	int rc;  	if (!cpu_online(pol->cpu))  		return -ENODEV; -	if (!check_supported_cpu(pol->cpu)) +	smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); +	if (rc)  		return -ENODEV;  	data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); @@ -1288,27 +1316,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)  		pol->cpuinfo.transition_latency = get_transition_latency(data);  	/* only run on specific CPU from here on */ -	oldmask = current->cpus_allowed; -	set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); - -	if (smp_processor_id() != pol->cpu) { -		printk(KERN_ERR PFX 
"limiting to cpu %u failed\n", pol->cpu); -		goto err_out_unmask; -	} - -	if (pending_bit_stuck()) { -		printk(KERN_ERR PFX "failing init, change pending bit set\n"); -		goto err_out_unmask; -	} - -	if (query_current_values_with_pending_wait(data)) -		goto err_out_unmask; - -	if (cpu_family == CPU_OPTERON) -		fidvid_msr_init(); - -	/* run on any CPU again */ -	set_cpus_allowed_ptr(current, &oldmask); +	init_on_cpu.data = data; +	smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, +				 &init_on_cpu, 1); +	rc = init_on_cpu.rc; +	if (rc != 0) +		goto err_out_exit_acpi;  	if (cpu_family == CPU_HW_PSTATE)  		cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); @@ -1345,8 +1358,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)  	return 0; -err_out_unmask: -	set_cpus_allowed_ptr(current, &oldmask); +err_out_exit_acpi:  	powernow_k8_cpu_exit_acpi(data);  err_out: @@ -1371,28 +1383,25 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)  	return 0;  } +static void query_values_on_cpu(void *_err) +{ +	int *err = _err; +	struct powernow_k8_data *data = __get_cpu_var(powernow_data); + +	*err = query_current_values_with_pending_wait(data); +} +  static unsigned int powernowk8_get(unsigned int cpu)  { -	struct powernow_k8_data *data; -	cpumask_t oldmask = current->cpus_allowed; +	struct powernow_k8_data *data = per_cpu(powernow_data, cpu);  	unsigned int khz = 0; -	unsigned int first; - -	first = cpumask_first(cpu_core_mask(cpu)); -	data = per_cpu(powernow_data, first); +	int err;  	if (!data)  		return -EINVAL; -	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); -	if (smp_processor_id() != cpu) { -		printk(KERN_ERR PFX -			"limiting to CPU %d failed in powernowk8_get\n", cpu); -		set_cpus_allowed_ptr(current, &oldmask); -		return 0; -	} - -	if (query_current_values_with_pending_wait(data)) +	smp_call_function_single(cpu, query_values_on_cpu, &err, true); +	if (err)  		goto out;  	if (cpu_family == CPU_HW_PSTATE) @@ -1403,7 +1412,6 @@ static unsigned int powernowk8_get(unsigned int cpu)  out: -	set_cpus_allowed_ptr(current, &oldmask);  	return khz;  } @@ -1429,7 +1437,9 @@ static int __cpuinit powernowk8_init(void)  	unsigned int i, supported_cpus = 0;  	for_each_online_cpu(i) { -		if (check_supported_cpu(i)) +		int rc; +		smp_call_function_single(i, check_supported_cpu, &rc, 1); +		if (rc == 0)  			supported_cpus++;  	} diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index 6c6698feade..c9c1190b5e1 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -223,14 +223,3 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned  static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);  static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); - -#ifdef CONFIG_SMP -static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) -{ -} -#else -static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) -{ -	cpu_set(0, cpu_sharedcore_mask[0]); -} -#endif diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index c9f1fdc0283..8d672ef162c 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -323,14 +323,8 @@ static unsigned int get_cur_freq(unsigned int cpu)  {  	
unsigned l, h;  	unsigned clock_freq; -	cpumask_t saved_mask; -	saved_mask = current->cpus_allowed; -	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); -	if (smp_processor_id() != cpu) -		return 0; - -	rdmsr(MSR_IA32_PERF_STATUS, l, h); +	rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);  	clock_freq = extract_clock(l, cpu, 0);  	if (unlikely(clock_freq == 0)) { @@ -340,11 +334,9 @@ static unsigned int get_cur_freq(unsigned int cpu)  		 * P-state transition (like TM2). Get the last freq set   		 * in PERF_CTL.  		 */ -		rdmsr(MSR_IA32_PERF_CTL, l, h); +		rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);  		clock_freq = extract_clock(l, cpu, 1);  	} - -	set_cpus_allowed_ptr(current, &saved_mask);  	return clock_freq;  } @@ -467,15 +459,10 @@ static int centrino_target (struct cpufreq_policy *policy,  	struct cpufreq_freqs	freqs;  	int			retval = 0;  	unsigned int		j, k, first_cpu, tmp; -	cpumask_var_t saved_mask, covered_cpus; +	cpumask_var_t covered_cpus; -	if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) -		return -ENOMEM; -	if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { -		free_cpumask_var(saved_mask); +	if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))  		return -ENOMEM; -	} -	cpumask_copy(saved_mask, &current->cpus_allowed);  	if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {  		retval = -ENODEV; @@ -493,7 +480,7 @@ static int centrino_target (struct cpufreq_policy *policy,  	first_cpu = 1;  	for_each_cpu(j, policy->cpus) { -		const struct cpumask *mask; +		int good_cpu;  		/* cpufreq holds the hotplug lock, so we are safe here */  		if (!cpu_online(j)) @@ -504,32 +491,30 @@ static int centrino_target (struct cpufreq_policy *policy,  		 * Make sure we are running on CPU that wants to change freq  		 */  		if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) -			mask = policy->cpus; +			good_cpu = cpumask_any_and(policy->cpus, +						   cpu_online_mask);  		else -			mask = cpumask_of(j); +			good_cpu = j; -		set_cpus_allowed_ptr(current, mask); -		preempt_disable(); -		if (unlikely(!cpu_isset(smp_processor_id(), *mask))) { +		if (good_cpu >= nr_cpu_ids) {  			dprintk("couldn't limit to CPUs in this domain\n");  			retval = -EAGAIN;  			if (first_cpu) {  				/* We haven't started the transition yet. */ -				goto migrate_end; +				goto out;  			} -			preempt_enable();  			break;  		}  		msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;  		if (first_cpu) { -			rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); +			rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);  			if (msr == (oldmsr & 0xffff)) {  				dprintk("no change needed - msr was and needs "  					"to be %x\n", oldmsr);  				retval = 0; -				goto migrate_end; +				goto out;  			}  			freqs.old = extract_clock(oldmsr, cpu, 0); @@ -553,14 +538,11 @@ static int centrino_target (struct cpufreq_policy *policy,  			oldmsr |= msr;  		} -		wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); -		if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { -			preempt_enable(); +		wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); +		if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)  			break; -		} -		cpu_set(j, *covered_cpus); -		preempt_enable(); +		cpumask_set_cpu(j, covered_cpus);  	}  	for_each_cpu(k, policy->cpus) { @@ -578,10 +560,8 @@ static int centrino_target (struct cpufreq_policy *policy,  		 * Best effort undo..  		
*/ -		for_each_cpu_mask_nr(j, *covered_cpus) { -			set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); -			wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); -		} +		for_each_cpu(j, covered_cpus) +			wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);  		tmp = freqs.new;  		freqs.new = freqs.old; @@ -593,15 +573,9 @@ static int centrino_target (struct cpufreq_policy *policy,  			cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);  		}  	} -	set_cpus_allowed_ptr(current, saved_mask);  	retval = 0; -	goto out; -migrate_end: -	preempt_enable(); -	set_cpus_allowed_ptr(current, saved_mask);  out: -	free_cpumask_var(saved_mask);  	free_cpumask_var(covered_cpus);  	return retval;  } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 016c1a4fa3f..6911e91fb4f 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -89,7 +89,8 @@ static int speedstep_find_register(void)   * speedstep_set_state - set the SpeedStep state   * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)   * - *   Tries to change the SpeedStep state. + *   Tries to change the SpeedStep state.  Can be called from + *   smp_call_function_single.   */  static void speedstep_set_state(unsigned int state)  { @@ -143,6 +144,11 @@ static void speedstep_set_state(unsigned int state)  	return;  } +/* Wrapper for smp_call_function_single. */ +static void _speedstep_set_state(void *_state) +{ +	speedstep_set_state(*(unsigned int *)_state); +}  /**   * speedstep_activate - activate SpeedStep control in the chipset @@ -226,22 +232,28 @@ static unsigned int speedstep_detect_chipset(void)  	return 0;  } -static unsigned int _speedstep_get(const struct cpumask *cpus) -{ +struct get_freq_data {  	unsigned int speed; -	cpumask_t cpus_allowed; +	unsigned int processor; +}; + +static void get_freq_data(void *_data) +{ +	struct get_freq_data *data = _data; -	cpus_allowed = current->cpus_allowed; -	set_cpus_allowed_ptr(current, cpus); -	speed = speedstep_get_frequency(speedstep_processor); -	set_cpus_allowed_ptr(current, &cpus_allowed); -	dprintk("detected %u kHz as current frequency\n", speed); -	return speed; +	data->speed = speedstep_get_frequency(data->processor);  }  static unsigned int speedstep_get(unsigned int cpu)  { -	return _speedstep_get(cpumask_of(cpu)); +	struct get_freq_data data = { .processor = cpu }; + +	/* You're supposed to ensure CPU is online. 
*/ +	if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) +		BUG(); + +	dprintk("detected %u kHz as current frequency\n", data.speed); +	return data.speed;  }  /** @@ -257,16 +269,16 @@ static int speedstep_target(struct cpufreq_policy *policy,  			     unsigned int target_freq,  			     unsigned int relation)  { -	unsigned int newstate = 0; +	unsigned int newstate = 0, policy_cpu;  	struct cpufreq_freqs freqs; -	cpumask_t cpus_allowed;  	int i;  	if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],  				target_freq, relation, &newstate))  		return -EINVAL; -	freqs.old = _speedstep_get(policy->cpus); +	policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); +	freqs.old = speedstep_get(policy_cpu);  	freqs.new = speedstep_freqs[newstate].frequency;  	freqs.cpu = policy->cpu; @@ -276,20 +288,13 @@ static int speedstep_target(struct cpufreq_policy *policy,  	if (freqs.old == freqs.new)  		return 0; -	cpus_allowed = current->cpus_allowed; -  	for_each_cpu(i, policy->cpus) {  		freqs.cpu = i;  		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);  	} -	/* switch to physical CPU where state is to be changed */ -	set_cpus_allowed_ptr(current, policy->cpus); - -	speedstep_set_state(newstate); - -	/* allow to be run on all CPUs */ -	set_cpus_allowed_ptr(current, &cpus_allowed); +	smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, +				 true);  	for_each_cpu(i, policy->cpus) {  		freqs.cpu = i; @@ -312,33 +317,43 @@ static int speedstep_verify(struct cpufreq_policy *policy)  	return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);  } +struct get_freqs { +	struct cpufreq_policy *policy; +	int ret; +}; + +static void get_freqs_on_cpu(void *_get_freqs) +{ +	struct get_freqs *get_freqs = _get_freqs; + +	get_freqs->ret = +		speedstep_get_freqs(speedstep_processor, +			    &speedstep_freqs[SPEEDSTEP_LOW].frequency, +			    &speedstep_freqs[SPEEDSTEP_HIGH].frequency, +			    &get_freqs->policy->cpuinfo.transition_latency, +			    &speedstep_set_state); +}  static int speedstep_cpu_init(struct cpufreq_policy *policy)  { -	int result = 0; -	unsigned int speed; -	cpumask_t cpus_allowed; +	int result; +	unsigned int policy_cpu, speed; +	struct get_freqs gf;  	/* only run on CPU to be set, or on its sibling */  #ifdef CONFIG_SMP  	cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));  #endif - -	cpus_allowed = current->cpus_allowed; -	set_cpus_allowed_ptr(current, policy->cpus); +	policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);  	/* detect low and high frequency and transition latency */ -	result = speedstep_get_freqs(speedstep_processor, -				     &speedstep_freqs[SPEEDSTEP_LOW].frequency, -				     &speedstep_freqs[SPEEDSTEP_HIGH].frequency, -				     &policy->cpuinfo.transition_latency, -				     &speedstep_set_state); -	set_cpus_allowed_ptr(current, &cpus_allowed); -	if (result) -		return result; +	gf.policy = policy; +	smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); +	if (gf.ret) +		return gf.ret;  	/* get current speed setting */ -	speed = _speedstep_get(policy->cpus); +	speed = speedstep_get(policy_cpu);  	if (!speed)  		return -EIO; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index 2e3c6862657..f4c290b8482 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -226,6 +226,7 @@ static unsigned int pentium4_get_frequency(void)  } +/* Warning: may get called from smp_call_function_single. 
*/  unsigned int speedstep_get_frequency(unsigned int processor)  {  	switch (processor) { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7437fa133c0..3260ab04499 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)  	 */  	if (c->x86 == 6 && c->x86_model < 15)  		clear_cpu_cap(c, X86_FEATURE_PAT); + +#ifdef CONFIG_KMEMCHECK +	/* +	 * P4s have a "fast strings" feature which causes single- +	 * stepping REP instructions to only generate a #DB on +	 * cache-line boundaries. +	 * +	 * Ingo Molnar reported a Pentium D (model 6) and a Xeon +	 * (model 2) with the same problem. +	 */ +	if (c->x86 == 15) { +		u64 misc_enable; + +		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + +		if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { +			printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); + +			misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; +			wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); +		} +	} +#endif  }  #ifdef CONFIG_X86_32 @@ -229,12 +252,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)  }  #endif -static void __cpuinit srat_detect_node(void) +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)  {  #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)  	unsigned node;  	int cpu = smp_processor_id(); -	int apicid = hard_smp_processor_id(); +	int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;  	/* Don't do the funky fallback heuristics the AMD version employs  	   for now. */ @@ -400,7 +423,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)  	}  	/* Work around errata */ -	srat_detect_node(); +	srat_detect_node(c);  	if (cpu_has(c, X86_FEATURE_VMX))  		detect_vmx_virtcap(c); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 483eda96e10..789efe217e1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,6 +17,7 @@  #include <asm/processor.h>  #include <asm/smp.h> +#include <asm/k8.h>  #define LVL_1_INST	1  #define LVL_1_DATA	2 @@ -159,14 +160,6 @@ struct _cpuid4_info_regs {  	unsigned long can_disable;  }; -#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) -static struct pci_device_id k8_nb_id[] = { -	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, -	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, -	{} -}; -#endif -  unsigned short			num_cache_leaves;  /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -207,10 +200,17 @@ union l3_cache {  };  static const unsigned short __cpuinitconst assocs[] = { -	[1] = 1, [2] = 2, [4] = 4, [6] = 8, -	[8] = 16, [0xa] = 32, [0xb] = 48, +	[1] = 1, +	[2] = 2, +	[4] = 4, +	[6] = 8, +	[8] = 16, +	[0xa] = 32, +	[0xb] = 48,  	[0xc] = 64, -	[0xf] = 0xffff // ?? 
+	[0xd] = 96, +	[0xe] = 128, +	[0xf] = 0xffff /* fully associative - no way to show this currently */  };  static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; @@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,  	eax->split.type = types[leaf];  	eax->split.level = levels[leaf];  	if (leaf == 3) -		eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; +		eax->split.num_threads_sharing = +			current_cpu_data.x86_max_cores - 1;  	else  		eax->split.num_threads_sharing = 0;  	eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; @@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)  {  	if (index < 3)  		return; + +	if (boot_cpu_data.x86 == 0x11) +		return; + +	/* see erratum #382 */ +	if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) +		return; +  	this_leaf->can_disable = 1;  } @@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)  #define to_object(k)	container_of(k, struct _index_kobject, kobj)  #define to_attr(a)	container_of(a, struct _cache_attr, attr) -#ifdef CONFIG_PCI -static struct pci_dev *get_k8_northbridge(int node) -{ -	struct pci_dev *dev = NULL; -	int i; - -	for (i = 0; i <= node; i++) { -		do { -			dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); -			if (!dev) -				break; -		} while (!pci_match_id(&k8_nb_id[0], dev)); -		if (!dev) -			break; -	} -	return dev; -} -#else -static struct pci_dev *get_k8_northbridge(int node) -{ -	return NULL; -} -#endif - -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, +				  unsigned int index)  { -	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); -	int node = cpu_to_node(cpumask_first(mask)); -	struct pci_dev *dev = NULL; -	ssize_t ret = 0; -	int i; +	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); +	int node = cpu_to_node(cpu); +	struct pci_dev *dev = node_to_k8_nb_misc(node); +	unsigned int reg = 0;  	if (!this_leaf->can_disable) -		return sprintf(buf, "Feature not enabled\n"); - -	dev = get_k8_northbridge(node); -	if (!dev) { -		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");  		return -EINVAL; -	} -	for (i = 0; i < 2; i++) { -		unsigned int reg; +	if (!dev) +		return -EINVAL; -		pci_read_config_dword(dev, 0x1BC + i * 4, &reg); +	pci_read_config_dword(dev, 0x1BC + index * 4, &reg); +	return sprintf(buf, "%x\n", reg); +} -		ret += sprintf(buf, "%sEntry: %d\n", buf, i); -		ret += sprintf(buf, "%sReads:  %s\tNew Entries: %s\n",   -			buf, -			reg & 0x80000000 ? "Disabled" : "Allowed", -			reg & 0x40000000 ? 
"Disabled" : "Allowed"); -		ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", -			buf, (reg & 0x30000) >> 16, reg & 0xfff); -	} -	return ret; +#define SHOW_CACHE_DISABLE(index)					\ +static ssize_t								\ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf)  	\ +{									\ +	return show_cache_disable(this_leaf, buf, index);		\  } +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) -static ssize_t -store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, -		    size_t count) +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, +	const char *buf, size_t count, unsigned int index)  { -	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); -	int node = cpu_to_node(cpumask_first(mask)); -	struct pci_dev *dev = NULL; -	unsigned int ret, index, val; +	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); +	int node = cpu_to_node(cpu); +	struct pci_dev *dev = node_to_k8_nb_misc(node); +	unsigned long val = 0; +	unsigned int scrubber = 0;  	if (!this_leaf->can_disable) -		return 0; - -	if (strlen(buf) > 15)  		return -EINVAL; -	ret = sscanf(buf, "%x %x", &index, &val); -	if (ret != 2) +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	if (!dev)  		return -EINVAL; -	if (index > 1) + +	if (strict_strtoul(buf, 10, &val) < 0)  		return -EINVAL;  	val |= 0xc0000000; -	dev = get_k8_northbridge(node); -	if (!dev) { -		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); -		return -EINVAL; -	} + +	pci_read_config_dword(dev, 0x58, &scrubber); +	scrubber &= ~0x1f000000; +	pci_write_config_dword(dev, 0x58, scrubber);  	pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);  	wbinvd();  	pci_write_config_dword(dev, 0x1BC + index * 4, val); +	return count; +} -	return 1; +#define STORE_CACHE_DISABLE(index)					\ +static ssize_t								\ +store_cache_disable_##index(struct _cpuid4_info *this_leaf,	     	\ +			    const char *buf, size_t count)		\ +{									\ +	return store_cache_disable(this_leaf, buf, count, index);	\  } +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1)  struct _cache_attr {  	struct attribute attr; @@ -808,7 +795,10 @@ define_one_ro(size);  define_one_ro(shared_cpu_map);  define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, +		show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, +		show_cache_disable_1, store_cache_disable_1);  static struct attribute * default_attrs[] = {  	&type.attr, @@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {  	&size.attr,  	&shared_cpu_map.attr,  	&shared_cpu_list.attr, -	&cache_disable.attr, +	&cache_disable_0.attr, +	&cache_disable_1.attr,  	NULL  }; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index b2f89829bbe..188a1ca5ad2 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,7 +1,12 @@ -obj-y				=  mce_$(BITS).o therm_throt.o +obj-y				=  mce.o -obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o -obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o -obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd_64.o +obj-$(CONFIG_X86_NEW_MCE)	+= mce-severity.o +obj-$(CONFIG_X86_OLD_MCE)	+= k7.o p4.o p6.o +obj-$(CONFIG_X86_ANCIENT_MCE)	+= winchip.o p5.o +obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o +obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd.o  
obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o  obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o +obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o + +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index dd3af6e7b39..b945d5dbc60 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -2,25 +2,23 @@   * Athlon specific Machine Check Exception Reporting   * (C) Copyright 2002 Dave Jones <davej@redhat.com>   */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h>  #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h>  #include <linux/smp.h>  #include <asm/processor.h>  #include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include "mce.h" - -/* Machine Check Handler For AMD Athlon/Duron */ +/* Machine Check Handler For AMD Athlon/Duron: */  static void k7_machine_check(struct pt_regs *regs, long error_code)  { -	int recover = 1;  	u32 alow, ahigh, high, low;  	u32 mcgstl, mcgsth; +	int recover = 1;  	int i;  	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -32,15 +30,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)  	for (i = 1; i < nr_mce_banks; i++) {  		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); -		if (high&(1<<31)) { +		if (high & (1<<31)) {  			char misc[20];  			char addr[24]; -			misc[0] = addr[0] = '\0'; + +			misc[0] = '\0'; +			addr[0] = '\0'; +  			if (high & (1<<29))  				recover |= 1;  			if (high & (1<<25))  				recover |= 2;  			high &= ~(1<<31); +  			if (high & (1<<27)) {  				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);  				snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,27 +51,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)  				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);  				snprintf(addr, 24, " at %08x%08x", ahigh, alow);  			} +  			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",  				smp_processor_id(), i, high, low, misc, addr); -			/* Clear it */ + +			/* Clear it: */  			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); -			/* Serialize */ +			/* Serialize: */  			wmb();  			add_taint(TAINT_MACHINE_CHECK);  		}  	} -	if (recover&2) +	if (recover & 2)  		panic("CPU context corrupt"); -	if (recover&1) +	if (recover & 1)  		panic("Unable to continue"); +  	printk(KERN_EMERG "Attempting to continue.\n"); +  	mcgstl &= ~(1<<2);  	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);  } -/* AMD K7 machine check is Intel like */ +/* AMD K7 machine check is Intel like: */  void amd_mcheck_init(struct cpuinfo_x86 *c)  {  	u32 l, h; @@ -79,21 +85,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)  		return;  	machine_check_vector = k7_machine_check; +	/* Make sure the vector pointer is visible before we enable MCEs: */  	wmb();  	printk(KERN_INFO "Intel machine check architecture supported.\n"); +  	rdmsr(MSR_IA32_MCG_CAP, l, h);  	if (l & (1<<8))	/* Control register present ? */  		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);  	nr_mce_banks = l & 0xff; -	/* Clear status for MC index 0 separately, we don't touch CTL, -	 * as some K7 Athlons cause spurious MCEs when its enabled. 
*/
+	/*
+	 * Clear status for MC index 0 separately, we don't touch CTL,
+	 * as some K7 Athlons cause spurious MCEs when its enabled:
+	 */
 	if (boot_cpu_data.x86 == 6) {
 		wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
 		i = 1;
 	} else
 		i = 0;
+
 	for (; i < nr_mce_banks; i++) {
 		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
 		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 00000000000..a3a235a53f0
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,127 @@
+/*
+ * Machine check injection support.
+ * Copyright 2008 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Authors:
+ * Andi Kleen
+ * Ying Huang
+ */
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <asm/mce.h>
+
+/* Update fake mce registers on current CPU. */
+static void inject_mce(struct mce *m)
+{
+	struct mce *i = &per_cpu(injectm, m->extcpu);
+
+	/* Make sure noone reads partially written injectm */
+	i->finished = 0;
+	mb();
+	m->finished = 0;
+	/* First set the fields after finished */
+	i->extcpu = m->extcpu;
+	mb();
+	/* Now write record in order, finished last (except above) */
+	memcpy(i, m, sizeof(struct mce));
+	/* Finally activate it */
+	mb();
+	i->finished = 1;
+}
+
+struct delayed_mce {
+	struct timer_list timer;
+	struct mce m;
+};
+
+/* Inject mce on current CPU */
+static void raise_mce(unsigned long data)
+{
+	struct delayed_mce *dm = (struct delayed_mce *)data;
+	struct mce *m = &dm->m;
+	int cpu = m->extcpu;
+
+	inject_mce(m);
+	if (m->status & MCI_STATUS_UC) {
+		struct pt_regs regs;
+		memset(&regs, 0, sizeof(struct pt_regs));
+		regs.ip = m->ip;
+		regs.cs = m->cs;
+		printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
+		do_machine_check(&regs, 0);
+		printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
+	} else {
+		mce_banks_t b;
+		memset(&b, 0xff, sizeof(mce_banks_t));
+		printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
+		machine_check_poll(0, &b);
+		mce_notify_irq();
+		printk(KERN_INFO "Finished machine check poll on CPU %d\n",
+		       cpu);
+	}
+	kfree(dm);
+}
+
+/* Error injection interface */
+static ssize_t mce_write(struct file *filp, const char __user *ubuf,
+			 size_t usize, loff_t *off)
+{
+	struct delayed_mce *dm;
+	struct mce m;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/*
+	 * There are some cases where real MSR reads could slip
+	 * through.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+		return -EIO;
+
+	if ((unsigned long)usize > sizeof(struct mce))
+		usize = sizeof(struct mce);
+	if (copy_from_user(&m, ubuf, usize))
+		return -EFAULT;
+
+	if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+		return -EINVAL;
+
+	dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
+	if (!dm)
+		return -ENOMEM;
+
+	/*
+	 * Need to give user space some time to set everything up,
+	 * so do it a jiffie or two later everywhere.
+	 * Should we use a hrtimer here for better synchronization?
+	 */ +	memcpy(&dm->m, &m, sizeof(struct mce)); +	setup_timer(&dm->timer, raise_mce, (unsigned long)dm); +	dm->timer.expires = jiffies + 2; +	add_timer_on(&dm->timer, m.extcpu); +	return usize; +} + +static int inject_init(void) +{ +	printk(KERN_INFO "Machine check injector initialized\n"); +	mce_chrdev_ops.write = mce_write; +	return 0; +} + +module_init(inject_init); +/* + * Cannot tolerate unloading currently because we cannot + * guarantee all openers of mce_chrdev will get a reference to us. + */ +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h new file mode 100644 index 00000000000..54dcb8ff12e --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -0,0 +1,15 @@ +#include <asm/mce.h> + +enum severity_level { +	MCE_NO_SEVERITY, +	MCE_KEEP_SEVERITY, +	MCE_SOME_SEVERITY, +	MCE_AO_SEVERITY, +	MCE_UC_SEVERITY, +	MCE_AR_SEVERITY, +	MCE_PANIC_SEVERITY, +}; + +int mce_severity(struct mce *a, int tolerant, char **msg); + +extern int mce_ser; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c new file mode 100644 index 00000000000..ff0807f9705 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -0,0 +1,218 @@ +/* + * MCE grading rules. + * Copyright 2008, 2009 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Author: Andi Kleen + */ +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/debugfs.h> +#include <asm/mce.h> + +#include "mce-internal.h" + +/* + * Grade an mce by severity. In general the most severe ones are processed + * first. Since there are quite a lot of combinations test the bits in a + * table-driven way. The rules are simply processed in order, first + * match wins. + * + * Note this is only used for machine check exceptions, the corrected + * errors use much simpler rules. The exceptions still check for the corrected + * errors, but only to leave them alone for the CMCI handler (except for + * panic situations) + */ + +enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum ser { SER_REQUIRED = 1, NO_SER = 2 }; + +static struct severity { +	u64 mask; +	u64 result; +	unsigned char sev; +	unsigned char mcgmask; +	unsigned char mcgres; +	unsigned char ser; +	unsigned char context; +	unsigned char covered; +	char *msg; +} severities[] = { +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER +#define SEV(s) .sev = MCE_ ## s ## _SEVERITY +#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } +#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } +#define MCGMASK(x, res, s, m, r...) \ +	{ .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } +#define MASK(x, y, s, m, r...) 
\ +	{ .mask = x, .result = y, SEV(s), .msg = m, ## r } +#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCACOD 0xffff + +	BITCLR(MCI_STATUS_VAL, NO, "Invalid"), +	BITCLR(MCI_STATUS_EN, NO, "Not enabled"), +	BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), +	/* When MCIP is not set something is very confused */ +	MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), +	/* Neither return not error IP -- no chance to recover -> PANIC */ +	MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, +		"Neither restart nor error IP"), +	MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", +		KERNEL), +	BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), +	MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, +	     "Spurious not enabled", SER), + +	/* ignore OVER for UCNA */ +	MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, +	     "Uncorrected no action required", SER), +	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, +	     "Illegal combination (UCNA with AR=1)", SER), +	MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), + +	/* AR add known MCACODs here */ +	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, +	     "Action required with lost events", SER), +	MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, +	     "Action required; unknown MCACOD", SER), + +	/* known AO MCACODs: */ +	MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, +	     "Action optional: memory scrubbing error", SER), +	MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, +	     "Action optional: last level cache writeback error", SER), + +	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, +	     "Action optional unknown MCACOD", SER), +	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, +	     "Action optional with lost events", SER), +	BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), +	BITSET(MCI_STATUS_UC, UC, "Uncorrected"), +	BITSET(0, SOME, "No match")	/* always matches. keep at end */ +}; + +/* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ +static int error_context(struct mce *m) +{ +	if (m->mcgstatus & MCG_STATUS_EIPV) +		return (m->ip && (m->cs & 3) == 3) ? 
IN_USER : IN_KERNEL; +	/* Unknown, assume kernel */ +	return IN_KERNEL; +} + +int mce_severity(struct mce *a, int tolerant, char **msg) +{ +	enum context ctx = error_context(a); +	struct severity *s; + +	for (s = severities;; s++) { +		if ((a->status & s->mask) != s->result) +			continue; +		if ((a->mcgstatus & s->mcgmask) != s->mcgres) +			continue; +		if (s->ser == SER_REQUIRED && !mce_ser) +			continue; +		if (s->ser == NO_SER && mce_ser) +			continue; +		if (s->context && ctx != s->context) +			continue; +		if (msg) +			*msg = s->msg; +		s->covered = 1; +		if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { +			if (panic_on_oops || tolerant < 1) +				return MCE_PANIC_SEVERITY; +		} +		return s->sev; +	} +} + +static void *s_start(struct seq_file *f, loff_t *pos) +{ +	if (*pos >= ARRAY_SIZE(severities)) +		return NULL; +	return &severities[*pos]; +} + +static void *s_next(struct seq_file *f, void *data, loff_t *pos) +{ +	if (++(*pos) >= ARRAY_SIZE(severities)) +		return NULL; +	return &severities[*pos]; +} + +static void s_stop(struct seq_file *f, void *data) +{ +} + +static int s_show(struct seq_file *f, void *data) +{ +	struct severity *ser = data; +	seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); +	return 0; +} + +static const struct seq_operations severities_seq_ops = { +	.start	= s_start, +	.next	= s_next, +	.stop	= s_stop, +	.show	= s_show, +}; + +static int severities_coverage_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &severities_seq_ops); +} + +static ssize_t severities_coverage_write(struct file *file, +					 const char __user *ubuf, +					 size_t count, loff_t *ppos) +{ +	int i; +	for (i = 0; i < ARRAY_SIZE(severities); i++) +		severities[i].covered = 0; +	return count; +} + +static const struct file_operations severities_coverage_fops = { +	.open		= severities_coverage_open, +	.release	= seq_release, +	.read		= seq_read, +	.write		= severities_coverage_write, +}; + +static int __init severities_debugfs_init(void) +{ +	struct dentry *dmce = NULL, *fseverities_coverage = NULL; + +	dmce = debugfs_create_dir("mce", NULL); +	if (dmce == NULL) +		goto err_out; +	fseverities_coverage = debugfs_create_file("severities-coverage", +						   0444, dmce, NULL, +						   &severities_coverage_fops); +	if (fseverities_coverage == NULL) +		goto err_out; + +	return 0; + +err_out: +	if (fseverities_coverage) +		debugfs_remove(fseverities_coverage); +	if (dmce) +		debugfs_remove(dmce); +	return -ENOMEM; +} +late_initcall(severities_debugfs_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c new file mode 100644 index 00000000000..284d1de968b --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -0,0 +1,2049 @@ +/* + * Machine check handler. + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. 
+ * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ +#include <linux/thread_info.h> +#include <linux/capability.h> +#include <linux/miscdevice.h> +#include <linux/interrupt.h> +#include <linux/ratelimit.h> +#include <linux/kallsyms.h> +#include <linux/rcupdate.h> +#include <linux/kobject.h> +#include <linux/uaccess.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/string.h> +#include <linux/sysdev.h> +#include <linux/delay.h> +#include <linux/ctype.h> +#include <linux/sched.h> +#include <linux/sysfs.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/poll.h> +#include <linux/nmi.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/processor.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/ipi.h> +#include <asm/mce.h> +#include <asm/msr.h> + +#include "mce-internal.h" + +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ +	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", +	       smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = +						unexpected_machine_check; + +int mce_disabled __read_mostly; + +#ifdef CONFIG_X86_NEW_MCE + +#define MISC_MCELOG_MINOR	227 + +#define SPINUNIT 100	/* 100ns */ + +atomic_t mce_entry; + +DEFINE_PER_CPU(unsigned, mce_exception_count); + +/* + * Tolerant levels: + *   0: always panic on uncorrected errors, log corrected errors + *   1: panic or SIGBUS on uncorrected errors, log corrected errors + *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors + *   3: never panic or SIGBUS, log all errors (for testing only) + */ +static int			tolerant		__read_mostly = 1; +static int			banks			__read_mostly; +static u64			*bank			__read_mostly; +static int			rip_msr			__read_mostly; +static int			mce_bootlog		__read_mostly = -1; +static int			monarch_timeout		__read_mostly = -1; +static int			mce_panic_timeout	__read_mostly; +static int			mce_dont_log_ce		__read_mostly; +int				mce_cmci_disabled	__read_mostly; +int				mce_ignore_ce		__read_mostly; +int				mce_ser			__read_mostly; + +/* User mode helper program triggered by machine check event */ +static unsigned long		mce_need_notify; +static char			mce_helper[128]; +static char			*mce_helper_argv[2] = { mce_helper, NULL }; + +static unsigned long		dont_init_banks; + +static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +static DEFINE_PER_CPU(struct mce, mces_seen); +static int			cpu_missing; + + +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { +	[0 ... 
BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +static inline int skip_bank_init(int i) +{ +	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); +} + +static DEFINE_PER_CPU(struct work_struct, mce_work); + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ +	memset(m, 0, sizeof(struct mce)); +	m->cpu = m->extcpu = smp_processor_id(); +	rdtscll(m->tsc); +	/* We hope get_seconds stays lockless */ +	m->time = get_seconds(); +	m->cpuvendor = boot_cpu_data.x86_vendor; +	m->cpuid = cpuid_eax(1); +#ifdef CONFIG_SMP +	m->socketid = cpu_data(m->extcpu).phys_proc_id; +#endif +	m->apicid = cpu_data(m->extcpu).initial_apicid; +	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); +} + +DEFINE_PER_CPU(struct mce, injectm); +EXPORT_PER_CPU_SYMBOL_GPL(injectm); + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log mcelog = { +	.signature	= MCE_LOG_SIGNATURE, +	.len		= MCE_LOG_LEN, +	.recordlen	= sizeof(struct mce), +}; + +void mce_log(struct mce *mce) +{ +	unsigned next, entry; + +	mce->finished = 0; +	wmb(); +	for (;;) { +		entry = rcu_dereference(mcelog.next); +		for (;;) { +			/* +			 * When the buffer fills up discard new entries. +			 * Assume that the earlier errors are the more +			 * interesting ones: +			 */ +			if (entry >= MCE_LOG_LEN) { +				set_bit(MCE_OVERFLOW, +					(unsigned long *)&mcelog.flags); +				return; +			} +			/* Old left over entry. Skip: */ +			if (mcelog.entry[entry].finished) { +				entry++; +				continue; +			} +			break; +		} +		smp_rmb(); +		next = entry + 1; +		if (cmpxchg(&mcelog.next, entry, next) == entry) +			break; +	} +	memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); +	wmb(); +	mcelog.entry[entry].finished = 1; +	wmb(); + +	mce->finished = 1; +	set_bit(0, &mce_need_notify); +} + +static void print_mce(struct mce *m) +{ +	printk(KERN_EMERG +	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", +	       m->extcpu, m->mcgstatus, m->bank, m->status); +	if (m->ip) { +		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", +		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", +		       m->cs, m->ip); +		if (m->cs == __KERNEL_CS) +			print_symbol("{%s}", m->ip); +		printk("\n"); +	} +	printk(KERN_EMERG "TSC %llx ", m->tsc); +	if (m->addr) +		printk("ADDR %llx ", m->addr); +	if (m->misc) +		printk("MISC %llx ", m->misc); +	printk("\n"); +	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", +			m->cpuvendor, m->cpuid, m->time, m->socketid, +			m->apicid); +} + +static void print_mce_head(void) +{ +	printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); +} + +static void print_mce_tail(void) +{ +	printk(KERN_EMERG "This is not a software problem!\n" +	       KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); +} + +#define PANIC_TIMEOUT 5 /* 5 seconds */ + +static atomic_t mce_paniced; + +/* Panic in progress. 
Enable interrupts and wait for final IPI */ +static void wait_for_panic(void) +{ +	long timeout = PANIC_TIMEOUT*USEC_PER_SEC; +	preempt_disable(); +	local_irq_enable(); +	while (timeout-- > 0) +		udelay(1); +	if (panic_timeout == 0) +		panic_timeout = mce_panic_timeout; +	panic("Panicing machine check CPU died"); +} + +static void mce_panic(char *msg, struct mce *final, char *exp) +{ +	int i; + +	/* +	 * Make sure only one CPU runs in machine check panic +	 */ +	if (atomic_add_return(1, &mce_paniced) > 1) +		wait_for_panic(); +	barrier(); + +	bust_spinlocks(1); +	console_verbose(); +	print_mce_head(); +	/* First print corrected ones that are still unlogged */ +	for (i = 0; i < MCE_LOG_LEN; i++) { +		struct mce *m = &mcelog.entry[i]; +		if (!(m->status & MCI_STATUS_VAL)) +			continue; +		if (!(m->status & MCI_STATUS_UC)) +			print_mce(m); +	} +	/* Now print uncorrected but with the final one last */ +	for (i = 0; i < MCE_LOG_LEN; i++) { +		struct mce *m = &mcelog.entry[i]; +		if (!(m->status & MCI_STATUS_VAL)) +			continue; +		if (!(m->status & MCI_STATUS_UC)) +			continue; +		if (!final || memcmp(m, final, sizeof(struct mce))) +			print_mce(m); +	} +	if (final) +		print_mce(final); +	if (cpu_missing) +		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); +	print_mce_tail(); +	if (exp) +		printk(KERN_EMERG "Machine check: %s\n", exp); +	if (panic_timeout == 0) +		panic_timeout = mce_panic_timeout; +	panic(msg); +} + +/* Support code for software error injection */ + +static int msr_to_offset(u32 msr) +{ +	unsigned bank = __get_cpu_var(injectm.bank); +	if (msr == rip_msr) +		return offsetof(struct mce, ip); +	if (msr == MSR_IA32_MC0_STATUS + bank*4) +		return offsetof(struct mce, status); +	if (msr == MSR_IA32_MC0_ADDR + bank*4) +		return offsetof(struct mce, addr); +	if (msr == MSR_IA32_MC0_MISC + bank*4) +		return offsetof(struct mce, misc); +	if (msr == MSR_IA32_MCG_STATUS) +		return offsetof(struct mce, mcgstatus); +	return -1; +} + +/* MSR access wrappers used for error injection */ +static u64 mce_rdmsrl(u32 msr) +{ +	u64 v; +	if (__get_cpu_var(injectm).finished) { +		int offset = msr_to_offset(msr); +		if (offset < 0) +			return 0; +		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); +	} +	rdmsrl(msr, v); +	return v; +} + +static void mce_wrmsrl(u32 msr, u64 v) +{ +	if (__get_cpu_var(injectm).finished) { +		int offset = msr_to_offset(msr); +		if (offset >= 0) +			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; +		return; +	} +	wrmsrl(msr, v); +} + +/* + * Simple lockless ring to communicate PFNs from the exception handler with the + * process context work function. This is vastly simplified because there's + * only a single reader and a single writer. 
+ */ +#define MCE_RING_SIZE 16	/* we use one entry less */ + +struct mce_ring { +	unsigned short start; +	unsigned short end; +	unsigned long ring[MCE_RING_SIZE]; +}; +static DEFINE_PER_CPU(struct mce_ring, mce_ring); + +/* Runs with CPU affinity in workqueue */ +static int mce_ring_empty(void) +{ +	struct mce_ring *r = &__get_cpu_var(mce_ring); + +	return r->start == r->end; +} + +static int mce_ring_get(unsigned long *pfn) +{ +	struct mce_ring *r; +	int ret = 0; + +	*pfn = 0; +	get_cpu(); +	r = &__get_cpu_var(mce_ring); +	if (r->start == r->end) +		goto out; +	*pfn = r->ring[r->start]; +	r->start = (r->start + 1) % MCE_RING_SIZE; +	ret = 1; +out: +	put_cpu(); +	return ret; +} + +/* Always runs in MCE context with preempt off */ +static int mce_ring_add(unsigned long pfn) +{ +	struct mce_ring *r = &__get_cpu_var(mce_ring); +	unsigned next; + +	next = (r->end + 1) % MCE_RING_SIZE; +	if (next == r->start) +		return -1; +	r->ring[r->end] = pfn; +	wmb(); +	r->end = next; +	return 0; +} + +int mce_available(struct cpuinfo_x86 *c) +{ +	if (mce_disabled) +		return 0; +	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static void mce_schedule_work(void) +{ +	if (!mce_ring_empty()) { +		struct work_struct *work = &__get_cpu_var(mce_work); +		if (!work_pending(work)) +			schedule_work(work); +	} +} + +/* + * Get the address of the instruction at the time of the machine check + * error. + */ +static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) +{ + +	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { +		m->ip = regs->ip; +		m->cs = regs->cs; +	} else { +		m->ip = 0; +		m->cs = 0; +	} +	if (rip_msr) +		m->ip = mce_rdmsrl(rip_msr); +} + +#ifdef CONFIG_X86_LOCAL_APIC  +/* + * Called after interrupts have been reenabled again + * when a MCE happened during an interrupts off region + * in the kernel. + */ +asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) +{ +	ack_APIC_irq(); +	exit_idle(); +	irq_enter(); +	mce_notify_irq(); +	mce_schedule_work(); +	irq_exit(); +} +#endif + +static void mce_report_event(struct pt_regs *regs) +{ +	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { +		mce_notify_irq(); +		/* +		 * Triggering the work queue here is just an insurance +		 * policy in case the syscall exit notify handler +		 * doesn't run soon enough or ends up running on the +		 * wrong CPU (can happen when audit sleeps) +		 */ +		mce_schedule_work(); +		return; +	} + +#ifdef CONFIG_X86_LOCAL_APIC +	/* +	 * Without APIC do not notify. The event will be picked +	 * up eventually. +	 */ +	if (!cpu_has_apic) +		return; + +	/* +	 * When interrupts are disabled we cannot use +	 * kernel services safely. Trigger an self interrupt +	 * through the APIC to instead do the notification +	 * after interrupts are reenabled again. +	 */ +	apic->send_IPI_self(MCE_SELF_VECTOR); + +	/* +	 * Wait for idle afterwards again so that we don't leave the +	 * APIC in a non idle state because the normal APIC writes +	 * cannot exclude us. +	 */ +	apic_wait_icr_idle(); +#endif +} + +DEFINE_PER_CPU(unsigned, mce_poll_count); + +/* + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + * + * Note: spec recommends to panic for fatal unsignalled + * errors here. 
However this would be quite problematic -- + * we would need to reimplement the Monarch handling and + * it would mess up the exclusion between exception handler + * and poll hander -- * so we skip this for now. + * These cases should not happen anyways, or only when the CPU + * is already totally * confused. In this case it's likely it will + * not fully execute the machine check handler either. + */ +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +{ +	struct mce m; +	int i; + +	__get_cpu_var(mce_poll_count)++; + +	mce_setup(&m); + +	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); +	for (i = 0; i < banks; i++) { +		if (!bank[i] || !test_bit(i, *b)) +			continue; + +		m.misc = 0; +		m.addr = 0; +		m.bank = i; +		m.tsc = 0; + +		barrier(); +		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); +		if (!(m.status & MCI_STATUS_VAL)) +			continue; + +		/* +		 * Uncorrected or signalled events are handled by the exception +		 * handler when it is enabled, so don't process those here. +		 * +		 * TBD do the same check for MCI_STATUS_EN here? +		 */ +		if (!(flags & MCP_UC) && +		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) +			continue; + +		if (m.status & MCI_STATUS_MISCV) +			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); +		if (m.status & MCI_STATUS_ADDRV) +			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + +		if (!(flags & MCP_TIMESTAMP)) +			m.tsc = 0; +		/* +		 * Don't get the IP here because it's unlikely to +		 * have anything to do with the actual error location. +		 */ +		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { +			mce_log(&m); +			add_taint(TAINT_MACHINE_CHECK); +		} + +		/* +		 * Clear state for this bank. +		 */ +		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +	} + +	/* +	 * Don't clear MCG_STATUS here because it's only defined for +	 * exceptions. +	 */ + +	sync_core(); +} +EXPORT_SYMBOL_GPL(machine_check_poll); + +/* + * Do a quick check if any of the events requires a panic. + * This decides if we keep the events around or clear them. + */ +static int mce_no_way_out(struct mce *m, char **msg) +{ +	int i; + +	for (i = 0; i < banks; i++) { +		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); +		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) +			return 1; +	} +	return 0; +} + +/* + * Variable to establish order between CPUs while scanning. + * Each CPU spins initially until executing is equal its number. + */ +static atomic_t mce_executing; + +/* + * Defines order of CPUs on entry. First CPU becomes Monarch. + */ +static atomic_t mce_callin; + +/* + * Check if a timeout waiting for other CPUs happened. + */ +static int mce_timed_out(u64 *t) +{ +	/* +	 * The others already did panic for some reason. +	 * Bail out like in a timeout. +	 * rmb() to tell the compiler that system_state +	 * might have been modified by someone else. +	 */ +	rmb(); +	if (atomic_read(&mce_paniced)) +		wait_for_panic(); +	if (!monarch_timeout) +		goto out; +	if ((s64)*t < SPINUNIT) { +		/* CHECKME: Make panic default for 1 too? */ +		if (tolerant < 1) +			mce_panic("Timeout synchronizing machine check over CPUs", +				  NULL, NULL); +		cpu_missing = 1; +		return 1; +	} +	*t -= SPINUNIT; +out: +	touch_nmi_watchdog(); +	return 0; +} + +/* + * The Monarch's reign.  The Monarch is the CPU who entered + * the machine check handler first. It waits for the others to + * raise the exception too and then grades them. When any + * error is fatal panic. Only then let the others continue. + * + * The other CPUs entering the MCE handler will be controlled by the + * Monarch. 
They are called Subjects. + * + * This way we prevent any potential data corruption in a unrecoverable case + * and also makes sure always all CPU's errors are examined. + * + * Also this detects the case of an machine check event coming from outer + * space (not detected by any CPUs) In this case some external agent wants + * us to shut down, so panic too. + * + * The other CPUs might still decide to panic if the handler happens + * in a unrecoverable place, but in this case the system is in a semi-stable + * state and won't corrupt anything by itself. It's ok to let the others + * continue for a bit first. + * + * All the spin loops have timeouts; when a timeout happens a CPU + * typically elects itself to be Monarch. + */ +static void mce_reign(void) +{ +	int cpu; +	struct mce *m = NULL; +	int global_worst = 0; +	char *msg = NULL; +	char *nmsg = NULL; + +	/* +	 * This CPU is the Monarch and the other CPUs have run +	 * through their handlers. +	 * Grade the severity of the errors of all the CPUs. +	 */ +	for_each_possible_cpu(cpu) { +		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, +					    &nmsg); +		if (severity > global_worst) { +			msg = nmsg; +			global_worst = severity; +			m = &per_cpu(mces_seen, cpu); +		} +	} + +	/* +	 * Cannot recover? Panic here then. +	 * This dumps all the mces in the log buffer and stops the +	 * other CPUs. +	 */ +	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) +		mce_panic("Fatal Machine check", m, msg); + +	/* +	 * For UC somewhere we let the CPU who detects it handle it. +	 * Also must let continue the others, otherwise the handling +	 * CPU could deadlock on a lock. +	 */ + +	/* +	 * No machine check event found. Must be some external +	 * source or one CPU is hung. Panic. +	 */ +	if (!m && tolerant < 3) +		mce_panic("Machine check from unknown source", NULL, NULL); + +	/* +	 * Now clear all the mces_seen so that they don't reappear on +	 * the next mce. +	 */ +	for_each_possible_cpu(cpu) +		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); +} + +static atomic_t global_nwo; + +/* + * Start of Monarch synchronization. This waits until all CPUs have + * entered the exception handler and then determines if any of them + * saw a fatal event that requires panic. Then it executes them + * in the entry order. + * TBD double check parallel CPU hotunplug + */ +static int mce_start(int *no_way_out) +{ +	int order; +	int cpus = num_online_cpus(); +	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + +	if (!timeout) +		return -1; + +	atomic_add(*no_way_out, &global_nwo); +	/* +	 * global_nwo should be updated before mce_callin +	 */ +	smp_wmb(); +	order = atomic_add_return(1, &mce_callin); + +	/* +	 * Wait for everyone. +	 */ +	while (atomic_read(&mce_callin) != cpus) { +		if (mce_timed_out(&timeout)) { +			atomic_set(&global_nwo, 0); +			return -1; +		} +		ndelay(SPINUNIT); +	} + +	/* +	 * mce_callin should be read before global_nwo +	 */ +	smp_rmb(); + +	if (order == 1) { +		/* +		 * Monarch: Starts executing now, the others wait. +		 */ +		atomic_set(&mce_executing, 1); +	} else { +		/* +		 * Subject: Now start the scanning loop one by one in +		 * the original callin order. +		 * This way when there are any shared banks it will be +		 * only seen by one CPU before cleared, avoiding duplicates. +		 */ +		while (atomic_read(&mce_executing) < order) { +			if (mce_timed_out(&timeout)) { +				atomic_set(&global_nwo, 0); +				return -1; +			} +			ndelay(SPINUNIT); +		} +	} + +	/* +	 * Cache the global no_way_out state. 
+	 */ +	*no_way_out = atomic_read(&global_nwo); + +	return order; +} + +/* + * Synchronize between CPUs after main scanning loop. + * This invokes the bulk of the Monarch processing. + */ +static int mce_end(int order) +{ +	int ret = -1; +	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + +	if (!timeout) +		goto reset; +	if (order < 0) +		goto reset; + +	/* +	 * Allow others to run. +	 */ +	atomic_inc(&mce_executing); + +	if (order == 1) { +		/* CHECKME: Can this race with a parallel hotplug? */ +		int cpus = num_online_cpus(); + +		/* +		 * Monarch: Wait for everyone to go through their scanning +		 * loops. +		 */ +		while (atomic_read(&mce_executing) <= cpus) { +			if (mce_timed_out(&timeout)) +				goto reset; +			ndelay(SPINUNIT); +		} + +		mce_reign(); +		barrier(); +		ret = 0; +	} else { +		/* +		 * Subject: Wait for Monarch to finish. +		 */ +		while (atomic_read(&mce_executing) != 0) { +			if (mce_timed_out(&timeout)) +				goto reset; +			ndelay(SPINUNIT); +		} + +		/* +		 * Don't reset anything. That's done by the Monarch. +		 */ +		return 0; +	} + +	/* +	 * Reset all global state. +	 */ +reset: +	atomic_set(&global_nwo, 0); +	atomic_set(&mce_callin, 0); +	barrier(); + +	/* +	 * Let others run again. +	 */ +	atomic_set(&mce_executing, 0); +	return ret; +} + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses upto page granuality for now. + */ +static int mce_usable_address(struct mce *m) +{ +	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) +		return 0; +	if ((m->misc & 0x3f) > PAGE_SHIFT) +		return 0; +	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) +		return 0; +	return 1; +} + +static void mce_clear_state(unsigned long *toclear) +{ +	int i; + +	for (i = 0; i < banks; i++) { +		if (test_bit(i, toclear)) +			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +	} +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! + * + * On Intel systems this is entered on all CPUs in parallel through + * MCE broadcast. However some CPUs might be broken beyond repair, + * so be always careful when synchronizing with others. + */ +void do_machine_check(struct pt_regs *regs, long error_code) +{ +	struct mce m, *final; +	int i; +	int worst = 0; +	int severity; +	/* +	 * Establish sequential order between the CPUs entering the machine +	 * check handler. +	 */ +	int order; +	/* +	 * If no_way_out gets set, there is no safe way to recover from this +	 * MCE.  If tolerant is cranked up, we'll try anyway. +	 */ +	int no_way_out = 0; +	/* +	 * If kill_it gets set, there might be a way to recover from this +	 * error. 
+	 */ +	int kill_it = 0; +	DECLARE_BITMAP(toclear, MAX_NR_BANKS); +	char *msg = "Unknown"; + +	atomic_inc(&mce_entry); + +	__get_cpu_var(mce_exception_count)++; + +	if (notify_die(DIE_NMI, "machine check", regs, error_code, +			   18, SIGKILL) == NOTIFY_STOP) +		goto out; +	if (!banks) +		goto out; + +	mce_setup(&m); + +	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); +	no_way_out = mce_no_way_out(&m, &msg); + +	final = &__get_cpu_var(mces_seen); +	*final = m; + +	barrier(); + +	/* +	 * When no restart IP must always kill or panic. +	 */ +	if (!(m.mcgstatus & MCG_STATUS_RIPV)) +		kill_it = 1; + +	/* +	 * Go through all the banks in exclusion of the other CPUs. +	 * This way we don't report duplicated events on shared banks +	 * because the first one to see it will clear it. +	 */ +	order = mce_start(&no_way_out); +	for (i = 0; i < banks; i++) { +		__clear_bit(i, toclear); +		if (!bank[i]) +			continue; + +		m.misc = 0; +		m.addr = 0; +		m.bank = i; + +		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); +		if ((m.status & MCI_STATUS_VAL) == 0) +			continue; + +		/* +		 * Non uncorrected or non signaled errors are handled by +		 * machine_check_poll. Leave them alone, unless this panics. +		 */ +		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && +			!no_way_out) +			continue; + +		/* +		 * Set taint even when machine check was not enabled. +		 */ +		add_taint(TAINT_MACHINE_CHECK); + +		severity = mce_severity(&m, tolerant, NULL); + +		/* +		 * When machine check was for corrected handler don't touch, +		 * unless we're panicing. +		 */ +		if (severity == MCE_KEEP_SEVERITY && !no_way_out) +			continue; +		__set_bit(i, toclear); +		if (severity == MCE_NO_SEVERITY) { +			/* +			 * Machine check event was not enabled. Clear, but +			 * ignore. +			 */ +			continue; +		} + +		/* +		 * Kill on action required. +		 */ +		if (severity == MCE_AR_SEVERITY) +			kill_it = 1; + +		if (m.status & MCI_STATUS_MISCV) +			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); +		if (m.status & MCI_STATUS_ADDRV) +			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + +		/* +		 * Action optional error. Queue address for later processing. +		 * When the ring overflows we just ignore the AO error. +		 * RED-PEN add some logging mechanism when +		 * usable_address or mce_add_ring fails. +		 * RED-PEN don't ignore overflow for tolerant == 0 +		 */ +		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) +			mce_ring_add(m.addr >> PAGE_SHIFT); + +		mce_get_rip(&m, regs); +		mce_log(&m); + +		if (severity > worst) { +			*final = m; +			worst = severity; +		} +	} + +	if (!no_way_out) +		mce_clear_state(toclear); + +	/* +	 * Do most of the synchronization with other CPUs. +	 * When there's any problem use only local no_way_out state. +	 */ +	if (mce_end(order) < 0) +		no_way_out = worst >= MCE_PANIC_SEVERITY; + +	/* +	 * If we have decided that we just CAN'T continue, and the user +	 * has not set tolerant to an insane level, give up and die. +	 * +	 * This is mainly used in the case when the system doesn't +	 * support MCE broadcasting or it has been disabled. +	 */ +	if (no_way_out && tolerant < 3) +		mce_panic("Fatal machine check on current CPU", final, msg); + +	/* +	 * If the error seems to be unrecoverable, something should be +	 * done.  Try to kill as little as possible.  If we can kill just +	 * one task, do that.  If the user has set the tolerance very +	 * high, don't try to do anything at all. 
+	 */
+
+	if (kill_it && tolerant < 3)
+		force_sig(SIGBUS, current);
+
+	/* notify userspace ASAP */
+	set_thread_flag(TIF_MCE_NOTIFY);
+
+	if (worst > 0)
+		mce_report_event(regs);
+	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+out:
+	atomic_dec(&mce_entry);
+	sync_core();
+}
+EXPORT_SYMBOL_GPL(do_machine_check);
+
+/* dummy to break dependency. actual code is in mm/memory-failure.c */
+void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+{
+	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+}
+
+/*
+ * Called after mce notification in process context. This code
+ * is allowed to sleep. Call the high level VM handler to process
+ * any corrupted pages.
+ * Assume that the work queue code only calls this one at a time
+ * per CPU.
+ * Note we don't disable preemption, so this code might run on the wrong
+ * CPU. In this case the event is picked up by the scheduled work queue.
+ * This is merely a fast path to expedite processing in some common
+ * cases.
+ */
+void mce_notify_process(void)
+{
+	unsigned long pfn;
+	mce_notify_irq();
+	while (mce_ring_get(&pfn))
+		memory_failure(pfn, MCE_VECTOR);
+}
+
+static void mce_process_work(struct work_struct *dummy)
+{
+	mce_notify_process();
+}
+
+#ifdef CONFIG_X86_MCE_INTEL
+/***
+ * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
+ * @cpu: The CPU on which the event occurred.
+ * @status: Event status information
+ *
+ * This function should be called by the thermal interrupt after the
+ * event has been processed and the decision was made to log the event
+ * further.
+ *
+ * The status parameter will be saved to the 'status' field of 'struct mce'
+ * and historically has been the register value of the
+ * MSR_IA32_THERMAL_STATUS (Intel) msr.
+ */
+void mce_log_therm_throt_event(__u64 status)
+{
+	struct mce m;
+
+	mce_setup(&m);
+	m.bank = MCE_THERMAL_BANK;
+	m.status = status;
+	mce_log(&m);
+}
+#endif /* CONFIG_X86_MCE_INTEL */
+
+/*
+ * Periodic polling timer for "silent" machine check errors.  If the
+ * poller finds an MCE, poll 2x faster.  When the poller finds no more
+ * errors, poll 2x slower (up to check_interval seconds).
+ */
+static int check_interval = 5 * 60; /* 5 minutes */
+
+static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
+
+static void mcheck_timer(unsigned long data)
+{
+	struct timer_list *t = &per_cpu(mce_timer, data);
+	int *n;
+
+	WARN_ON(smp_processor_id() != data);
+
+	if (mce_available(&current_cpu_data)) {
+		machine_check_poll(MCP_TIMESTAMP,
+				&__get_cpu_var(mce_poll_banks));
+	}
+
+	/*
+	 * Alert userspace if needed.  If we logged an MCE, reduce the
+	 * polling interval, otherwise increase the polling interval.
+	 */
+	n = &__get_cpu_var(next_interval);
+	if (mce_notify_irq())
+		*n = max(*n/2, HZ/100);
+	else
+		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+
+	t->expires = jiffies + *n;
+	add_timer(t);
+}
+
+static void mce_do_trigger(struct work_struct *work)
+{
+	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */ +int mce_notify_irq(void) +{ +	/* Not more than two messages every minute */ +	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + +	clear_thread_flag(TIF_MCE_NOTIFY); + +	if (test_and_clear_bit(0, &mce_need_notify)) { +		wake_up_interruptible(&mce_wait); + +		/* +		 * There is no risk of missing notifications because +		 * work_pending is always cleared before the function is +		 * executed. +		 */ +		if (mce_helper[0] && !work_pending(&mce_trigger_work)) +			schedule_work(&mce_trigger_work); + +		if (__ratelimit(&ratelimit)) +			printk(KERN_INFO "Machine check events logged\n"); + +		return 1; +	} +	return 0; +} +EXPORT_SYMBOL_GPL(mce_notify_irq); + +/* + * Initialize Machine Checks for a CPU. + */ +static int mce_cap_init(void) +{ +	unsigned b; +	u64 cap; + +	rdmsrl(MSR_IA32_MCG_CAP, cap); + +	b = cap & MCG_BANKCNT_MASK; +	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + +	if (b > MAX_NR_BANKS) { +		printk(KERN_WARNING +		       "MCE: Using only %u machine check banks out of %u\n", +			MAX_NR_BANKS, b); +		b = MAX_NR_BANKS; +	} + +	/* Don't support asymmetric configurations today */ +	WARN_ON(banks != 0 && b != banks); +	banks = b; +	if (!bank) { +		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); +		if (!bank) +			return -ENOMEM; +		memset(bank, 0xff, banks * sizeof(u64)); +	} + +	/* Use accurate RIP reporting if available. */ +	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) +		rip_msr = MSR_IA32_MCG_EIP; + +	if (cap & MCG_SER_P) +		mce_ser = 1; + +	return 0; +} + +static void mce_init(void) +{ +	mce_banks_t all_banks; +	u64 cap; +	int i; + +	/* +	 * Log the machine checks left over from the previous reset. +	 */ +	bitmap_fill(all_banks, MAX_NR_BANKS); +	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); + +	set_in_cr4(X86_CR4_MCE); + +	rdmsrl(MSR_IA32_MCG_CAP, cap); +	if (cap & MCG_CTL_P) +		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + +	for (i = 0; i < banks; i++) { +		if (skip_bank_init(i)) +			continue; +		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); +		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +	} +} + +/* Add per CPU specific workarounds here */ +static void mce_cpu_quirks(struct cpuinfo_x86 *c) +{ +	/* This should be disabled by the BIOS, but isn't always */ +	if (c->x86_vendor == X86_VENDOR_AMD) { +		if (c->x86 == 15 && banks > 4) { +			/* +			 * disable GART TBL walk error reporting, which +			 * trips off incorrectly with the IOMMU & 3ware +			 * & Cerberus: +			 */ +			clear_bit(10, (unsigned long *)&bank[4]); +		} +		if (c->x86 <= 17 && mce_bootlog < 0) { +			/* +			 * Lots of broken BIOS around that don't clear them +			 * by default and leave crap in there. Don't log: +			 */ +			mce_bootlog = 0; +		} +		/* +		 * Various K7s with broken bank 0 around. Always disable +		 * by default. +		 */ +		 if (c->x86 == 6 && banks > 0) +			bank[0] = 0; +	} + +	if (c->x86_vendor == X86_VENDOR_INTEL) { +		/* +		 * SDM documents that on family 6 bank 0 should not be written +		 * because it aliases to another special BIOS controlled +		 * register. +		 * But it's not aliased anymore on model 0x1a+ +		 * Don't ignore bank 0 completely because there could be a +		 * valid event later, merely don't write CTL0. +		 */ + +		if (c->x86 == 6 && c->x86_model < 0x1A) +			__set_bit(0, &dont_init_banks); + +		/* +		 * All newer Intel systems support MCE broadcasting. Enable +		 * synchronization with a one second timeout. 
+		 */ +		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && +			monarch_timeout < 0) +			monarch_timeout = USEC_PER_SEC; +	} +	if (monarch_timeout < 0) +		monarch_timeout = 0; +	if (mce_bootlog != 0) +		mce_panic_timeout = 30; +} + +static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +{ +	if (c->x86 != 5) +		return; +	switch (c->x86_vendor) { +	case X86_VENDOR_INTEL: +		intel_p5_mcheck_init(c); +		break; +	case X86_VENDOR_CENTAUR: +		winchip_mcheck_init(c); +		break; +	} +} + +static void mce_cpu_features(struct cpuinfo_x86 *c) +{ +	switch (c->x86_vendor) { +	case X86_VENDOR_INTEL: +		mce_intel_feature_init(c); +		break; +	case X86_VENDOR_AMD: +		mce_amd_feature_init(c); +		break; +	default: +		break; +	} +} + +static void mce_init_timer(void) +{ +	struct timer_list *t = &__get_cpu_var(mce_timer); +	int *n = &__get_cpu_var(next_interval); + +	if (mce_ignore_ce) +		return; + +	*n = check_interval * HZ; +	if (!*n) +		return; +	setup_timer(t, mcheck_timer, smp_processor_id()); +	t->expires = round_jiffies(jiffies + *n); +	add_timer(t); +} + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off: + */ +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +{ +	if (mce_disabled) +		return; + +	mce_ancient_init(c); + +	if (!mce_available(c)) +		return; + +	if (mce_cap_init() < 0) { +		mce_disabled = 1; +		return; +	} +	mce_cpu_quirks(c); + +	machine_check_vector = do_machine_check; + +	mce_init(); +	mce_cpu_features(c); +	mce_init_timer(); +	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); +} + +/* + * Character device to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_state_lock); +static int		open_count;		/* #times opened */ +static int		open_exclu;		/* already open exclusive? 
*/ + +static int mce_open(struct inode *inode, struct file *file) +{ +	spin_lock(&mce_state_lock); + +	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { +		spin_unlock(&mce_state_lock); + +		return -EBUSY; +	} + +	if (file->f_flags & O_EXCL) +		open_exclu = 1; +	open_count++; + +	spin_unlock(&mce_state_lock); + +	return nonseekable_open(inode, file); +} + +static int mce_release(struct inode *inode, struct file *file) +{ +	spin_lock(&mce_state_lock); + +	open_count--; +	open_exclu = 0; + +	spin_unlock(&mce_state_lock); + +	return 0; +} + +static void collect_tscs(void *data) +{ +	unsigned long *cpu_tsc = (unsigned long *)data; + +	rdtscll(cpu_tsc[smp_processor_id()]); +} + +static DEFINE_MUTEX(mce_read_mutex); + +static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, +			loff_t *off) +{ +	char __user *buf = ubuf; +	unsigned long *cpu_tsc; +	unsigned prev, next; +	int i, err; + +	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); +	if (!cpu_tsc) +		return -ENOMEM; + +	mutex_lock(&mce_read_mutex); +	next = rcu_dereference(mcelog.next); + +	/* Only supports full reads right now */ +	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { +		mutex_unlock(&mce_read_mutex); +		kfree(cpu_tsc); + +		return -EINVAL; +	} + +	err = 0; +	prev = 0; +	do { +		for (i = prev; i < next; i++) { +			unsigned long start = jiffies; + +			while (!mcelog.entry[i].finished) { +				if (time_after_eq(jiffies, start + 2)) { +					memset(mcelog.entry + i, 0, +					       sizeof(struct mce)); +					goto timeout; +				} +				cpu_relax(); +			} +			smp_rmb(); +			err |= copy_to_user(buf, mcelog.entry + i, +					    sizeof(struct mce)); +			buf += sizeof(struct mce); +timeout: +			; +		} + +		memset(mcelog.entry + prev, 0, +		       (next - prev) * sizeof(struct mce)); +		prev = next; +		next = cmpxchg(&mcelog.next, prev, 0); +	} while (next != prev); + +	synchronize_sched(); + +	/* +	 * Collect entries that were still getting written before the +	 * synchronize. +	 */ +	on_each_cpu(collect_tscs, cpu_tsc, 1); + +	for (i = next; i < MCE_LOG_LEN; i++) { +		if (mcelog.entry[i].finished && +		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { +			err |= copy_to_user(buf, mcelog.entry+i, +					    sizeof(struct mce)); +			smp_rmb(); +			buf += sizeof(struct mce); +			memset(&mcelog.entry[i], 0, sizeof(struct mce)); +		} +	} +	mutex_unlock(&mce_read_mutex); +	kfree(cpu_tsc); + +	return err ? 
-EFAULT : buf - ubuf; +} + +static unsigned int mce_poll(struct file *file, poll_table *wait) +{ +	poll_wait(file, &mce_wait, wait); +	if (rcu_dereference(mcelog.next)) +		return POLLIN | POLLRDNORM; +	return 0; +} + +static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ +	int __user *p = (int __user *)arg; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	switch (cmd) { +	case MCE_GET_RECORD_LEN: +		return put_user(sizeof(struct mce), p); +	case MCE_GET_LOG_LEN: +		return put_user(MCE_LOG_LEN, p); +	case MCE_GETCLEAR_FLAGS: { +		unsigned flags; + +		do { +			flags = mcelog.flags; +		} while (cmpxchg(&mcelog.flags, flags, 0) != flags); + +		return put_user(flags, p); +	} +	default: +		return -ENOTTY; +	} +} + +/* Modified in mce-inject.c, so not static or const */ +struct file_operations mce_chrdev_ops = { +	.open			= mce_open, +	.release		= mce_release, +	.read			= mce_read, +	.poll			= mce_poll, +	.unlocked_ioctl		= mce_ioctl, +}; +EXPORT_SYMBOL_GPL(mce_chrdev_ops); + +static struct miscdevice mce_log_device = { +	MISC_MCELOG_MINOR, +	"mcelog", +	&mce_chrdev_ops, +}; + +/* + * mce=off Disables machine check + * mce=no_cmci Disables CMCI + * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. + * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) + *	monarchtimeout is how long to wait for other CPUs on machine + *	check, or 0 to not wait + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=nobootlog Don't log MCEs from before booting. + */ +static int __init mcheck_enable(char *str) +{ +	if (*str == 0) +		enable_p5_mce(); +	if (*str == '=') +		str++; +	if (!strcmp(str, "off")) +		mce_disabled = 1; +	else if (!strcmp(str, "no_cmci")) +		mce_cmci_disabled = 1; +	else if (!strcmp(str, "dont_log_ce")) +		mce_dont_log_ce = 1; +	else if (!strcmp(str, "ignore_ce")) +		mce_ignore_ce = 1; +	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) +		mce_bootlog = (str[0] == 'b'); +	else if (isdigit(str[0])) { +		get_option(&str, &tolerant); +		if (*str == ',') { +			++str; +			get_option(&str, &monarch_timeout); +		} +	} else { +		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", +		       str); +		return 0; +	} +	return 1; +} +__setup("mce", mcheck_enable); + +/* + * Sysfs support + */ + +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable(void) +{ +	int i; + +	for (i = 0; i < banks; i++) { +		if (!skip_bank_init(i)) +			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +	} +	return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ +	return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ +	return mce_disable(); +} + +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 
+ * Only one CPU is active at this time, the others get re-added later using
+ * CPU hotplug:
+ */
+static int mce_resume(struct sys_device *dev)
+{
+	mce_init();
+	mce_cpu_features(&current_cpu_data);
+
+	return 0;
+}
+
+static void mce_cpu_restart(void *data)
+{
+	del_timer_sync(&__get_cpu_var(mce_timer));
+	if (!mce_available(&current_cpu_data))
+		return;
+	mce_init();
+	mce_init_timer();
+}
+
+/* Reinit MCEs after user configuration changes */
+static void mce_restart(void)
+{
+	on_each_cpu(mce_cpu_restart, NULL, 1);
+}
+
+/* Toggle features for corrected errors */
+static void mce_disable_ce(void *all)
+{
+	if (!mce_available(&current_cpu_data))
+		return;
+	if (all)
+		del_timer_sync(&__get_cpu_var(mce_timer));
+	cmci_clear();
+}
+
+static void mce_enable_ce(void *all)
+{
+	if (!mce_available(&current_cpu_data))
+		return;
+	cmci_reenable();
+	cmci_recheck();
+	if (all)
+		mce_init_timer();
+}
+
+static struct sysdev_class mce_sysclass = {
+	.suspend	= mce_suspend,
+	.shutdown	= mce_shutdown,
+	.resume		= mce_resume,
+	.name		= "machinecheck",
+};
+
+DEFINE_PER_CPU(struct sys_device, mce_dev);
+
+__cpuinitdata
+void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			 char *buf)
+{
+	u64 b = bank[attr - bank_attrs];
+
+	return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			const char *buf, size_t size)
+{
+	u64 new;
+
+	if (strict_strtoull(buf, 0, &new) < 0)
+		return -EINVAL;
+
+	bank[attr - bank_attrs] = new;
+	mce_restart();
+
+	return size;
+}
+
+static ssize_t
+show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
+{
+	strcpy(buf, mce_helper);
+	strcat(buf, "\n");
+	return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+				const char *buf, size_t siz)
+{
+	char *p;
+	int len;
+
+	strncpy(mce_helper, buf, sizeof(mce_helper));
+	mce_helper[sizeof(mce_helper)-1] = 0;
+	len = strlen(mce_helper);
+	p = strchr(mce_helper, '\n');
+
+	if (*p)
+		*p = 0;
+
+	return len;
+}
+
+static ssize_t set_ignore_ce(struct sys_device *s,
+			     struct sysdev_attribute *attr,
+			     const char *buf, size_t size)
+{
+	u64 new;
+
+	if (strict_strtoull(buf, 0, &new) < 0)
+		return -EINVAL;
+
+	if (mce_ignore_ce ^ !!new) {
+		if (new) {
+			/* disable ce features */
+			on_each_cpu(mce_disable_ce, (void *)1, 1);
+			mce_ignore_ce = 1;
+		} else {
+			/* enable ce features */
+			mce_ignore_ce = 0;
+			on_each_cpu(mce_enable_ce, (void *)1, 1);
+		}
+	}
+	return size;
+}
+
+static ssize_t set_cmci_disabled(struct sys_device *s,
+				 struct sysdev_attribute *attr,
+				 const char *buf, size_t size)
+{
+	u64 new;
+
+	if (strict_strtoull(buf, 0, &new) < 0)
+		return -EINVAL;
+
+	if (mce_cmci_disabled ^ !!new) {
+		if (new) {
+			/* disable cmci */
+			on_each_cpu(mce_disable_ce, NULL, 1);
+			mce_cmci_disabled = 1;
+		} else {
+			/* enable cmci */
+			mce_cmci_disabled = 0;
+			on_each_cpu(mce_enable_ce, NULL, 1);
+		}
+	}
+	return size;
+}
+
+static ssize_t store_int_with_restart(struct sys_device *s,
+				      struct sysdev_attribute *attr,
+				      const char *buf, size_t size)
+{
+	ssize_t ret = sysdev_store_int(s, attr, buf, size);
+	mce_restart();
+	return ret;
+}
+
+static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
+static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
+static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
+static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+
+static struct sysdev_ext_attribute attr_check_interval = {
+	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
+		     store_int_with_restart),
+	&check_interval
+};
+
+static struct sysdev_ext_attribute attr_ignore_ce = {
+	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
+	&mce_ignore_ce
+};
+
+static struct sysdev_ext_attribute attr_cmci_disabled = {
+	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
+	&mce_cmci_disabled
+};
+
+static struct sysdev_attribute *mce_attrs[] = {
+	&attr_tolerant.attr,
+	&attr_check_interval.attr,
+	&attr_trigger,
+	&attr_monarch_timeout.attr,
+	&attr_dont_log_ce.attr,
+	&attr_ignore_ce.attr,
+	&attr_cmci_disabled.attr,
+	NULL
+};
+
+static cpumask_var_t mce_dev_initialized;
+
+/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
+static __cpuinit int mce_create_device(unsigned int cpu)
+{
+	int err;
+	int i, j;
+
+	if (!mce_available(&boot_cpu_data))
+		return -EIO;
+
+	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
+	per_cpu(mce_dev, cpu).id	= cpu;
+	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
+
+	err = sysdev_register(&per_cpu(mce_dev, cpu));
+	if (err)
+		return err;
+
+	for (i = 0; mce_attrs[i]; i++) {
+		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+		if (err)
+			goto error;
+	}
+	for (j = 0; j < banks; j++) {
+		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
+					&bank_attrs[j]);
+		if (err)
+			goto error2;
+	}
+	cpumask_set_cpu(cpu, mce_dev_initialized);
+
+	return 0;
+error2:
+	while (--j >= 0)
+		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]);
+error:
+	while (--i >= 0)
+		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+
+	sysdev_unregister(&per_cpu(mce_dev, cpu));
+
+	return err;
+}
+
+static __cpuinit void mce_remove_device(unsigned int cpu)
+{
+	int i;
+
+	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
+		return;
+
+	for (i = 0; mce_attrs[i]; i++)
+		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+
+	for (i = 0; i < banks; i++)
+		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
+
+	sysdev_unregister(&per_cpu(mce_dev, cpu));
+	cpumask_clear_cpu(cpu, mce_dev_initialized);
+}
+
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+	unsigned long action = *(unsigned long *)h;
+	int i;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_clear();
+	for (i = 0; i < banks; i++) {
+		if (!skip_bank_init(i))
+			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+	}
+}
+
+static void mce_reenable_cpu(void *h)
+{
+	unsigned long action = *(unsigned long *)h;
+	int i;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_reenable();
+	for (i = 0; i < banks; i++) {
+		if (!skip_bank_init(i))
+			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+	}
+}
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly.
*/ +static int __cpuinit +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ +	unsigned int cpu = (unsigned long)hcpu; +	struct timer_list *t = &per_cpu(mce_timer, cpu); + +	switch (action) { +	case CPU_ONLINE: +	case CPU_ONLINE_FROZEN: +		mce_create_device(cpu); +		if (threshold_cpu_callback) +			threshold_cpu_callback(action, cpu); +		break; +	case CPU_DEAD: +	case CPU_DEAD_FROZEN: +		if (threshold_cpu_callback) +			threshold_cpu_callback(action, cpu); +		mce_remove_device(cpu); +		break; +	case CPU_DOWN_PREPARE: +	case CPU_DOWN_PREPARE_FROZEN: +		del_timer_sync(t); +		smp_call_function_single(cpu, mce_disable_cpu, &action, 1); +		break; +	case CPU_DOWN_FAILED: +	case CPU_DOWN_FAILED_FROZEN: +		t->expires = round_jiffies(jiffies + +						__get_cpu_var(next_interval)); +		add_timer_on(t, cpu); +		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); +		break; +	case CPU_POST_DEAD: +		/* intentionally ignoring frozen here */ +		cmci_rediscover(cpu); +		break; +	} +	return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier __cpuinitdata = { +	.notifier_call = mce_cpu_callback, +}; + +static __init int mce_init_banks(void) +{ +	int i; + +	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, +				GFP_KERNEL); +	if (!bank_attrs) +		return -ENOMEM; + +	for (i = 0; i < banks; i++) { +		struct sysdev_attribute *a = &bank_attrs[i]; + +		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i); +		if (!a->attr.name) +			goto nomem; + +		a->attr.mode	= 0644; +		a->show		= show_bank; +		a->store	= set_bank; +	} +	return 0; + +nomem: +	while (--i >= 0) +		kfree(bank_attrs[i].attr.name); +	kfree(bank_attrs); +	bank_attrs = NULL; + +	return -ENOMEM; +} + +static __init int mce_init_device(void) +{ +	int err; +	int i = 0; + +	if (!mce_available(&boot_cpu_data)) +		return -EIO; + +	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); + +	err = mce_init_banks(); +	if (err) +		return err; + +	err = sysdev_class_register(&mce_sysclass); +	if (err) +		return err; + +	for_each_online_cpu(i) { +		err = mce_create_device(i); +		if (err) +			return err; +	} + +	register_hotcpu_notifier(&mce_cpu_notifier); +	misc_register(&mce_log_device); + +	return err; +} + +device_initcall(mce_init_device); + +#else /* CONFIG_X86_OLD_MCE: */ + +int nr_mce_banks; +EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */ + +/* This has to be run for each processor */ +void mcheck_init(struct cpuinfo_x86 *c) +{ +	if (mce_disabled) +		return; + +	switch (c->x86_vendor) { +	case X86_VENDOR_AMD: +		amd_mcheck_init(c); +		break; + +	case X86_VENDOR_INTEL: +		if (c->x86 == 5) +			intel_p5_mcheck_init(c); +		if (c->x86 == 6) +			intel_p6_mcheck_init(c); +		if (c->x86 == 15) +			intel_p4_mcheck_init(c); +		break; + +	case X86_VENDOR_CENTAUR: +		if (c->x86 == 5) +			winchip_mcheck_init(c); +		break; + +	default: +		break; +	} +	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); +} + +static int __init mcheck_enable(char *str) +{ +	mce_p5_enabled = 1; +	return 1; +} +__setup("mce", mcheck_enable); + +#endif /* CONFIG_X86_OLD_MCE */ + +/* + * Old style boot options parsing. Only for compatibility. 
+ */ +static int __init mcheck_disable(char *str) +{ +	mce_disabled = 1; +	return 1; +} +__setup("nomce", mcheck_disable); diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h deleted file mode 100644 index ae9f628838f..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ /dev/null @@ -1,14 +0,0 @@ -#include <linux/init.h> -#include <asm/mce.h> - -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); - -/* Call the installed machine check handler for this CPU setup. */ -extern void (*machine_check_vector)(struct pt_regs *, long error_code); - -extern int nr_mce_banks; - diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c deleted file mode 100644 index 3552119b091..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * mce.c - x86 Machine Check Exception Reporting - * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com> - */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/smp.h> -#include <linux/thread_info.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> - -#include "mce.h" - -int mce_disabled; -int nr_mce_banks; - -EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */ - -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ -	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ -	if (mce_disabled == 1) -		return; - -	switch (c->x86_vendor) { -	case X86_VENDOR_AMD: -		amd_mcheck_init(c); -		break; - -	case X86_VENDOR_INTEL: -		if (c->x86 == 5) -			intel_p5_mcheck_init(c); -		if (c->x86 == 6) -			intel_p6_mcheck_init(c); -		if (c->x86 == 15) -			intel_p4_mcheck_init(c); -		break; - -	case X86_VENDOR_CENTAUR: -		if (c->x86 == 5) -			winchip_mcheck_init(c); -		break; - -	default: -		break; -	} -} - -static int __init mcheck_disable(char *str) -{ -	mce_disabled = 1; -	return 1; -} - -static int __init mcheck_enable(char *str) -{ -	mce_disabled = -1; -	return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c deleted file mode 100644 index 6fb0b359d2a..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ /dev/null @@ -1,1187 +0,0 @@ -/* - * Machine check handler. - * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Rest from unknown author(s). - * 2004 Andi Kleen. Rewrote most of it. 
- * Copyright 2008 Intel Corporation - * Author: Andi Kleen - */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/smp_lock.h> -#include <linux/string.h> -#include <linux/rcupdate.h> -#include <linux/kallsyms.h> -#include <linux/sysdev.h> -#include <linux/miscdevice.h> -#include <linux/fs.h> -#include <linux/capability.h> -#include <linux/cpu.h> -#include <linux/percpu.h> -#include <linux/poll.h> -#include <linux/thread_info.h> -#include <linux/ctype.h> -#include <linux/kmod.h> -#include <linux/kdebug.h> -#include <linux/kobject.h> -#include <linux/sysfs.h> -#include <linux/ratelimit.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/mce.h> -#include <asm/uaccess.h> -#include <asm/smp.h> -#include <asm/idle.h> - -#define MISC_MCELOG_MINOR 227 - -atomic_t mce_entry; - -static int mce_dont_init; - -/* - * Tolerant levels: - *   0: always panic on uncorrected errors, log corrected errors - *   1: panic or SIGBUS on uncorrected errors, log corrected errors - *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors - *   3: never panic or SIGBUS, log all errors (for testing only) - */ -static int tolerant = 1; -static int banks; -static u64 *bank; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = -1; -static atomic_t mce_events; - -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; - -static DECLARE_WAIT_QUEUE_HEAD(mce_wait); - -/* MCA banks polled by the period polling timer for corrected events */ -DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { -	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL -}; - -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) -{ -	memset(m, 0, sizeof(struct mce)); -	m->cpu = smp_processor_id(); -	rdtscll(m->tsc); -} - -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -static struct mce_log mcelog = { -	MCE_LOG_SIGNATURE, -	MCE_LOG_LEN, -}; - -void mce_log(struct mce *mce) -{ -	unsigned next, entry; -	atomic_inc(&mce_events); -	mce->finished = 0; -	wmb(); -	for (;;) { -		entry = rcu_dereference(mcelog.next); -		for (;;) { -			/* When the buffer fills up discard new entries. Assume -			   that the earlier errors are the more interesting. */ -			if (entry >= MCE_LOG_LEN) { -				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); -				return; -			} -			/* Old left over entry. Skip. */ -			if (mcelog.entry[entry].finished) { -				entry++; -				continue; -			} -			break; -		} -		smp_rmb(); -		next = entry + 1; -		if (cmpxchg(&mcelog.next, entry, next) == entry) -			break; -	} -	memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); -	wmb(); -	mcelog.entry[entry].finished = 1; -	wmb(); - -	set_bit(0, ¬ify_user); -} - -static void print_mce(struct mce *m) -{ -	printk(KERN_EMERG "\n" -	       KERN_EMERG "HARDWARE ERROR\n" -	       KERN_EMERG -	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", -	       m->cpu, m->mcgstatus, m->bank, m->status); -	if (m->ip) { -		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", -		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" 
: "", -		       m->cs, m->ip); -		if (m->cs == __KERNEL_CS) -			print_symbol("{%s}", m->ip); -		printk("\n"); -	} -	printk(KERN_EMERG "TSC %llx ", m->tsc); -	if (m->addr) -		printk("ADDR %llx ", m->addr); -	if (m->misc) -		printk("MISC %llx ", m->misc); -	printk("\n"); -	printk(KERN_EMERG "This is not a software problem!\n"); -	printk(KERN_EMERG "Run through mcelog --ascii to decode " -	       "and contact your hardware vendor\n"); -} - -static void mce_panic(char *msg, struct mce *backup, unsigned long start) -{ -	int i; - -	oops_begin(); -	for (i = 0; i < MCE_LOG_LEN; i++) { -		unsigned long tsc = mcelog.entry[i].tsc; - -		if (time_before(tsc, start)) -			continue; -		print_mce(&mcelog.entry[i]); -		if (backup && mcelog.entry[i].tsc == backup->tsc) -			backup = NULL; -	} -	if (backup) -		print_mce(backup); -	panic(msg); -} - -int mce_available(struct cpuinfo_x86 *c) -{ -	if (mce_dont_init) -		return 0; -	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); -} - -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ -	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { -		m->ip = regs->ip; -		m->cs = regs->cs; -	} else { -		m->ip = 0; -		m->cs = 0; -	} -	if (rip_msr) { -		/* Assume the RIP in the MSR is exact. Is this true? */ -		m->mcgstatus |= MCG_STATUS_EIPV; -		rdmsrl(rip_msr, m->ip); -		m->cs = 0; -	} -} - -/* - * Poll for corrected events or events that happened before reset. - * Those are just logged through /dev/mcelog. - * - * This is executed in standard interrupt context. - */ -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) -{ -	struct mce m; -	int i; - -	mce_setup(&m); - -	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); -	for (i = 0; i < banks; i++) { -		if (!bank[i] || !test_bit(i, *b)) -			continue; - -		m.misc = 0; -		m.addr = 0; -		m.bank = i; -		m.tsc = 0; - -		barrier(); -		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); -		if (!(m.status & MCI_STATUS_VAL)) -			continue; - -		/* -		 * Uncorrected events are handled by the exception handler -		 * when it is enabled. But when the exception is disabled log -		 * everything. -		 * -		 * TBD do the same check for MCI_STATUS_EN here? -		 */ -		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) -			continue; - -		if (m.status & MCI_STATUS_MISCV) -			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); -		if (m.status & MCI_STATUS_ADDRV) -			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - -		if (!(flags & MCP_TIMESTAMP)) -			m.tsc = 0; -		/* -		 * Don't get the IP here because it's unlikely to -		 * have anything to do with the actual error location. -		 */ -		if (!(flags & MCP_DONTLOG)) { -			mce_log(&m); -			add_taint(TAINT_MACHINE_CHECK); -		} - -		/* -		 * Clear state for this bank. -		 */ -		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); -	} - -	/* -	 * Don't clear MCG_STATUS here because it's only defined for -	 * exceptions. -	 */ -} - -/* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. - * - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even - * think about putting a printk in there! - */ -void do_machine_check(struct pt_regs * regs, long error_code) -{ -	struct mce m, panicm; -	u64 mcestart = 0; -	int i; -	int panicm_found = 0; -	/* -	 * If no_way_out gets set, there is no safe way to recover from this -	 * MCE.  If tolerant is cranked up, we'll try anyway. 
-	 */ -	int no_way_out = 0; -	/* -	 * If kill_it gets set, there might be a way to recover from this -	 * error. -	 */ -	int kill_it = 0; -	DECLARE_BITMAP(toclear, MAX_NR_BANKS); - -	atomic_inc(&mce_entry); - -	if (notify_die(DIE_NMI, "machine check", regs, error_code, -			   18, SIGKILL) == NOTIFY_STOP) -		goto out2; -	if (!banks) -		goto out2; - -	mce_setup(&m); - -	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); -	/* if the restart IP is not valid, we're done for */ -	if (!(m.mcgstatus & MCG_STATUS_RIPV)) -		no_way_out = 1; - -	rdtscll(mcestart); -	barrier(); - -	for (i = 0; i < banks; i++) { -		__clear_bit(i, toclear); -		if (!bank[i]) -			continue; - -		m.misc = 0; -		m.addr = 0; -		m.bank = i; - -		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); -		if ((m.status & MCI_STATUS_VAL) == 0) -			continue; - -		/* -		 * Non uncorrected errors are handled by machine_check_poll -		 * Leave them alone. -		 */ -		if ((m.status & MCI_STATUS_UC) == 0) -			continue; - -		/* -		 * Set taint even when machine check was not enabled. -		 */ -		add_taint(TAINT_MACHINE_CHECK); - -		__set_bit(i, toclear); - -		if (m.status & MCI_STATUS_EN) { -			/* if PCC was set, there's no way out */ -			no_way_out |= !!(m.status & MCI_STATUS_PCC); -			/* -			 * If this error was uncorrectable and there was -			 * an overflow, we're in trouble.  If no overflow, -			 * we might get away with just killing a task. -			 */ -			if (m.status & MCI_STATUS_UC) { -				if (tolerant < 1 || m.status & MCI_STATUS_OVER) -					no_way_out = 1; -				kill_it = 1; -			} -		} else { -			/* -			 * Machine check event was not enabled. Clear, but -			 * ignore. -			 */ -			continue; -		} - -		if (m.status & MCI_STATUS_MISCV) -			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); -		if (m.status & MCI_STATUS_ADDRV) -			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - -		mce_get_rip(&m, regs); -		mce_log(&m); - -		/* Did this bank cause the exception? */ -		/* Assume that the bank with uncorrectable errors did it, -		   and that there is only a single one. */ -		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { -			panicm = m; -			panicm_found = 1; -		} -	} - -	/* If we didn't find an uncorrectable error, pick -	   the last one (shouldn't happen, just being safe). */ -	if (!panicm_found) -		panicm = m; - -	/* -	 * If we have decided that we just CAN'T continue, and the user -	 *  has not set tolerant to an insane level, give up and die. -	 */ -	if (no_way_out && tolerant < 3) -		mce_panic("Machine check", &panicm, mcestart); - -	/* -	 * If the error seems to be unrecoverable, something should be -	 * done.  Try to kill as little as possible.  If we can kill just -	 * one task, do that.  If the user has set the tolerance very -	 * high, don't try to do anything at all. -	 */ -	if (kill_it && tolerant < 3) { -		int user_space = 0; - -		/* -		 * If the EIPV bit is set, it means the saved IP is the -		 * instruction which caused the MCE. -		 */ -		if (m.mcgstatus & MCG_STATUS_EIPV) -			user_space = panicm.ip && (panicm.cs & 3); - -		/* -		 * If we know that the error was in user space, send a -		 * SIGBUS.  Otherwise, panic if tolerance is low. -		 * -		 * force_sig() takes an awful lot of locks and has a slight -		 * risk of deadlocking. 
-		 */ -		if (user_space) { -			force_sig(SIGBUS, current); -		} else if (panic_on_oops || tolerant < 2) { -			mce_panic("Uncorrected machine check", -				&panicm, mcestart); -		} -	} - -	/* notify userspace ASAP */ -	set_thread_flag(TIF_MCE_NOTIFY); - -	/* the last thing we do is clear state */ -	for (i = 0; i < banks; i++) { -		if (test_bit(i, toclear)) -			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); -	} -	wrmsrl(MSR_IA32_MCG_STATUS, 0); - out2: -	atomic_dec(&mce_entry); -} - -#ifdef CONFIG_X86_MCE_INTEL -/*** - * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occurred. - * @status: Event status information - * - * This function should be called by the thermal interrupt after the - * event has been processed and the decision was made to log the event - * further. - * - * The status parameter will be saved to the 'status' field of 'struct mce' - * and historically has been the register value of the - * MSR_IA32_THERMAL_STATUS (Intel) msr. - */ -void mce_log_therm_throt_event(__u64 status) -{ -	struct mce m; - -	mce_setup(&m); -	m.bank = MCE_THERMAL_BANK; -	m.status = status; -	mce_log(&m); -} -#endif /* CONFIG_X86_MCE_INTEL */ - -/* - * Periodic polling timer for "silent" machine check errors.  If the - * poller finds an MCE, poll 2x faster.  When the poller finds no more - * errors, poll 2x slower (up to check_interval seconds). - */ - -static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ -static void mcheck_timer(unsigned long); -static DEFINE_PER_CPU(struct timer_list, mce_timer); - -static void mcheck_timer(unsigned long data) -{ -	struct timer_list *t = &per_cpu(mce_timer, data); -	int *n; - -	WARN_ON(smp_processor_id() != data); - -	if (mce_available(¤t_cpu_data)) -		machine_check_poll(MCP_TIMESTAMP, -				&__get_cpu_var(mce_poll_banks)); - -	/* -	 * Alert userspace if needed.  If we logged an MCE, reduce the -	 * polling interval, otherwise increase the polling interval. -	 */ -	n = &__get_cpu_var(next_interval); -	if (mce_notify_user()) { -		*n = max(*n/2, HZ/100); -	} else { -		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); -	} - -	t->expires = jiffies + *n; -	add_timer(t); -} - -static void mce_do_trigger(struct work_struct *work) -{ -	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); -} - -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -int mce_notify_user(void) -{ -	/* Not more than two messages every minute */ -	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - -	clear_thread_flag(TIF_MCE_NOTIFY); -	if (test_and_clear_bit(0, ¬ify_user)) { -		wake_up_interruptible(&mce_wait); - -		/* -		 * There is no risk of missing notifications because -		 * work_pending is always cleared before the function is -		 * executed. 
-		 */ -		if (trigger[0] && !work_pending(&mce_trigger_work)) -			schedule_work(&mce_trigger_work); - -		if (__ratelimit(&ratelimit)) -			printk(KERN_INFO "Machine check events logged\n"); - -		return 1; -	} -	return 0; -} - -/* see if the idle task needs to notify userspace */ -static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) -{ -	/* IDLE_END should be safe - interrupts are back on */ -	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) -		mce_notify_user(); - -	return NOTIFY_OK; -} - -static struct notifier_block mce_idle_notifier = { -	.notifier_call = mce_idle_callback, -}; - -static __init int periodic_mcheck_init(void) -{ -       idle_notifier_register(&mce_idle_notifier); -       return 0; -} -__initcall(periodic_mcheck_init); - -/* - * Initialize Machine Checks for a CPU. - */ -static int mce_cap_init(void) -{ -	u64 cap; -	unsigned b; - -	rdmsrl(MSR_IA32_MCG_CAP, cap); -	b = cap & 0xff; -	if (b > MAX_NR_BANKS) { -		printk(KERN_WARNING -		       "MCE: Using only %u machine check banks out of %u\n", -			MAX_NR_BANKS, b); -		b = MAX_NR_BANKS; -	} - -	/* Don't support asymmetric configurations today */ -	WARN_ON(banks != 0 && b != banks); -	banks = b; -	if (!bank) { -		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); -		if (!bank) -			return -ENOMEM; -		memset(bank, 0xff, banks * sizeof(u64)); -	} - -	/* Use accurate RIP reporting if available. */ -	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) -		rip_msr = MSR_IA32_MCG_EIP; - -	return 0; -} - -static void mce_init(void *dummy) -{ -	u64 cap; -	int i; -	mce_banks_t all_banks; - -	/* -	 * Log the machine checks left over from the previous reset. -	 */ -	bitmap_fill(all_banks, MAX_NR_BANKS); -	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); - -	set_in_cr4(X86_CR4_MCE); - -	rdmsrl(MSR_IA32_MCG_CAP, cap); -	if (cap & MCG_CTL_P) -		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - -	for (i = 0; i < banks; i++) { -		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); -		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); -	} -} - -/* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) -{ -	/* This should be disabled by the BIOS, but isn't always */ -	if (c->x86_vendor == X86_VENDOR_AMD) { -		if (c->x86 == 15 && banks > 4) -			/* disable GART TBL walk error reporting, which trips off -			   incorrectly with the IOMMU & 3ware & Cerberus. */ -			clear_bit(10, (unsigned long *)&bank[4]); -		if(c->x86 <= 17 && mce_bootlog < 0) -			/* Lots of broken BIOS around that don't clear them -			   by default and leave crap in there. Don't log. */ -			mce_bootlog = 0; -	} - -} - -static void mce_cpu_features(struct cpuinfo_x86 *c) -{ -	switch (c->x86_vendor) { -	case X86_VENDOR_INTEL: -		mce_intel_feature_init(c); -		break; -	case X86_VENDOR_AMD: -		mce_amd_feature_init(c); -		break; -	default: -		break; -	} -} - -static void mce_init_timer(void) -{ -	struct timer_list *t = &__get_cpu_var(mce_timer); -	int *n = &__get_cpu_var(next_interval); - -	*n = check_interval * HZ; -	if (!*n) -		return; -	setup_timer(t, mcheck_timer, smp_processor_id()); -	t->expires = round_jiffies(jiffies + *n); -	add_timer(t); -} - -/* - * Called for each booted CPU to set up machine checks. - * Must be called with preempt off. 
- */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) -{ -	if (!mce_available(c)) -		return; - -	if (mce_cap_init() < 0) { -		mce_dont_init = 1; -		return; -	} -	mce_cpu_quirks(c); - -	mce_init(NULL); -	mce_cpu_features(c); -	mce_init_timer(); -} - -/* - * Character device to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_state_lock); -static int open_count;	/* #times opened */ -static int open_exclu;	/* already open exclusive? */ - -static int mce_open(struct inode *inode, struct file *file) -{ -	lock_kernel(); -	spin_lock(&mce_state_lock); - -	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { -		spin_unlock(&mce_state_lock); -		unlock_kernel(); -		return -EBUSY; -	} - -	if (file->f_flags & O_EXCL) -		open_exclu = 1; -	open_count++; - -	spin_unlock(&mce_state_lock); -	unlock_kernel(); - -	return nonseekable_open(inode, file); -} - -static int mce_release(struct inode *inode, struct file *file) -{ -	spin_lock(&mce_state_lock); - -	open_count--; -	open_exclu = 0; - -	spin_unlock(&mce_state_lock); - -	return 0; -} - -static void collect_tscs(void *data) -{ -	unsigned long *cpu_tsc = (unsigned long *)data; - -	rdtscll(cpu_tsc[smp_processor_id()]); -} - -static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, -			loff_t *off) -{ -	unsigned long *cpu_tsc; -	static DEFINE_MUTEX(mce_read_mutex); -	unsigned prev, next; -	char __user *buf = ubuf; -	int i, err; - -	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); -	if (!cpu_tsc) -		return -ENOMEM; - -	mutex_lock(&mce_read_mutex); -	next = rcu_dereference(mcelog.next); - -	/* Only supports full reads right now */ -	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { -		mutex_unlock(&mce_read_mutex); -		kfree(cpu_tsc); -		return -EINVAL; -	} - -	err = 0; -	prev = 0; -	do { -		for (i = prev; i < next; i++) { -			unsigned long start = jiffies; - -			while (!mcelog.entry[i].finished) { -				if (time_after_eq(jiffies, start + 2)) { -					memset(mcelog.entry + i, 0, -					       sizeof(struct mce)); -					goto timeout; -				} -				cpu_relax(); -			} -			smp_rmb(); -			err |= copy_to_user(buf, mcelog.entry + i, -					    sizeof(struct mce)); -			buf += sizeof(struct mce); -timeout: -			; -		} - -		memset(mcelog.entry + prev, 0, -		       (next - prev) * sizeof(struct mce)); -		prev = next; -		next = cmpxchg(&mcelog.next, prev, 0); -	} while (next != prev); - -	synchronize_sched(); - -	/* -	 * Collect entries that were still getting written before the -	 * synchronize. -	 */ -	on_each_cpu(collect_tscs, cpu_tsc, 1); -	for (i = next; i < MCE_LOG_LEN; i++) { -		if (mcelog.entry[i].finished && -		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { -			err |= copy_to_user(buf, mcelog.entry+i, -					    sizeof(struct mce)); -			smp_rmb(); -			buf += sizeof(struct mce); -			memset(&mcelog.entry[i], 0, sizeof(struct mce)); -		} -	} -	mutex_unlock(&mce_read_mutex); -	kfree(cpu_tsc); -	return err ? 
-EFAULT : buf - ubuf; -} - -static unsigned int mce_poll(struct file *file, poll_table *wait) -{ -	poll_wait(file, &mce_wait, wait); -	if (rcu_dereference(mcelog.next)) -		return POLLIN | POLLRDNORM; -	return 0; -} - -static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ -	int __user *p = (int __user *)arg; - -	if (!capable(CAP_SYS_ADMIN)) -		return -EPERM; -	switch (cmd) { -	case MCE_GET_RECORD_LEN: -		return put_user(sizeof(struct mce), p); -	case MCE_GET_LOG_LEN: -		return put_user(MCE_LOG_LEN, p); -	case MCE_GETCLEAR_FLAGS: { -		unsigned flags; - -		do { -			flags = mcelog.flags; -		} while (cmpxchg(&mcelog.flags, flags, 0) != flags); -		return put_user(flags, p); -	} -	default: -		return -ENOTTY; -	} -} - -static const struct file_operations mce_chrdev_ops = { -	.open = mce_open, -	.release = mce_release, -	.read = mce_read, -	.poll = mce_poll, -	.unlocked_ioctl = mce_ioctl, -}; - -static struct miscdevice mce_log_device = { -	MISC_MCELOG_MINOR, -	"mcelog", -	&mce_chrdev_ops, -}; - -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) -{ -	mce_dont_init = 1; -	return 1; -} - -/* mce=off disables machine check. -   mce=TOLERANCELEVEL (number, see above) -   mce=bootlog Log MCEs from before booting. Disabled by default on AMD. -   mce=nobootlog Don't log MCEs from before booting. */ -static int __init mcheck_enable(char *str) -{ -	if (!strcmp(str, "off")) -		mce_dont_init = 1; -	else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) -		mce_bootlog = str[0] == 'b'; -	else if (isdigit(str[0])) -		get_option(&str, &tolerant); -	else -		printk("mce= argument %s ignored. Please use /sys", str); -	return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce=", mcheck_enable); - -/* - * Sysfs support - */ - -/* - * Disable machine checks on suspend and shutdown. We can't really handle - * them later. - */ -static int mce_disable(void) -{ -	int i; - -	for (i = 0; i < banks; i++) -		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); -	return 0; -} - -static int mce_suspend(struct sys_device *dev, pm_message_t state) -{ -	return mce_disable(); -} - -static int mce_shutdown(struct sys_device *dev) -{ -	return mce_disable(); -} - -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. -   Only one CPU is active at this time, the others get readded later using -   CPU hotplug. */ -static int mce_resume(struct sys_device *dev) -{ -	mce_init(NULL); -	mce_cpu_features(¤t_cpu_data); -	return 0; -} - -static void mce_cpu_restart(void *data) -{ -	del_timer_sync(&__get_cpu_var(mce_timer)); -	if (mce_available(¤t_cpu_data)) -		mce_init(NULL); -	mce_init_timer(); -} - -/* Reinit MCEs after user configuration changes */ -static void mce_restart(void) -{ -	on_each_cpu(mce_cpu_restart, NULL, 1); -} - -static struct sysdev_class mce_sysclass = { -	.suspend = mce_suspend, -	.shutdown = mce_shutdown, -	.resume = mce_resume, -	.name = "machinecheck", -}; - -DEFINE_PER_CPU(struct sys_device, device_mce); -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; - -/* Why are there no generic functions for this? 
*/ -#define ACCESSOR(name, var, start) \ -	static ssize_t show_ ## name(struct sys_device *s,		\ -				     struct sysdev_attribute *attr,	\ -				     char *buf) {			\ -		return sprintf(buf, "%lx\n", (unsigned long)var);	\ -	}								\ -	static ssize_t set_ ## name(struct sys_device *s,		\ -				    struct sysdev_attribute *attr,	\ -				    const char *buf, size_t siz) {	\ -		char *end;						\ -		unsigned long new = simple_strtoul(buf, &end, 0);	\ -		if (end == buf) return -EINVAL;				\ -		var = new;						\ -		start;							\ -		return end-buf;						\ -	}								\ -	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); - -static struct sysdev_attribute *bank_attrs; - -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, -			 char *buf) -{ -	u64 b = bank[attr - bank_attrs]; -	return sprintf(buf, "%llx\n", b); -} - -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, -			const char *buf, size_t siz) -{ -	char *end; -	u64 new = simple_strtoull(buf, &end, 0); -	if (end == buf) -		return -EINVAL; -	bank[attr - bank_attrs] = new; -	mce_restart(); -	return end-buf; -} - -static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, -				char *buf) -{ -	strcpy(buf, trigger); -	strcat(buf, "\n"); -	return strlen(trigger) + 1; -} - -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, -				const char *buf,size_t siz) -{ -	char *p; -	int len; -	strncpy(trigger, buf, sizeof(trigger)); -	trigger[sizeof(trigger)-1] = 0; -	len = strlen(trigger); -	p = strchr(trigger, '\n'); -	if (*p) *p = 0; -	return len; -} - -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -ACCESSOR(check_interval,check_interval,mce_restart()) -static struct sysdev_attribute *mce_attributes[] = { -	&attr_tolerant.attr, &attr_check_interval, &attr_trigger, -	NULL -}; - -static cpumask_var_t mce_device_initialized; - -/* Per cpu sysdev init.  
All of the cpus still share the same ctl bank */ -static __cpuinit int mce_create_device(unsigned int cpu) -{ -	int err; -	int i; - -	if (!mce_available(&boot_cpu_data)) -		return -EIO; - -	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); -	per_cpu(device_mce,cpu).id = cpu; -	per_cpu(device_mce,cpu).cls = &mce_sysclass; - -	err = sysdev_register(&per_cpu(device_mce,cpu)); -	if (err) -		return err; - -	for (i = 0; mce_attributes[i]; i++) { -		err = sysdev_create_file(&per_cpu(device_mce,cpu), -					 mce_attributes[i]); -		if (err) -			goto error; -	} -	for (i = 0; i < banks; i++) { -		err = sysdev_create_file(&per_cpu(device_mce, cpu), -					&bank_attrs[i]); -		if (err) -			goto error2; -	} -	cpumask_set_cpu(cpu, mce_device_initialized); - -	return 0; -error2: -	while (--i >= 0) { -		sysdev_remove_file(&per_cpu(device_mce, cpu), -					&bank_attrs[i]); -	} -error: -	while (--i >= 0) { -		sysdev_remove_file(&per_cpu(device_mce,cpu), -				   mce_attributes[i]); -	} -	sysdev_unregister(&per_cpu(device_mce,cpu)); - -	return err; -} - -static __cpuinit void mce_remove_device(unsigned int cpu) -{ -	int i; - -	if (!cpumask_test_cpu(cpu, mce_device_initialized)) -		return; - -	for (i = 0; mce_attributes[i]; i++) -		sysdev_remove_file(&per_cpu(device_mce,cpu), -			mce_attributes[i]); -	for (i = 0; i < banks; i++) -		sysdev_remove_file(&per_cpu(device_mce, cpu), -			&bank_attrs[i]); -	sysdev_unregister(&per_cpu(device_mce,cpu)); -	cpumask_clear_cpu(cpu, mce_device_initialized); -} - -/* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) -{ -	int i; -	unsigned long action = *(unsigned long *)h; - -	if (!mce_available(¤t_cpu_data)) -		return; -	if (!(action & CPU_TASKS_FROZEN)) -		cmci_clear(); -	for (i = 0; i < banks; i++) -		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); -} - -static void mce_reenable_cpu(void *h) -{ -	int i; -	unsigned long action = *(unsigned long *)h; - -	if (!mce_available(¤t_cpu_data)) -		return; -	if (!(action & CPU_TASKS_FROZEN)) -		cmci_reenable(); -	for (i = 0; i < banks; i++) -		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); -} - -/* Get notified when a cpu comes on/off. Be hotplug friendly. 
*/ -static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, -				      unsigned long action, void *hcpu) -{ -	unsigned int cpu = (unsigned long)hcpu; -	struct timer_list *t = &per_cpu(mce_timer, cpu); - -	switch (action) { -	case CPU_ONLINE: -	case CPU_ONLINE_FROZEN: -		mce_create_device(cpu); -		if (threshold_cpu_callback) -			threshold_cpu_callback(action, cpu); -		break; -	case CPU_DEAD: -	case CPU_DEAD_FROZEN: -		if (threshold_cpu_callback) -			threshold_cpu_callback(action, cpu); -		mce_remove_device(cpu); -		break; -	case CPU_DOWN_PREPARE: -	case CPU_DOWN_PREPARE_FROZEN: -		del_timer_sync(t); -		smp_call_function_single(cpu, mce_disable_cpu, &action, 1); -		break; -	case CPU_DOWN_FAILED: -	case CPU_DOWN_FAILED_FROZEN: -		t->expires = round_jiffies(jiffies + -						__get_cpu_var(next_interval)); -		add_timer_on(t, cpu); -		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); -		break; -	case CPU_POST_DEAD: -		/* intentionally ignoring frozen here */ -		cmci_rediscover(cpu); -		break; -	} -	return NOTIFY_OK; -} - -static struct notifier_block mce_cpu_notifier __cpuinitdata = { -	.notifier_call = mce_cpu_callback, -}; - -static __init int mce_init_banks(void) -{ -	int i; - -	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, -				GFP_KERNEL); -	if (!bank_attrs) -		return -ENOMEM; - -	for (i = 0; i < banks; i++) { -		struct sysdev_attribute *a = &bank_attrs[i]; -		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); -		if (!a->attr.name) -			goto nomem; -		a->attr.mode = 0644; -		a->show = show_bank; -		a->store = set_bank; -	} -	return 0; - -nomem: -	while (--i >= 0) -		kfree(bank_attrs[i].attr.name); -	kfree(bank_attrs); -	bank_attrs = NULL; -	return -ENOMEM; -} - -static __init int mce_init_device(void) -{ -	int err; -	int i = 0; - -	if (!mce_available(&boot_cpu_data)) -		return -EIO; - -	alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); - -	err = mce_init_banks(); -	if (err) -		return err; - -	err = sysdev_class_register(&mce_sysclass); -	if (err) -		return err; - -	for_each_online_cpu(i) { -		err = mce_create_device(i); -		if (err) -			return err; -	} - -	register_hotcpu_notifier(&mce_cpu_notifier); -	misc_register(&mce_log_device); -	return err; -} - -device_initcall(mce_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 56dde9c4bc9..ddae21620bd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -13,22 +13,22 @@   *   *  All MC4_MISCi registers are shared between multi-cores   */ - -#include <linux/cpu.h> -#include <linux/errno.h> -#include <linux/init.h>  #include <linux/interrupt.h> -#include <linux/kobject.h>  #include <linux/notifier.h> -#include <linux/sched.h> -#include <linux/smp.h> +#include <linux/kobject.h> +#include <linux/percpu.h>  #include <linux/sysdev.h> +#include <linux/errno.h> +#include <linux/sched.h>  #include <linux/sysfs.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/smp.h> +  #include <asm/apic.h> +#include <asm/idle.h>  #include <asm/mce.h>  #include <asm/msr.h> -#include <asm/percpu.h> -#include <asm/idle.h>  #define PFX               "mce_threshold: "  #define VERSION           "version 1.1.1" @@ -48,26 +48,26 @@  #define MCG_XBLK_ADDR     0xC0000400  struct threshold_block { -	unsigned int block; -	unsigned int bank; -	unsigned int cpu; -	u32 address; -	u16 interrupt_enable; -	u16 threshold_limit; -	struct kobject kobj; -	struct list_head miscj; +	unsigned int		block; +	unsigned int		bank; +	
unsigned int		cpu; +	u32			address; +	u16			interrupt_enable; +	u16			threshold_limit; +	struct kobject		kobj; +	struct list_head	miscj;  };  /* defaults used early on boot */  static struct threshold_block threshold_defaults = { -	.interrupt_enable = 0, -	.threshold_limit = THRESHOLD_MAX, +	.interrupt_enable	= 0, +	.threshold_limit	= THRESHOLD_MAX,  };  struct threshold_bank { -	struct kobject *kobj; -	struct threshold_block *blocks; -	cpumask_var_t cpus; +	struct kobject		*kobj; +	struct threshold_block	*blocks; +	cpumask_var_t		cpus;  };  static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); @@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);   */  struct thresh_restart { -	struct threshold_block *b; -	int reset; -	u16 old_limit; +	struct threshold_block	*b; +	int			reset; +	u16			old_limit;  };  /* must be called with correct cpu affinity */ @@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)  	} else if (tr->old_limit) {	/* change limit w/o reset */  		int new_count = (mci_misc_hi & THRESHOLD_MAX) +  		    (tr->old_limit - tr->b->threshold_limit); +  		mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |  		    (new_count & THRESHOLD_MAX);  	} @@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)  /* cpu init entry point, called from mce.c with preempt off */  void mce_amd_feature_init(struct cpuinfo_x86 *c)  { -	unsigned int bank, block;  	unsigned int cpu = smp_processor_id(); -	u8 lvt_off;  	u32 low = 0, high = 0, address = 0; +	unsigned int bank, block;  	struct thresh_restart tr; +	u8 lvt_off;  	for (bank = 0; bank < NR_BANKS; ++bank) {  		for (block = 0; block < NR_BLOCKS; ++block) { @@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)  				if (!address)  					break;  				address += MCG_XBLK_ADDR; -			} -			else +			} else  				++address;  			if (rdmsr_safe(address, &low, &high)) @@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)   */  static void amd_threshold_interrupt(void)  { +	u32 low = 0, high = 0, address = 0;  	unsigned int bank, block;  	struct mce m; -	u32 low = 0, high = 0, address = 0;  	mce_setup(&m); @@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)  		if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))  			continue;  		for (block = 0; block < NR_BLOCKS; ++block) { -			if (block == 0) +			if (block == 0) {  				address = MSR_IA32_MC0_MISC + bank * 4; -			else if (block == 1) { +			} else if (block == 1) {  				address = (low & MASK_BLKPTR_LO) >> 21;  				if (!address)  					break;  				address += MCG_XBLK_ADDR; -			} -			else +			} else {  				++address; +			}  			if (rdmsr_safe(address, &low, &high))  				break; @@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)  			     (high & MASK_LOCKED_HI))  				continue; -			/* Log the machine check that caused the threshold -			   event. */ +			/* +			 * Log the machine check that caused the threshold +			 * event. 
+			 */  			machine_check_poll(MCP_TIMESTAMP,  					&__get_cpu_var(mce_poll_banks)); @@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void)  struct threshold_attr {  	struct attribute attr; -	ssize_t(*show) (struct threshold_block *, char *); -	ssize_t(*store) (struct threshold_block *, const char *, size_t count); +	ssize_t (*show) (struct threshold_block *, char *); +	ssize_t (*store) (struct threshold_block *, const char *, size_t count);  }; -#define SHOW_FIELDS(name)                                           \ -static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ -{                                                                   \ -        return sprintf(buf, "%lx\n", (unsigned long) b->name);      \ +#define SHOW_FIELDS(name)						\ +static ssize_t show_ ## name(struct threshold_block *b, char *buf)	\ +{									\ +	return sprintf(buf, "%lx\n", (unsigned long) b->name);		\  }  SHOW_FIELDS(interrupt_enable)  SHOW_FIELDS(threshold_limit) -static ssize_t store_interrupt_enable(struct threshold_block *b, -				      const char *buf, size_t count) +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)  { -	char *end;  	struct thresh_restart tr; -	unsigned long new = simple_strtoul(buf, &end, 0); -	if (end == buf) +	unsigned long new; + +	if (strict_strtoul(buf, 0, &new) < 0)  		return -EINVAL; +  	b->interrupt_enable = !!new; -	tr.b = b; -	tr.reset = 0; -	tr.old_limit = 0; +	tr.b		= b; +	tr.reset	= 0; +	tr.old_limit	= 0; +  	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); -	return end - buf; +	return size;  } -static ssize_t store_threshold_limit(struct threshold_block *b, -				     const char *buf, size_t count) +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)  { -	char *end;  	struct thresh_restart tr; -	unsigned long new = simple_strtoul(buf, &end, 0); -	if (end == buf) +	unsigned long new; + +	if (strict_strtoul(buf, 0, &new) < 0)  		return -EINVAL; +  	if (new > THRESHOLD_MAX)  		new = THRESHOLD_MAX;  	if (new < 1)  		new = 1; +  	tr.old_limit = b->threshold_limit;  	b->threshold_limit = new;  	tr.b = b; @@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b,  	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); -	return end - buf; +	return size;  }  struct threshold_block_cross_cpu { -	struct threshold_block *tb; -	long retval; +	struct threshold_block	*tb; +	long			retval;  };  static void local_error_count_handler(void *_tbcc) @@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,  	return 1;  } -#define THRESHOLD_ATTR(_name,_mode,_show,_store) {            \ -        .attr = {.name = __stringify(_name), .mode = _mode }, \ -        .show = _show,                                        \ -        .store = _store,                                      \ +#define RW_ATTR(val)							\ +static struct threshold_attr val = {					\ +	.attr	= {.name = __stringify(val), .mode = 0644 },		\ +	.show	= show_## val,						\ +	.store	= store_## val,						\  }; -#define RW_ATTR(name)                                           \ -static struct threshold_attr name =                             \ -        THRESHOLD_ATTR(name, 0644, show_## name, store_## name) -  RW_ATTR(interrupt_enable);  RW_ATTR(threshold_limit);  RW_ATTR(error_count); @@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = {  	NULL  }; -#define to_block(k) container_of(k, struct threshold_block, kobj) -#define to_attr(a) 
container_of(a, struct threshold_attr, attr) +#define to_block(k)	container_of(k, struct threshold_block, kobj) +#define to_attr(a)	container_of(a, struct threshold_attr, attr)  static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)  {  	struct threshold_block *b = to_block(kobj);  	struct threshold_attr *a = to_attr(attr);  	ssize_t ret; +  	ret = a->show ? a->show(b, buf) : -EIO; +  	return ret;  } @@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,  	struct threshold_block *b = to_block(kobj);  	struct threshold_attr *a = to_attr(attr);  	ssize_t ret; +  	ret = a->store ? a->store(b, buf, count) : -EIO; +  	return ret;  }  static struct sysfs_ops threshold_ops = { -	.show = show, -	.store = store, +	.show			= show, +	.store			= store,  };  static struct kobj_type threshold_ktype = { -	.sysfs_ops = &threshold_ops, -	.default_attrs = default_attrs, +	.sysfs_ops		= &threshold_ops, +	.default_attrs		= default_attrs,  };  static __cpuinit int allocate_threshold_blocks(unsigned int cpu, @@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,  					       unsigned int block,  					       u32 address)  { -	int err; -	u32 low, high;  	struct threshold_block *b = NULL; +	u32 low, high; +	int err;  	if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))  		return 0; @@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,  	if (!b)  		return -ENOMEM; -	b->block = block; -	b->bank = bank; -	b->cpu = cpu; -	b->address = address; -	b->interrupt_enable = 0; -	b->threshold_limit = THRESHOLD_MAX; +	b->block		= block; +	b->bank			= bank; +	b->cpu			= cpu; +	b->address		= address; +	b->interrupt_enable	= 0; +	b->threshold_limit	= THRESHOLD_MAX;  	INIT_LIST_HEAD(&b->miscj); -	if (per_cpu(threshold_banks, cpu)[bank]->blocks) +	if (per_cpu(threshold_banks, cpu)[bank]->blocks) {  		list_add(&b->miscj,  			 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); -	else +	} else {  		per_cpu(threshold_banks, cpu)[bank]->blocks = b; +	}  	err = kobject_init_and_add(&b->kobj, &threshold_ktype,  				   per_cpu(threshold_banks, cpu)[bank]->kobj, @@ -447,8 +455,9 @@ recurse:  		if (!address)  			return 0;  		address += MCG_XBLK_ADDR; -	} else +	} else {  		++address; +	}  	err = allocate_threshold_blocks(cpu, bank, ++block, address);  	if (err) @@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		if (!b)  			goto out; -		err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, +		err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,  					b->kobj, name);  		if (err)  			goto out;  		cpumask_copy(b->cpus, cpu_core_mask(cpu));  		per_cpu(threshold_banks, cpu)[bank] = b; +  		goto out;  	}  #endif @@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		goto out;  	} -	b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); +	b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);  	if (!b->kobj)  		goto out_free; @@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		if (i == cpu)  			continue; -		err = sysfs_create_link(&per_cpu(device_mce, i).kobj, +		err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,  					b->kobj, name);  		if (err)  			goto out; @@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu,  static void threshold_remove_bank(unsigned int cpu, int bank)  { -	int i = 0;  	struct threshold_bank *b;  	char 
name[32]; +	int i = 0;  	b = per_cpu(threshold_banks, cpu)[bank]; -  	if (!b)  		return; -  	if (!b->blocks)  		goto free_out; @@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  #ifdef CONFIG_SMP  	/* sibling symlink */  	if (shared_bank[bank] && b->blocks->cpu != cpu) { -		sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); +		sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);  		per_cpu(threshold_banks, cpu)[bank] = NULL; +  		return;  	}  #endif @@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  		if (i == cpu)  			continue; -		sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); +		sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);  		per_cpu(threshold_banks, i)[bank] = NULL;  	} @@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu)  }  /* get notified when a cpu comes on/off */ -static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, -						     unsigned int cpu) +static void __cpuinit +amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)  { -	if (cpu >= NR_CPUS) -		return; -  	switch (action) {  	case CPU_ONLINE:  	case CPU_ONLINE_FROZEN: @@ -686,11 +692,12 @@ static __init int threshold_init_device(void)  	/* to hit CPUs online before the notifier is up */  	for_each_online_cpu(lcpu) {  		int err = threshold_create_device(lcpu); +  		if (err)  			return err;  	}  	threshold_cpu_callback = amd_64_threshold_cpu_callback; +  	return 0;  } -  device_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index cef3ee30744..e1acec0f7a3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,85 +8,10 @@  #include <linux/init.h>  #include <linux/interrupt.h>  #include <linux/percpu.h> -#include <asm/processor.h>  #include <asm/apic.h> +#include <asm/processor.h>  #include <asm/msr.h>  #include <asm/mce.h> -#include <asm/hw_irq.h> -#include <asm/idle.h> -#include <asm/therm_throt.h> -#include <asm/apic.h> - -asmlinkage void smp_thermal_interrupt(void) -{ -	__u64 msr_val; - -	ack_APIC_irq(); - -	exit_idle(); -	irq_enter(); - -	rdmsrl(MSR_IA32_THERM_STATUS, msr_val); -	if (therm_throt_process(msr_val & 1)) -		mce_log_therm_throt_event(msr_val); - -	inc_irq_stat(irq_thermal_count); -	irq_exit(); -} - -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ -	u32 l, h; -	int tm2 = 0; -	unsigned int cpu = smp_processor_id(); - -	if (!cpu_has(c, X86_FEATURE_ACPI)) -		return; - -	if (!cpu_has(c, X86_FEATURE_ACC)) -		return; - -	/* first check if TM1 is already enabled by the BIOS, in which -	 * case there might be some SMM goo which handles it, so we can't even -	 * put a handler since it might be delivered via SMI already. 
-	 */ -	rdmsr(MSR_IA32_MISC_ENABLE, l, h); -	h = apic_read(APIC_LVTTHMR); -	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { -		printk(KERN_DEBUG -		       "CPU%d: Thermal monitoring handled by SMI\n", cpu); -		return; -	} - -	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) -		tm2 = 1; - -	if (h & APIC_VECTOR_MASK) { -		printk(KERN_DEBUG -		       "CPU%d: Thermal LVT vector (%#x) already " -		       "installed\n", cpu, (h & APIC_VECTOR_MASK)); -		return; -	} - -	h = THERMAL_APIC_VECTOR; -	h |= (APIC_DM_FIXED | APIC_LVT_MASKED); -	apic_write(APIC_LVTTHMR, h); - -	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); -	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - -	rdmsr(MSR_IA32_MISC_ENABLE, l, h); -	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - -	l = apic_read(APIC_LVTTHMR); -	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); -	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", -		cpu, tm2 ? "TM2" : "TM1"); - -	/* enable thermal throttle processing */ -	atomic_set(&therm_throt_en, 1); -	return; -}  /*   * Support for Intel Correct Machine Check Interrupts. This allows @@ -109,6 +34,9 @@ static int cmci_supported(int *banks)  {  	u64 cap; +	if (mce_cmci_disabled || mce_ignore_ce) +		return 0; +  	/*  	 * Vendor check is not strictly needed, but the initial  	 * initialization is vendor keyed and this @@ -132,7 +60,7 @@ static int cmci_supported(int *banks)  static void intel_threshold_interrupt(void)  {  	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); -	mce_notify_user(); +	mce_notify_irq();  }  static void print_update(char *type, int *hdr, int num) @@ -248,7 +176,7 @@ void cmci_rediscover(int dying)  		return;  	cpumask_copy(old, ¤t->cpus_allowed); -	for_each_online_cpu (cpu) { +	for_each_online_cpu(cpu) {  		if (cpu == dying)  			continue;  		if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index a74af128efc..f5f2d6f71fb 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -6,25 +6,23 @@   * This file contains routines to check for non-fatal MCEs every 15s   *   */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/jiffies.h> -#include <linux/workqueue.h>  #include <linux/interrupt.h> -#include <linux/smp.h> +#include <linux/workqueue.h> +#include <linux/jiffies.h> +#include <linux/kernel.h>  #include <linux/module.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/smp.h>  #include <asm/processor.h>  #include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include "mce.h" +static int		firstbank; -static int firstbank; - -#define MCE_RATE	15*HZ	/* timer rate is 15s */ +#define MCE_RATE	(15*HZ)	/* timer rate is 15s */  static void mce_checkregs(void *info)  { @@ -34,23 +32,24 @@ static void mce_checkregs(void *info)  	for (i = firstbank; i < nr_mce_banks; i++) {  		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); -		if (high & (1<<31)) { -			printk(KERN_INFO "MCE: The hardware reports a non " -				"fatal, correctable incident occurred on " -				"CPU %d.\n", +		if (!(high & (1<<31))) +			continue; + +		printk(KERN_INFO "MCE: The hardware reports a non fatal, " +			"correctable incident occurred on CPU %d.\n",  				smp_processor_id()); -			printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); -			/* -			 * Scrub the error so we don't pick it up in MCE_RATE -			 * seconds time. 
-			 */ -			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); +		printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); -			/* Serialize */ -			wmb(); -			add_taint(TAINT_MACHINE_CHECK); -		} +		/* +		 * Scrub the error so we don't pick it up in MCE_RATE +		 * seconds time: +		 */ +		wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + +		/* Serialize: */ +		wmb(); +		add_taint(TAINT_MACHINE_CHECK);  	}  } @@ -77,16 +76,17 @@ static int __init init_nonfatal_mce_checker(void)  	/* Some Athlons misbehave when we frob bank 0 */  	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && -		boot_cpu_data.x86 == 6) -			firstbank = 1; +						boot_cpu_data.x86 == 6) +		firstbank = 1;  	else -			firstbank = 0; +		firstbank = 0;  	/*  	 * Check for non-fatal errors every MCE_RATE s  	 */  	schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));  	printk(KERN_INFO "Machine check exception polling timer started.\n"); +  	return 0;  }  module_init(init_nonfatal_mce_checker); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f53bdcbaf38..4482aea9aa2 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -1,21 +1,14 @@  /*   * P4 specific Machine Check Exception Reporting   */ - -#include <linux/init.h> -#include <linux/types.h>  #include <linux/kernel.h> -#include <linux/interrupt.h> +#include <linux/types.h> +#include <linux/init.h>  #include <linux/smp.h>  #include <asm/processor.h> -#include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include <asm/apic.h> - -#include <asm/therm_throt.h> - -#include "mce.h"  /* as supported by the P4/Xeon family */  struct intel_mce_extended_msrs { @@ -34,98 +27,8 @@ struct intel_mce_extended_msrs {  static int mce_num_extended_msrs; - -#ifdef CONFIG_X86_MCE_P4THERMAL -static void unexpected_thermal_interrupt(struct pt_regs *regs) -{ -	printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", -			smp_processor_id()); -	add_taint(TAINT_MACHINE_CHECK); -} - -/* P4/Xeon Thermal transition interrupt handler */ -static void intel_thermal_interrupt(struct pt_regs *regs) -{ -	__u64 msr_val; - -	ack_APIC_irq(); - -	rdmsrl(MSR_IA32_THERM_STATUS, msr_val); -	therm_throt_process(msr_val & 0x1); -} - -/* Thermal interrupt handler for this CPU setup */ -static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; - -void smp_thermal_interrupt(struct pt_regs *regs) -{ -	irq_enter(); -	vendor_thermal_interrupt(regs); -	__get_cpu_var(irq_stat).irq_thermal_count++; -	irq_exit(); -} - -/* P4/Xeon Thermal regulation detect and init */ -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ -	u32 l, h; -	unsigned int cpu = smp_processor_id(); - -	/* Thermal monitoring */ -	if (!cpu_has(c, X86_FEATURE_ACPI)) -		return;	/* -ENODEV */ - -	/* Clock modulation */ -	if (!cpu_has(c, X86_FEATURE_ACC)) -		return;	/* -ENODEV */ - -	/* first check if its enabled already, in which case there might -	 * be some SMM goo which handles it, so we can't even put a handler -	 * since it might be delivered via SMI already -zwanem. -	 */ -	rdmsr(MSR_IA32_MISC_ENABLE, l, h); -	h = apic_read(APIC_LVTTHMR); -	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { -		printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", -				cpu); -		return; /* -EBUSY */ -	} - -	/* check whether a vector already exists, temporarily masked? 
*/ -	if (h & APIC_VECTOR_MASK) { -		printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " -				"installed\n", -			cpu, (h & APIC_VECTOR_MASK)); -		return; /* -EBUSY */ -	} - -	/* The temperature transition interrupt handler setup */ -	h = THERMAL_APIC_VECTOR;		/* our delivery vector */ -	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */ -	apic_write(APIC_LVTTHMR, h); - -	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); -	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); - -	/* ok we're good to go... */ -	vendor_thermal_interrupt = intel_thermal_interrupt; - -	rdmsr(MSR_IA32_MISC_ENABLE, l, h); -	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - -	l = apic_read(APIC_LVTTHMR); -	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); -	printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); - -	/* enable thermal throttle processing */ -	atomic_set(&therm_throt_en, 1); -	return; -} -#endif /* CONFIG_X86_MCE_P4THERMAL */ - -  /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)  {  	u32 h; @@ -143,9 +46,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)  static void intel_machine_check(struct pt_regs *regs, long error_code)  { -	int recover = 1;  	u32 alow, ahigh, high, low;  	u32 mcgstl, mcgsth; +	int recover = 1;  	int i;  	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -157,7 +60,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  	if (mce_num_extended_msrs > 0) {  		struct intel_mce_extended_msrs dbg; +  		intel_get_extended_msrs(&dbg); +  		printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"  			"\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"  			"\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", @@ -171,6 +76,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  		if (high & (1<<31)) {  			char misc[20];  			char addr[24]; +  			misc[0] = addr[0] = '\0';  			if (high & (1<<29))  				recover |= 1; @@ -196,6 +102,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  		panic("Unable to continue");  	printk(KERN_EMERG "Attempting to continue.\n"); +  	/*  	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not  	 * recoverable/continuable.This will allow BIOS to look at the MSRs @@ -217,7 +124,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);  } -  void intel_p4_mcheck_init(struct cpuinfo_x86 *c)  {  	u32 l, h; diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index c9f77ea69ed..5c0e6533d9b 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -2,52 +2,67 @@   * P5 specific Machine Check Exception Reporting   * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>   */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h>  #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h>  #include <linux/smp.h>  #include <asm/processor.h>  #include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include "mce.h" +/* By default disabled */ +int mce_p5_enabled __read_mostly; -/* Machine check handler for Pentium class Intel */ +/* Machine check handler for Pentium class Intel CPUs: */  static void pentium_machine_check(struct pt_regs *regs, long error_code)  {  	u32 loaddr, hi, lotype; +  	
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);  	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); -	printk(KERN_EMERG "CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); -	if (lotype&(1<<5)) -		printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); + +	printk(KERN_EMERG +		"CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n", +		smp_processor_id(), loaddr, lotype); + +	if (lotype & (1<<5)) { +		printk(KERN_EMERG +			"CPU#%d: Possible thermal failure (CPU on fire ?).\n", +			smp_processor_id()); +	} +  	add_taint(TAINT_MACHINE_CHECK);  } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */  void intel_p5_mcheck_init(struct cpuinfo_x86 *c)  {  	u32 l, h; -	/*Check for MCE support */ -	if (!cpu_has(c, X86_FEATURE_MCE)) +	/* Default P5 to off as its often misconnected: */ +	if (!mce_p5_enabled)  		return; -	/* Default P5 to off as its often misconnected */ -	if (mce_disabled != -1) +	/* Check for MCE support: */ +	if (!cpu_has(c, X86_FEATURE_MCE))  		return; +  	machine_check_vector = pentium_machine_check; +	/* Make sure the vector pointer is visible before we enable MCEs: */  	wmb(); -	/* Read registers before enabling */ +	/* Read registers before enabling: */  	rdmsr(MSR_IA32_P5_MC_ADDR, l, h);  	rdmsr(MSR_IA32_P5_MC_TYPE, l, h); -	printk(KERN_INFO "Intel old style machine check architecture supported.\n"); +	printk(KERN_INFO +	       "Intel old style machine check architecture supported.\n"); -	/* Enable MCE */ +	/* Enable MCE: */  	set_in_cr4(X86_CR4_MCE); -	printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); +	printk(KERN_INFO +	       "Intel old style machine check reporting enabled on CPU#%d.\n", +	       smp_processor_id());  } diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 2ac52d7b434..01e4f817818 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -2,25 +2,23 @@   * P6 specific Machine Check Exception Reporting   * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>   */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h>  #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h>  #include <linux/smp.h>  #include <asm/processor.h>  #include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include "mce.h" -  /* Machine Check Handler For PII/PIII */  static void intel_machine_check(struct pt_regs *regs, long error_code)  { -	int recover = 1;  	u32 alow, ahigh, high, low;  	u32 mcgstl, mcgsth; +	int recover = 1;  	int i;  	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -35,12 +33,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  		if (high & (1<<31)) {  			char misc[20];  			char addr[24]; -			misc[0] = addr[0] = '\0'; + +			misc[0] = '\0'; +			addr[0] = '\0'; +  			if (high & (1<<29))  				recover |= 1;  			if (high & (1<<25))  				recover |= 2;  			high &= ~(1<<31); +  			if (high & (1<<27)) {  				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);  				snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,6 +51,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);  				snprintf(addr, 24, " at %08x%08x", ahigh, alow);  			} +  			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",  				smp_processor_id(), i, high, low, 
misc, addr);  		} @@ -63,16 +66,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  	/*  	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not  	 * recoverable/continuable.This will allow BIOS to look at the MSRs -	 * for errors if the OS could not log the error. +	 * for errors if the OS could not log the error:  	 */  	for (i = 0; i < nr_mce_banks; i++) {  		unsigned int msr; +  		msr = MSR_IA32_MC0_STATUS+i*4;  		rdmsr(msr, low, high);  		if (high & (1<<31)) { -			/* Clear it */ +			/* Clear it: */  			wrmsr(msr, 0UL, 0UL); -			/* Serialize */ +			/* Serialize: */  			wmb();  			add_taint(TAINT_MACHINE_CHECK);  		} @@ -81,7 +85,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);  } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */  void intel_p6_mcheck_init(struct cpuinfo_x86 *c)  {  	u32 l, h; @@ -97,6 +101,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)  	/* Ok machine check is available */  	machine_check_vector = intel_machine_check; +	/* Make sure the vector pointer is visible before we enable MCEs: */  	wmb();  	printk(KERN_INFO "Intel machine check architecture supported.\n"); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d5ae2243f0b..bff8dd191dd 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -1,7 +1,7 @@  /* - *   * Thermal throttle event support code (such as syslog messaging and rate   * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + *   * This allows consistent reporting of CPU thermal throttle events.   *   * Maintains a counter in /sys that keeps track of the number of thermal @@ -13,43 +13,53 @@   * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.   *          Inspired by Ross Biro's and Al Borchers' counter code.   
*/ - +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/jiffies.h> +#include <linux/kernel.h>  #include <linux/percpu.h>  #include <linux/sysdev.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/smp.h>  #include <linux/cpu.h> -#include <asm/cpu.h> -#include <linux/notifier.h> -#include <linux/jiffies.h> -#include <asm/therm_throt.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/mce.h> +#include <asm/msr.h>  /* How long to wait between reporting thermal events */ -#define CHECK_INTERVAL              (300 * HZ) +#define CHECK_INTERVAL		(300 * HZ)  static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;  static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -atomic_t therm_throt_en = ATOMIC_INIT(0); + +static atomic_t therm_throt_en		= ATOMIC_INIT(0);  #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name)                              \ -        static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) +#define define_therm_throt_sysdev_one_ro(_name)				\ +	static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) -#define define_therm_throt_sysdev_show_func(name)                            \ -static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev,        \ -					struct sysdev_attribute *attr,	     \ -                                              char *buf)                     \ -{                                                                            \ -	unsigned int cpu = dev->id;                                          \ -	ssize_t ret;                                                         \ -                                                                             \ -	preempt_disable();              /* CPU hotplug */                    \ -	if (cpu_online(cpu))                                                 \ -		ret = sprintf(buf, "%lu\n",                                  \ -			      per_cpu(thermal_throttle_##name, cpu));        \ -	else                                                                 \ -		ret = 0;                                                     \ -	preempt_enable();                                                    \ -                                                                             \ -	return ret;                                                          \ +#define define_therm_throt_sysdev_show_func(name)			\ +static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev,	\ +					struct sysdev_attribute *attr,	\ +					      char *buf)		\ +{									\ +	unsigned int cpu = dev->id;					\ +	ssize_t ret;							\ +									\ +	preempt_disable();	/* CPU hotplug */			\ +	if (cpu_online(cpu))						\ +		ret = sprintf(buf, "%lu\n",				\ +			      per_cpu(thermal_throttle_##name, cpu));	\ +	else								\ +		ret = 0;						\ +	preempt_enable();						\ +									\ +	return ret;							\  }  define_therm_throt_sysdev_show_func(count); @@ -61,8 +71,8 @@ static struct attribute *thermal_throttle_attrs[] = {  };  static struct attribute_group thermal_throttle_attr_group = { -	.attrs = thermal_throttle_attrs, -	.name = "thermal_throttle" +	.attrs	= thermal_throttle_attrs, +	.name	= "thermal_throttle"  };  #endif /* CONFIG_SYSFS */ @@ -82,7 +92,7 @@ static struct attribute_group thermal_throttle_attr_group = {   *          1 : Event should be logged further, and a message has been   *              printed to the syslog.   
*/ -int therm_throt_process(int curr) +static int therm_throt_process(int curr)  {  	unsigned int cpu = smp_processor_id();  	__u64 tmp_jiffs = get_jiffies_64(); @@ -110,10 +120,11 @@ int therm_throt_process(int curr)  }  #ifdef CONFIG_SYSFS -/* Add/Remove thermal_throttle interface for CPU device */ +/* Add/Remove thermal_throttle interface for CPU device: */  static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)  { -	return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); +	return sysfs_create_group(&sys_dev->kobj, +				  &thermal_throttle_attr_group);  }  static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) @@ -121,19 +132,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)  	sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);  } -/* Mutex protecting device creation against CPU hotplug */ +/* Mutex protecting device creation against CPU hotplug: */  static DEFINE_MUTEX(therm_cpu_lock);  /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, -						   unsigned long action, -						   void *hcpu) +static __cpuinit int +thermal_throttle_cpu_callback(struct notifier_block *nfb, +			      unsigned long action, +			      void *hcpu)  {  	unsigned int cpu = (unsigned long)hcpu;  	struct sys_device *sys_dev;  	int err = 0;  	sys_dev = get_cpu_sysdev(cpu); +  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_UP_PREPARE_FROZEN: @@ -183,6 +196,94 @@ static __init int thermal_throttle_init_device(void)  	return 0;  } -  device_initcall(thermal_throttle_init_device); +  #endif /* CONFIG_SYSFS */ + +/* Thermal transition interrupt handler */ +static void intel_thermal_interrupt(void) +{ +	__u64 msr_val; + +	rdmsrl(MSR_IA32_THERM_STATUS, msr_val); +	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) +		mce_log_therm_throt_event(msr_val); +} + +static void unexpected_thermal_interrupt(void) +{ +	printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", +			smp_processor_id()); +	add_taint(TAINT_MACHINE_CHECK); +} + +static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; + +asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +{ +	exit_idle(); +	irq_enter(); +	inc_irq_stat(irq_thermal_count); +	smp_thermal_vector(); +	irq_exit(); +	/* Ack only at the end to avoid potential reentry */ +	ack_APIC_irq(); +} + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ +	unsigned int cpu = smp_processor_id(); +	int tm2 = 0; +	u32 l, h; + +	/* Thermal monitoring depends on ACPI and clock modulation*/ +	if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) +		return; + +	/* +	 * First check if its enabled already, in which case there might +	 * be some SMM goo which handles it, so we can't even put a handler +	 * since it might be delivered via SMI already: +	 */ +	rdmsr(MSR_IA32_MISC_ENABLE, l, h); +	h = apic_read(APIC_LVTTHMR); +	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { +		printk(KERN_DEBUG +		       "CPU%d: Thermal monitoring handled by SMI\n", cpu); +		return; +	} + +	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) +		tm2 = 1; + +	/* Check whether a vector already exists */ +	if (h & APIC_VECTOR_MASK) { +		printk(KERN_DEBUG +		       "CPU%d: Thermal LVT vector (%#x) already installed\n", +		       cpu, (h & APIC_VECTOR_MASK)); +		return; +	} + +	/* We'll mask the thermal vector in the lapic till we're ready: */ +	h = THERMAL_APIC_VECTOR | 
APIC_DM_FIXED | APIC_LVT_MASKED; +	apic_write(APIC_LVTTHMR, h); + +	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); +	wrmsr(MSR_IA32_THERM_INTERRUPT, +		l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + +	smp_thermal_vector = intel_thermal_interrupt; + +	rdmsr(MSR_IA32_MISC_ENABLE, l, h); +	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + +	/* Unmask the thermal vector: */ +	l = apic_read(APIC_LVTTHMR); +	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + +	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", +	       cpu, tm2 ? "TM2" : "TM1"); + +	/* enable thermal throttle processing */ +	atomic_set(&therm_throt_en, 1); +} diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 23ee9e730f7..d746df2909c 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)  void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage void mce_threshold_interrupt(void) +asmlinkage void smp_threshold_interrupt(void)  {  	exit_idle();  	irq_enter(); diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 2a043d89811..54060f56597 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -2,19 +2,17 @@   * IDT Winchip specific Machine Check Exception Reporting   * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>   */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h>  #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h>  #include <asm/processor.h>  #include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include "mce.h" - -/* Machine check handler for WinChip C6 */ +/* Machine check handler for WinChip C6: */  static void winchip_machine_check(struct pt_regs *regs, long error_code)  {  	printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); @@ -25,12 +23,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)  void winchip_mcheck_init(struct cpuinfo_x86 *c)  {  	u32 lo, hi; +  	machine_check_vector = winchip_machine_check; +	/* Make sure the vector pointer is visible before we enable MCEs: */  	wmb(); +  	rdmsr(MSR_IDT_FCR1, lo, hi);  	lo |= (1<<2);	/* Enable EIERRINT (int 18 MCE) */  	lo &= ~(1<<4);	/* Enable MCE */  	wrmsr(MSR_IDT_FCR1, lo, hi); +  	set_in_cr4(X86_CR4_MCE); -	printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); + +	printk(KERN_INFO +	       "Winchip machine check reporting enabled on CPU#0.\n");  } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ce0fe4b5c04..1d584a18a50 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)  	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)  		return 0; -	rdmsr(MTRRdefType_MSR, def, dummy); +	rdmsr(MSR_MTRRdefType, def, dummy);  	def &= 0xff;  	if (def != MTRR_TYPE_UNCACHABLE)  		return 0; @@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)  	 */  	if (!is_cpu(INTEL) || disable_mtrr_trim)  		return 0; -	rdmsr(MTRRdefType_MSR, def, dummy); +	rdmsr(MSR_MTRRdefType, def, dummy);  	def &= 0xff;  	if (def != MTRR_TYPE_UNCACHABLE)  		return 0; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d21d4fb161f..0543f69f0b2 100644 --- 
a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -20,9 +20,9 @@ struct fixed_range_block {  };  static struct fixed_range_block fixed_range_blocks[] = { -	{ MTRRfix64K_00000_MSR, 1 }, /* one  64k MTRR  */ -	{ MTRRfix16K_80000_MSR, 2 }, /* two  16k MTRRs */ -	{ MTRRfix4K_C0000_MSR,  8 }, /* eight 4k MTRRs */ +	{ MSR_MTRRfix64K_00000, 1 }, /* one  64k MTRR  */ +	{ MSR_MTRRfix16K_80000, 2 }, /* two  16k MTRRs */ +	{ MSR_MTRRfix4K_C0000,  8 }, /* eight 4k MTRRs */  	{}  }; @@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)  	k8_check_syscfg_dram_mod_en(); -	rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); +	rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);  	for (i = 0; i < 2; i++) -		rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); +		rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);  	for (i = 0; i < 8; i++) -		rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); +		rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);  }  void mtrr_save_fixed_ranges(void *info) @@ -310,7 +310,7 @@ void __init get_mtrr_state(void)  	vrs = mtrr_state.var_ranges; -	rdmsr(MTRRcap_MSR, lo, dummy); +	rdmsr(MSR_MTRRcap, lo, dummy);  	mtrr_state.have_fixed = (lo >> 8) & 1;  	for (i = 0; i < num_var_ranges; i++) @@ -318,7 +318,7 @@ void __init get_mtrr_state(void)  	if (mtrr_state.have_fixed)  		get_fixed_ranges(mtrr_state.fixed_ranges); -	rdmsr(MTRRdefType_MSR, lo, dummy); +	rdmsr(MSR_MTRRdefType, lo, dummy);  	mtrr_state.def_type = (lo & 0xff);  	mtrr_state.enabled = (lo & 0xc00) >> 10; @@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)  	__flush_tlb();  	/*  Save MTRR state */ -	rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); +	rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);  	/*  Disable MTRRs, and set the default type to uncached  */ -	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); +	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);  }  static void post_set(void) __releases(set_atomicity_lock) @@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)  	__flush_tlb();  	/* Intel (P6) standard MTRRs */ -	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); +	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);  	/*  Enable caches  */  	write_cr0(read_cr0() & 0xbfffffff); @@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i  static int generic_have_wrcomb(void)  {  	unsigned long config, dummy; -	rdmsr(MTRRcap_MSR, config, dummy); +	rdmsr(MSR_MTRRcap, config, dummy);  	return (config & (1 << 10));  } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 03cda01f57c..8fc248b5aea 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)  	unsigned long config = 0, dummy;  	if (use_intel()) { -		rdmsr(MTRRcap_MSR, config, dummy); +		rdmsr(MSR_MTRRcap, config, dummy);  	} else if (is_cpu(AMD))  		config = 2;  	else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 77f67f7b347..7538b767f20 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,21 +5,6 @@  #include <linux/types.h>  #include <linux/stddef.h> -#define MTRRcap_MSR     0x0fe -#define MTRRdefType_MSR 0x2ff - -#define MTRRfix64K_00000_MSR 0x250 -#define MTRRfix16K_80000_MSR 0x258 -#define MTRRfix16K_A0000_MSR 0x259 -#define MTRRfix4K_C0000_MSR 0x268 
-#define MTRRfix4K_C8000_MSR 0x269 -#define MTRRfix4K_D0000_MSR 0x26a -#define MTRRfix4K_D8000_MSR 0x26b -#define MTRRfix4K_E0000_MSR 0x26c -#define MTRRfix4K_E8000_MSR 0x26d -#define MTRRfix4K_F0000_MSR 0x26e -#define MTRRfix4K_F8000_MSR 0x26f -  #define MTRR_CHANGE_MASK_FIXED     0x01  #define MTRR_CHANGE_MASK_VARIABLE  0x02  #define MTRR_CHANGE_MASK_DEFTYPE   0x04 diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 7f7e2753685..1f5fb1588d1 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)  		if (use_intel())  			/*  Save MTRR state */ -			rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); +			rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);  		else  			/* Cyrix ARRs - everything else were excluded at the top */  			ctxt->ccr3 = getCx86(CX86_CCR3); @@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)  {  	if (use_intel())  		/*  Disable MTRRs, and set the default type to uncached  */ -		mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, +		mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,  		      ctxt->deftype_hi);  	else if (is_cpu(CYRIX))  		/* Cyrix ARRs - everything else were excluded at the top */ @@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)  		/*  Restore MTRRdefType  */  		if (use_intel())  			/* Intel (P6) standard MTRRs */ -			mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); +			mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);  		else  			/* Cyrix ARRs - everything else was excluded at the top */  			setCx86(CX86_CCR3, ctxt->ccr3); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 00000000000..76dfef23f78 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -0,0 +1,1721 @@ +/* + * Performance counter x86 architecture code + * + *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + *  Copyright (C) 2009 Jaswinder Singh Rajput + *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + *  For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_counter.h> +#include <linux/capability.h> +#include <linux/notifier.h> +#include <linux/hardirq.h> +#include <linux/kprobes.h> +#include <linux/module.h> +#include <linux/kdebug.h> +#include <linux/sched.h> +#include <linux/uaccess.h> +#include <linux/highmem.h> + +#include <asm/apic.h> +#include <asm/stacktrace.h> +#include <asm/nmi.h> + +static u64 perf_counter_mask __read_mostly; + +struct cpu_hw_counters { +	struct perf_counter	*counters[X86_PMC_IDX_MAX]; +	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +	unsigned long		interrupts; +	int			enabled; +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { +	const char	*name; +	int		version; +	int		(*handle_irq)(struct pt_regs *); +	void		(*disable_all)(void); +	void		(*enable_all)(void); +	void		(*enable)(struct hw_perf_counter *, int); +	void		(*disable)(struct hw_perf_counter *, int); +	unsigned	eventsel; +	unsigned	perfctr; +	u64		(*event_map)(int); +	u64		(*raw_event)(u64); +	int		max_events; +	int		num_counters; +	int		num_counters_fixed; +	int		counter_bits; +	u64		counter_mask; +	u64		max_period; +	u64		
intel_ctrl; +}; + +static struct x86_pmu x86_pmu __read_mostly; + +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { +	.enabled = 1, +}; + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static const u64 intel_perfmon_event_map[] = +{ +  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c, +  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0, +  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e, +  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e, +  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4, +  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5, +  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c, +}; + +static u64 intel_pmu_event_map(int event) +{ +	return intel_perfmon_event_map[event]; +} + +/* + * Generalized hw caching related event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'event makes no sense on + * this CPU', any other value means the raw event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */ +		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */ +		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */ +		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */ +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */ +		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */ +		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */ +		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */ +		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */ +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */ +		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */ +		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */ +		[ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c4, /* 
BR_INST_RETIRED.ALL_BRANCHES */ +		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, +}; + +static const u64 core2_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */ +		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */ +		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */ +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */ +		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */ +		[ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */ +		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, +}; + +static const u64 atom_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */ +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */ +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */ +		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */ +	}, +	[ 
C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */ +		[ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */ +		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, +}; + +static u64 intel_pmu_raw_event(u64 event) +{ +#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL +#define CORE_EVNTSEL_INV_MASK		0x00800000ULL +#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL + +#define CORE_EVNTSEL_MASK		\ +	(CORE_EVNTSEL_EVENT_MASK |	\ +	 CORE_EVNTSEL_UNIT_MASK  |	\ +	 CORE_EVNTSEL_EDGE_MASK  |	\ +	 CORE_EVNTSEL_INV_MASK  |	\ +	 CORE_EVNTSEL_COUNTER_MASK) + +	return event & CORE_EVNTSEL_MASK; +} + +static const u64 amd_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */ +		[ C(RESULT_MISS)   ] = 0x0041, /* Data Cache Misses          */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */ +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */ +		[ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */ +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */ +		[ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ +		[ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback       
    */ +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */ +		[ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DLTB Miss   */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */ +		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */ +		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ +  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076, +  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0, +  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080, +  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081, +  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4, +  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5, +}; + +static u64 amd_pmu_event_map(int event) +{ +	return amd_perfmon_event_map[event]; +} + +static u64 amd_pmu_raw_event(u64 event) +{ +#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL +#define K7_EVNTSEL_INV_MASK	0x000800000ULL +#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL + +#define K7_EVNTSEL_MASK			\ +	(K7_EVNTSEL_EVENT_MASK |	\ +	 K7_EVNTSEL_UNIT_MASK  |	\ +	 K7_EVNTSEL_EDGE_MASK  |	\ +	 K7_EVNTSEL_INV_MASK   |	\ +	 K7_EVNTSEL_COUNTER_MASK) + +	return event & K7_EVNTSEL_MASK; +} + +/* + * Propagate counter elapsed time into the generic counter. + * Can only be executed on the CPU where the counter is active. + * Returns the delta events processed. + */ +static u64 +x86_perf_counter_update(struct perf_counter *counter, +			struct hw_perf_counter *hwc, int idx) +{ +	int shift = 64 - x86_pmu.counter_bits; +	u64 prev_raw_count, new_raw_count; +	s64 delta; + +	/* +	 * Careful: an NMI might modify the previous counter value. +	 * +	 * Our tactic to handle this is to first atomically read and +	 * exchange a new raw count - then add that new-prev delta +	 * count to the generic counter atomically: +	 */ +again: +	prev_raw_count = atomic64_read(&hwc->prev_count); +	rdmsrl(hwc->counter_base + idx, new_raw_count); + +	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, +					new_raw_count) != prev_raw_count) +		goto again; + +	/* +	 * Now we have the new raw value and have updated the prev +	 * timestamp already. We can now calculate the elapsed delta +	 * (counter-)time and add that to the generic counter. +	 * +	 * Careful, not all hw sign-extends above the physical width +	 * of the count. 
+	 */ +	delta = (new_raw_count << shift) - (prev_raw_count << shift); +	delta >>= shift; + +	atomic64_add(delta, &counter->count); +	atomic64_sub(delta, &hwc->period_left); + +	return new_raw_count; +} + +static atomic_t active_counters; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ +	int i; + +	if (nmi_watchdog == NMI_LOCAL_APIC) +		disable_lapic_nmi_watchdog(); + +	for (i = 0; i < x86_pmu.num_counters; i++) { +		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) +			goto perfctr_fail; +	} + +	for (i = 0; i < x86_pmu.num_counters; i++) { +		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) +			goto eventsel_fail; +	} + +	return true; + +eventsel_fail: +	for (i--; i >= 0; i--) +		release_evntsel_nmi(x86_pmu.eventsel + i); + +	i = x86_pmu.num_counters; + +perfctr_fail: +	for (i--; i >= 0; i--) +		release_perfctr_nmi(x86_pmu.perfctr + i); + +	if (nmi_watchdog == NMI_LOCAL_APIC) +		enable_lapic_nmi_watchdog(); + +	return false; +} + +static void release_pmc_hardware(void) +{ +	int i; + +	for (i = 0; i < x86_pmu.num_counters; i++) { +		release_perfctr_nmi(x86_pmu.perfctr + i); +		release_evntsel_nmi(x86_pmu.eventsel + i); +	} + +	if (nmi_watchdog == NMI_LOCAL_APIC) +		enable_lapic_nmi_watchdog(); +} + +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ +	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { +		release_pmc_hardware(); +		mutex_unlock(&pmc_reserve_mutex); +	} +} + +static inline int x86_pmu_initialized(void) +{ +	return x86_pmu.handle_irq != NULL; +} + +static inline int +set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +{ +	unsigned int cache_type, cache_op, cache_result; +	u64 config, val; + +	config = attr->config; + +	cache_type = (config >>  0) & 0xff; +	if (cache_type >= PERF_COUNT_HW_CACHE_MAX) +		return -EINVAL; + +	cache_op = (config >>  8) & 0xff; +	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) +		return -EINVAL; + +	cache_result = (config >> 16) & 0xff; +	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) +		return -EINVAL; + +	val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + +	if (val == 0) +		return -ENOENT; + +	if (val == -1) +		return -EINVAL; + +	hwc->config |= val; + +	return 0; +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __hw_perf_counter_init(struct perf_counter *counter) +{ +	struct perf_counter_attr *attr = &counter->attr; +	struct hw_perf_counter *hwc = &counter->hw; +	int err; + +	if (!x86_pmu_initialized()) +		return -ENODEV; + +	err = 0; +	if (!atomic_inc_not_zero(&active_counters)) { +		mutex_lock(&pmc_reserve_mutex); +		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) +			err = -EBUSY; +		else +			atomic_inc(&active_counters); +		mutex_unlock(&pmc_reserve_mutex); +	} +	if (err) +		return err; + +	/* +	 * Generate PMC IRQs: +	 * (keep 'enabled' bit clear for now) +	 */ +	hwc->config = ARCH_PERFMON_EVENTSEL_INT; + +	/* +	 * Count user and OS events unless requested not to. 
+	 */ +	if (!attr->exclude_user) +		hwc->config |= ARCH_PERFMON_EVENTSEL_USR; +	if (!attr->exclude_kernel) +		hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + +	if (!hwc->sample_period) { +		hwc->sample_period = x86_pmu.max_period; +		hwc->last_period = hwc->sample_period; +		atomic64_set(&hwc->period_left, hwc->sample_period); +	} + +	counter->destroy = hw_perf_counter_destroy; + +	/* +	 * Raw event type provide the config in the event structure +	 */ +	if (attr->type == PERF_TYPE_RAW) { +		hwc->config |= x86_pmu.raw_event(attr->config); +		return 0; +	} + +	if (attr->type == PERF_TYPE_HW_CACHE) +		return set_ext_hw_attr(hwc, attr); + +	if (attr->config >= x86_pmu.max_events) +		return -EINVAL; +	/* +	 * The generic map: +	 */ +	hwc->config |= x86_pmu.event_map(attr->config); + +	return 0; +} + +static void intel_pmu_disable_all(void) +{ +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +} + +static void amd_pmu_disable_all(void) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	int idx; + +	if (!cpuc->enabled) +		return; + +	cpuc->enabled = 0; +	/* +	 * ensure we write the disable before we start disabling the +	 * counters proper, so that amd_pmu_enable_counter() does the +	 * right thing. +	 */ +	barrier(); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		u64 val; + +		if (!test_bit(idx, cpuc->active_mask)) +			continue; +		rdmsrl(MSR_K7_EVNTSEL0 + idx, val); +		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) +			continue; +		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; +		wrmsrl(MSR_K7_EVNTSEL0 + idx, val); +	} +} + +void hw_perf_disable(void) +{ +	if (!x86_pmu_initialized()) +		return; +	return x86_pmu.disable_all(); +} + +static void intel_pmu_enable_all(void) +{ +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); +} + +static void amd_pmu_enable_all(void) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	int idx; + +	if (cpuc->enabled) +		return; + +	cpuc->enabled = 1; +	barrier(); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		u64 val; + +		if (!test_bit(idx, cpuc->active_mask)) +			continue; +		rdmsrl(MSR_K7_EVNTSEL0 + idx, val); +		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) +			continue; +		val |= ARCH_PERFMON_EVENTSEL0_ENABLE; +		wrmsrl(MSR_K7_EVNTSEL0 + idx, val); +	} +} + +void hw_perf_enable(void) +{ +	if (!x86_pmu_initialized()) +		return; +	x86_pmu.enable_all(); +} + +static inline u64 intel_pmu_get_status(void) +{ +	u64 status; + +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + +	return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ +	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ +	int err; +	err = checking_wrmsrl(hwc->config_base + idx, +			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ +	int err; +	err = checking_wrmsrl(hwc->config_base + idx, +			      hwc->config); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) +{ +	int idx = __idx - X86_PMC_IDX_FIXED; +	u64 ctrl_val, mask; +	int err; + +	mask = 0xfULL << (idx * 4); + +	rdmsrl(hwc->config_base, ctrl_val); +	ctrl_val &= ~mask; +	err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { +		intel_pmu_disable_fixed(hwc, idx); +		return; +	} + +	x86_pmu_disable_counter(hwc, idx); +} + 
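The delta computation in x86_perf_counter_update() above shifts both raw counter values up to full 64-bit width before subtracting, then arithmetic-shifts the signed result back down, so hardware counters narrower than 64 bits (or counters that do not sign-extend) still produce a correct signed delta across a wrap. The following is a minimal stand-alone sketch of that width handling, not part of this patch; the counter_delta() helper name, the 48-bit width and the sample values are purely illustrative:

#include <stdint.h>
#include <stdio.h>

/*
 * Stand-alone, user-space illustration of the shift trick used by
 * x86_perf_counter_update(); 48 stands in for x86_pmu.counter_bits
 * and is only an example value.
 */
static int64_t counter_delta(uint64_t prev_raw, uint64_t new_raw, int counter_bits)
{
	int shift = 64 - counter_bits;
	int64_t delta;

	/*
	 * Shift both raw values up to full 64-bit width, subtract in
	 * unsigned arithmetic, then arithmetic-shift the signed result
	 * back down so a wrapped counter still yields a small delta:
	 */
	delta = (int64_t)((new_raw << shift) - (prev_raw << shift));
	delta >>= shift;

	return delta;
}

int main(void)
{
	uint64_t prev = 0xfffffffffff0ULL;	/* 2^48 - 16, just before the wrap */
	uint64_t new  = 0x000000000010ULL;	/* read again after the counter wrapped */

	printf("delta = %lld\n", (long long)counter_delta(prev, new, 48));
	return 0;
}

Compiled as ordinary user-space C this prints delta = 32, matching the 32 events that elapsed across the 48-bit wrap.
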
+static inline void +amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ +	x86_pmu_disable_counter(hwc, idx); +} + +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the counter disabled in hw: + */ +static int +x86_perf_counter_set_period(struct perf_counter *counter, +			     struct hw_perf_counter *hwc, int idx) +{ +	s64 left = atomic64_read(&hwc->period_left); +	s64 period = hwc->sample_period; +	int err, ret = 0; + +	/* +	 * If we are way outside a reasoable range then just skip forward: +	 */ +	if (unlikely(left <= -period)) { +		left = period; +		atomic64_set(&hwc->period_left, left); +		hwc->last_period = period; +		ret = 1; +	} + +	if (unlikely(left <= 0)) { +		left += period; +		atomic64_set(&hwc->period_left, left); +		hwc->last_period = period; +		ret = 1; +	} +	/* +	 * Quirk: certain CPUs dont like it if just 1 event is left: +	 */ +	if (unlikely(left < 2)) +		left = 2; + +	if (left > x86_pmu.max_period) +		left = x86_pmu.max_period; + +	per_cpu(prev_left[idx], smp_processor_id()) = left; + +	/* +	 * The hw counter starts counting from this counter offset, +	 * mark it to be able to extra future deltas: +	 */ +	atomic64_set(&hwc->prev_count, (u64)-left); + +	err = checking_wrmsrl(hwc->counter_base + idx, +			     (u64)(-left) & x86_pmu.counter_mask); + +	return ret; +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) +{ +	int idx = __idx - X86_PMC_IDX_FIXED; +	u64 ctrl_val, bits, mask; +	int err; + +	/* +	 * Enable IRQ generation (0x8), +	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1) +	 * if requested: +	 */ +	bits = 0x8ULL; +	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) +		bits |= 0x2; +	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) +		bits |= 0x1; +	bits <<= (idx * 4); +	mask = 0xfULL << (idx * 4); + +	rdmsrl(hwc->config_base, ctrl_val); +	ctrl_val &= ~mask; +	ctrl_val |= bits; +	err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { +		intel_pmu_enable_fixed(hwc, idx); +		return; +	} + +	x86_pmu_enable_counter(hwc, idx); +} + +static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + +	if (cpuc->enabled) +		x86_pmu_enable_counter(hwc, idx); +	else +		x86_pmu_disable_counter(hwc, idx); +} + +static int +fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) +{ +	unsigned int event; + +	if (!x86_pmu.num_counters_fixed) +		return -1; + +	/* +	 * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: +	 */ +	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && +					boot_cpu_data.x86_model == 28) +		return -1; + +	event = hwc->config & ARCH_PERFMON_EVENT_MASK; + +	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) +		return X86_PMC_IDX_FIXED_INSTRUCTIONS; +	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) +		return X86_PMC_IDX_FIXED_CPU_CYCLES; +	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) +		return X86_PMC_IDX_FIXED_BUS_CYCLES; + +	return -1; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ +static int x86_pmu_enable(struct perf_counter *counter) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct hw_perf_counter *hwc = &counter->hw; +	int 
idx; + +	idx = fixed_mode_idx(counter, hwc); +	if (idx >= 0) { +		/* +		 * Try to get the fixed counter, if that is already taken +		 * then try to get a generic counter: +		 */ +		if (test_and_set_bit(idx, cpuc->used_mask)) +			goto try_generic; + +		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; +		/* +		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to +		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: +		 */ +		hwc->counter_base = +			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; +		hwc->idx = idx; +	} else { +		idx = hwc->idx; +		/* Try to get the previous generic counter again */ +		if (test_and_set_bit(idx, cpuc->used_mask)) { +try_generic: +			idx = find_first_zero_bit(cpuc->used_mask, +						  x86_pmu.num_counters); +			if (idx == x86_pmu.num_counters) +				return -EAGAIN; + +			set_bit(idx, cpuc->used_mask); +			hwc->idx = idx; +		} +		hwc->config_base  = x86_pmu.eventsel; +		hwc->counter_base = x86_pmu.perfctr; +	} + +	perf_counters_lapic_init(); + +	x86_pmu.disable(hwc, idx); + +	cpuc->counters[idx] = counter; +	set_bit(idx, cpuc->active_mask); + +	x86_perf_counter_set_period(counter, hwc, idx); +	x86_pmu.enable(hwc, idx); + +	return 0; +} + +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct hw_perf_counter *hwc = &counter->hw; + +	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || +				cpuc->counters[hwc->idx] != counter)) +		return; + +	x86_pmu.enable(hwc, hwc->idx); +} + +void perf_counter_print_debug(void) +{ +	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; +	struct cpu_hw_counters *cpuc; +	unsigned long flags; +	int cpu, idx; + +	if (!x86_pmu.num_counters) +		return; + +	local_irq_save(flags); + +	cpu = smp_processor_id(); +	cpuc = &per_cpu(cpu_hw_counters, cpu); + +	if (x86_pmu.version >= 2) { +		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); +		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); +		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); +		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + +		pr_info("\n"); +		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl); +		pr_info("CPU#%d: status:     %016llx\n", cpu, status); +		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow); +		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed); +	} +	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); +		rdmsrl(x86_pmu.perfctr  + idx, pmc_count); + +		prev_left = per_cpu(prev_left[idx], cpu); + +		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n", +			cpu, idx, pmc_ctrl); +		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n", +			cpu, idx, pmc_count); +		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n", +			cpu, idx, prev_left); +	} +	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { +		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + +		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", +			cpu, idx, pmc_count); +	} +	local_irq_restore(flags); +} + +static void x86_pmu_disable(struct perf_counter *counter) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct hw_perf_counter *hwc = &counter->hw; +	int idx = hwc->idx; + +	/* +	 * Must be done before we disable, otherwise the nmi handler +	 * could reenable again: +	 */ +	clear_bit(idx, cpuc->active_mask); +	x86_pmu.disable(hwc, idx); + +	/* +	 * Make sure the cleared pointer becomes visible before we +	 * (potentially) free the counter: +	 */ +	barrier(); + +	/* +	 * Drain 
the remaining delta count out of a counter
+	 * that we are disabling:
+	 */
+	x86_perf_counter_update(counter, hwc, idx);
+	cpuc->counters[idx] = NULL;
+	clear_bit(idx, cpuc->used_mask);
+}
+
+/*
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
+ */
+static int intel_pmu_save_and_restart(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx = hwc->idx;
+	int ret;
+
+	x86_perf_counter_update(counter, hwc, idx);
+	ret = x86_perf_counter_set_period(counter, hwc, idx);
+
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+		intel_pmu_enable_counter(hwc, idx);
+
+	return ret;
+}
+
+static void intel_pmu_reset(void)
+{
+	unsigned long flags;
+	int idx;
+
+	if (!x86_pmu.num_counters)
+		return;
+
+	local_irq_save(flags);
+
+	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
+	}
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+	}
+
+	local_irq_restore(flags);
+}
+
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_counters *cpuc;
+	int bit, cpu, loops;
+	u64 ack, status;
+
+	data.regs = regs;
+	data.addr = 0;
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+	perf_disable();
+	status = intel_pmu_get_status();
+	if (!status) {
+		perf_enable();
+		return 0;
+	}
+
+	loops = 0;
+again:
+	if (++loops > 100) {
+		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+		perf_counter_print_debug();
+		intel_pmu_reset();
+		perf_enable();
+		return 1;
+	}
+
+	inc_irq_stat(apic_perf_irqs);
+	ack = status;
+	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+		struct perf_counter *counter = cpuc->counters[bit];
+
+		clear_bit(bit, (unsigned long *) &status);
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+
+		if (!intel_pmu_save_and_restart(counter))
+			continue;
+
+		data.period = counter->hw.last_period;
+
+		if (perf_counter_overflow(counter, 1, &data))
+			intel_pmu_disable_counter(&counter->hw, bit);
+	}
+
+	intel_pmu_ack_status(ack);
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	status = intel_pmu_get_status();
+	if (status)
+		goto again;
+
+	perf_enable();
+
+	return 1;
+}
+
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_counters *cpuc;
+	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
+	int cpu, idx, handled = 0;
+	u64 val;
+
+	data.regs = regs;
+	data.addr = 0;
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		counter = cpuc->counters[idx];
+		hwc = &counter->hw;
+
+		val = x86_perf_counter_update(counter, hwc, idx);
+		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+			continue;
+
+		/*
+		 * counter overflow
+		 */
+		handled		= 1;
+		data.period	= counter->hw.last_period;
+
+		if (!x86_perf_counter_set_period(counter, hwc, idx))
+			continue;
+
+		if (perf_counter_overflow(counter, 1, &data))
+			amd_pmu_disable_counter(hwc, idx);
+	}
+
+	if (handled)
+		inc_irq_stat(apic_perf_irqs);
+
+	return handled;
+}
+
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+	ack_APIC_irq();
+	inc_irq_stat(apic_pending_irqs);
+	perf_counter_do_pending();
+	irq_exit();
+}
+
+void set_perf_counter_pending(void)
+{
+	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+}
+
+void perf_counters_lapic_init(void)
+{
+	if (!x86_pmu_initialized())
+		return;
+
+	/*
+	 * Always use NMI for PMU
+	 */
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int __kprobes
+perf_counter_nmi_handler(struct notifier_block *self,
+			 unsigned long cmd, void *__args)
+{
+	struct die_args *args = __args;
+	struct pt_regs *regs;
+
+	if (!atomic_read(&active_counters))
+		return NOTIFY_DONE;
+
+	switch (cmd) {
+	case DIE_NMI:
+	case DIE_NMI_IPI:
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	regs = args->regs;
+
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	/*
+	 * Can't rely on the handled return value to say it was our NMI, two
+	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
+	 *
+	 * If the first NMI handles both, the latter will be empty and daze
+	 * the CPU.
+	 */
+	x86_pmu.handle_irq(regs);
+
+	return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+	.notifier_call		= perf_counter_nmi_handler,
+	.next			= NULL,
+	.priority		= 1
+};
+
+static struct x86_pmu intel_pmu = {
+	.name			= "Intel",
+	.handle_irq		= intel_pmu_handle_irq,
+	.disable_all		= intel_pmu_disable_all,
+	.enable_all		= intel_pmu_enable_all,
+	.enable			= intel_pmu_enable_counter,
+	.disable		= intel_pmu_disable_counter,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= intel_pmu_event_map,
+	.raw_event		= intel_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+	/*
+	 * Intel PMCs cannot be accessed sanely above 32 bit width,
+	 * so we install an artificial 1<<31 period regardless of
+	 * the generic counter period:
+	 */
+	.max_period		= (1ULL << 31) - 1,
+};
+
+static struct x86_pmu amd_pmu = {
+	.name			= "AMD",
+	.handle_irq		= amd_pmu_handle_irq,
+	.disable_all		= amd_pmu_disable_all,
+	.enable_all		= amd_pmu_enable_all,
+	.enable			= amd_pmu_enable_counter,
+	.disable		= amd_pmu_disable_counter,
+	.eventsel		= MSR_K7_EVNTSEL0,
+	.perfctr		= MSR_K7_PERFCTR0,
+	.event_map		= amd_pmu_event_map,
+	.raw_event		= amd_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+	.num_counters		= 4,
+	.counter_bits		= 48,
+	.counter_mask		= (1ULL << 48) - 1,
+	/* use highest bit to detect overflow */
+	.max_period		= (1ULL << 47) - 1,
+};
+
+static int intel_pmu_init(void)
+{
+	union cpuid10_edx edx;
+	union cpuid10_eax eax;
+	unsigned int unused;
+	unsigned int ebx;
+	int version;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return -ENODEV;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Branch Misses Retired Event or not.
+	 */
+	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
+	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+		return -ENODEV;
+
+	version = eax.split.version_id;
+	if (version < 2)
+		return -ENODEV;
+
+	x86_pmu				= intel_pmu;
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.counter_bits		= eax.split.bit_width;
+	x86_pmu.counter_mask		= (1ULL << eax.split.bit_width) - 1;
+
+	/*
+	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
+	 * assume at least 3 counters:
+	 */
+	x86_pmu.num_counters_fixed	= max((int)edx.split.num_counters_fixed, 3);
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+	/*
+	 * Install the hw-cache-events table:
+	 */
+	switch (boot_cpu_data.x86_model) {
+	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+	case 29: /* six-core 45 nm xeon "Dunnington" */
+		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("Core2 events, ");
+		break;
+	default:
+	case 26:
+		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("Nehalem/Corei7 events, ");
+		break;
+	case 28:
+		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("Atom events, ");
+		break;
+	}
+	return 0;
+}
+
+static int amd_pmu_init(void)
+{
+	/* Performance-monitoring supported from K7 and later: */
+	if (boot_cpu_data.x86 < 6)
+		return -ENODEV;
+
+	x86_pmu = amd_pmu;
+
+	/* Events are common for all AMDs */
+	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+	       sizeof(hw_cache_event_ids));
+
+	return 0;
+}
+
+void __init init_hw_perf_counters(void)
+{
+	int err;
+
+	pr_info("Performance Counters: ");
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+		err = intel_pmu_init();
+		break;
+	case X86_VENDOR_AMD:
+		err = amd_pmu_init();
+		break;
+	default:
+		return;
+	}
+	if (err != 0) {
+		pr_cont("no PMU driver, software counters only.\n");
+		return;
+	}
+
+	pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
+		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
+		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
+		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+	}
+	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
+	perf_max_counters = x86_pmu.num_counters;
+
+	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
+		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
+		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
+		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+	}
+
+	perf_counter_mask |=
+		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+
+	perf_counters_lapic_init();
+	register_die_notifier(&perf_counter_nmi_notifier);
+
+	pr_info("... version:                 %d\n",     x86_pmu.version);
+	pr_info("... bit width:               %d\n",     x86_pmu.counter_bits);
+	pr_info("... generic counters:        %d\n",     x86_pmu.num_counters);
+	pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask);
+	pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
+	pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed);
+	pr_info("... counter mask:            %016Lx\n", perf_counter_mask);
+}
+
+static inline void x86_pmu_read(struct perf_counter *counter)
+{
+	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
+static const struct pmu pmu = {
+	.enable		= x86_pmu_enable,
+	.disable	= x86_pmu_disable,
+	.read		= x86_pmu_read,
+	.unthrottle	= x86_pmu_unthrottle,
+};
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+	int err;
+
+	err = __hw_perf_counter_init(counter);
+	if (err)
+		return ERR_PTR(err);
+
+	return &pmu;
+}
+
+/*
+ * callchain support
+ */
+
+static inline
+void callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+	if (entry->nr < PERF_MAX_STACK_DEPTH)
+		entry->ip[entry->nr++] = ip;
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	/* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+	/* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+	/* Process all stacks: */
+	return 0;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+	struct perf_callchain_entry *entry = data;
+
+	if (reliable)
+		callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+	.warning		= backtrace_warning,
+	.warning_symbol		= backtrace_warning_symbol,
+	.stack			= backtrace_stack,
+	.address		= backtrace_address,
+};
+
+#include "../dumpstack.h"
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	callchain_store(entry, PERF_CONTEXT_KERNEL);
+	callchain_store(entry, regs->ip);
+
+	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+}
+
+/*
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+	unsigned long offset, addr = (unsigned long)from;
+	int type = in_nmi() ? KM_NMI : KM_IRQ0;
+	unsigned long size, len = 0;
+	struct page *page;
+	void *map;
+	int ret;
+
+	do {
+		ret = __get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret)
+			break;
+
+		offset = addr & (PAGE_SIZE - 1);
+		size = min(PAGE_SIZE - offset, n - len);
+
+		map = kmap_atomic(page, type);
+		memcpy(to, map+offset, size);
+		kunmap_atomic(map, type);
+		put_page(page);
+
+		len  += size;
+		to   += size;
+		addr += size;
+
+	} while (len < n);
+
+	return len;
+}
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	unsigned long bytes;
+
+	bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
+
+	return bytes == sizeof(*frame);
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	struct stack_frame frame;
+	const void __user *fp;
+
+	if (!user_mode(regs))
+		regs = task_pt_regs(current);
+
+	fp = (void __user *)regs->bp;
+
+	callchain_store(entry, PERF_CONTEXT_USER);
+	callchain_store(entry, regs->ip);
+
+	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+		frame.next_frame	     = NULL;
+		frame.return_address = 0;
+
+		if (!copy_stack_frame(fp, &frame))
+			break;
+
+		if ((unsigned long)fp < regs->sp)
+			break;
+
+		callchain_store(entry, frame.return_address);
+		fp = frame.next_frame;
+	}
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	int is_user;
+
+	if (!regs)
+		return;
+
+	is_user = user_mode(regs);
+
+	if (!current || current->pid == 0)
+		return;
+
+	if (is_user && current->state != TASK_RUNNING)
+		return;
+
+	if (!is_user)
+		perf_callchain_kernel(regs, entry);
+
+	if (current->mm)
+		perf_callchain_user(regs, entry);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	struct perf_callchain_entry *entry;
+
+	if (in_nmi())
+		entry = &__get_cpu_var(nmi_entry);
+	else
+		entry = &__get_cpu_var(irq_entry);
+
+	entry->nr = 0;
+
+	perf_do_callchain(regs, entry);
+
+	return entry;
+}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e3..5c481f6205b 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
 #include <linux/nmi.h>
 #include <linux/kprobes.h>
 
-#include <asm/genapic.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/apic.h>
+#include <asm/perf_counter.h>
 
 struct nmi_watchdog_ctlblk {
 	unsigned int cccr_msr;
@@ -716,11 +716,15 @@ static void probe_nmi_watchdog(void)
 		wd_ops = &k7_wd_ops;
 		break;
 	case X86_VENDOR_INTEL:
-		/*
-		 * Work around Core Duo (Yonah) errata AE49 where perfctr1
-		 * doesn't have a working enable bit.
+		/* Work around where perfctr1 doesn't have a working enable
+		 * bit as described in the following errata:
+		 * AE49 Core Duo and Intel Core Solo 65 nm
+		 * AN49 Intel Pentium Dual-Core
+		 * AF49 Dual-Core Intel Xeon Processor LV
 		 */
-		if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
+		if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
+		    ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
+		     boot_cpu_data.x86_mask == 4))) {
 			intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
 			intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
 		}
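
Two pieces of arithmetic in the perf_counter.c hunk above are easy to misread: init_hw_perf_counters() assembles perf_counter_mask from the generic counters in the low bits plus the fixed-purpose counters shifted up to the fixed index, and amd_pmu_handle_irq() treats a still-set top bit of the 48-bit counter value as "no overflow yet". The stand-alone C sketch below reproduces that arithmetic in user space; it is illustrative only, it assumes X86_PMC_IDX_FIXED is 32 and picks 4 generic / 3 fixed counters as example values, and SKETCH_PMC_IDX_FIXED is a made-up name rather than a kernel symbol.

/*
 * Minimal user-space sketch of the mask assembly and overflow test used
 * above. Assumes X86_PMC_IDX_FIXED == 32, 4 generic and 3 fixed counters,
 * and a 48-bit wide AMD counter -- illustrative values, not kernel code.
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_PMC_IDX_FIXED	32	/* assumed value of X86_PMC_IDX_FIXED */

int main(void)
{
	int num_counters = 4, num_counters_fixed = 3, counter_bits = 48;
	uint64_t mask, val;

	/* Generic counters occupy the low bits of the mask... */
	mask  = (1ULL << num_counters) - 1;
	/* ...fixed-purpose counters start at the (assumed) fixed index. */
	mask |= ((1ULL << num_counters_fixed) - 1) << SKETCH_PMC_IDX_FIXED;
	printf("counter mask: %#llx\n", (unsigned long long)mask);	/* 0x70000000f */

	/*
	 * AMD-style overflow test: a counter programmed with a period starts
	 * near the top of its 48-bit range and counts up; while the top bit
	 * is still set it has not wrapped yet, so the IRQ handler skips it.
	 */
	val = 0xffff00000000ULL;	/* bit 47 set: still counting */
	printf("overflowed: %s\n",
	       (val & (1ULL << (counter_bits - 1))) ? "no" : "yes");

	return 0;
}

With these example values the sketch prints a mask of 0x70000000f and reports no overflow, which matches the bit layout the boot-time pr_info lines describe.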