Diffstat (limited to 'arch/x86/kernel')
57 files changed, 1312 insertions, 599 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4558f0d0822..ce664f33ea8 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -219,6 +219,8 @@ static int __init  acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)  {  	struct acpi_madt_local_x2apic *processor = NULL; +	int apic_id; +	u8 enabled;  	processor = (struct acpi_madt_local_x2apic *)header; @@ -227,6 +229,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)  	acpi_table_print_madt_entry(header); +	apic_id = processor->local_apic_id; +	enabled = processor->lapic_flags & ACPI_MADT_ENABLED;  #ifdef CONFIG_X86_X2APIC  	/*  	 * We need to register disabled CPU as well to permit @@ -235,8 +239,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)  	 * to not preallocating memory for all NR_CPUS  	 * when we use CPU hotplug.  	 */ -	acpi_register_lapic(processor->local_apic_id,	/* APIC ID */ -			    processor->lapic_flags & ACPI_MADT_ENABLED); +	if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled) +		printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); +	else +		acpi_register_lapic(apic_id, enabled);  #else  	printk(KERN_WARNING PREFIX "x2apic entry ignored\n");  #endif diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4c39baa8fac..013c1810ce7 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -123,16 +123,14 @@ int amd_get_subcaches(int cpu)  {  	struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;  	unsigned int mask; -	int cuid = 0; +	int cuid;  	if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))  		return 0;  	pci_read_config_dword(link, 0x1d4, &mask); -#ifdef CONFIG_SMP  	cuid = cpu_data(cpu).compute_unit_id; -#endif  	return (mask >> (4 * cuid)) & 0xf;  } @@ -141,7 +139,7 @@ int amd_set_subcaches(int cpu, int mask)  	static unsigned int reset, ban;  	struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));  	unsigned int reg; -	int cuid = 0; +	int cuid;  	if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)  		return -EINVAL; @@ -159,9 +157,7 @@ int amd_set_subcaches(int cpu, int mask)  		pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);  	} -#ifdef CONFIG_SMP  	cuid = cpu_data(cpu).compute_unit_id; -#endif  	mask <<= 4 * cuid;  	mask |= (0xf ^ (1 << cuid)) << 26; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3d2661ca654..6e76c191a83 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -88,13 +88,13 @@ static u32 __init allocate_aperture(void)  	 */  	addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,  				      aper_size, aper_size); -	if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { +	if (!addr || addr + aper_size > GART_MAX_ADDR) {  		printk(KERN_ERR  			"Cannot allocate aperture memory hole (%lx,%uK)\n",  				addr, aper_size>>10);  		return 0;  	} -	memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); +	memblock_reserve(addr, aper_size);  	/*  	 * Kmemleak should not scan this block as it may not be mapped via the  	 * kernel direct mapping. 
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 767fd04f284..0ae0323b1f9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_SMP)		+= ipi.o  ifeq ($(CONFIG_X86_64),y)  # APIC probe will depend on the listing order here +obj-$(CONFIG_X86_NUMACHIP)	+= apic_numachip.o  obj-$(CONFIG_X86_UV)		+= x2apic_uv_x.o  obj-$(CONFIG_X86_X2APIC)	+= x2apic_phys.o  obj-$(CONFIG_X86_X2APIC)	+= x2apic_cluster.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f98d84caf94..2eec05b6d1b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -146,16 +146,26 @@ __setup("apicpmtimer", setup_apicpmtimer);  int x2apic_mode;  #ifdef CONFIG_X86_X2APIC  /* x2apic enabled before OS handover */ -static int x2apic_preenabled; +int x2apic_preenabled; +static int x2apic_disabled; +static int nox2apic;  static __init int setup_nox2apic(char *str)  {  	if (x2apic_enabled()) { -		pr_warning("Bios already enabled x2apic, " -			   "can't enforce nox2apic"); -		return 0; -	} +		int apicid = native_apic_msr_read(APIC_ID); + +		if (apicid >= 255) { +			pr_warning("Apicid: %08x, cannot enforce nox2apic\n", +				   apicid); +			return 0; +		} + +		pr_warning("x2apic already enabled. will disable it\n"); +	} else +		setup_clear_cpu_cap(X86_FEATURE_X2APIC); + +	nox2apic = 1; -	setup_clear_cpu_cap(X86_FEATURE_X2APIC);  	return 0;  }  early_param("nox2apic", setup_nox2apic); @@ -250,6 +260,7 @@ u32 native_safe_apic_wait_icr_idle(void)  		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;  		if (!send_status)  			break; +		inc_irq_stat(icr_read_retry_count);  		udelay(100);  	} while (timeout++ < 1000); @@ -876,8 +887,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)  	 * Besides, if we don't timer interrupts ignore the global  	 * interrupt lock, which is the WrongThing (tm) to do.  	 
*/ -	exit_idle();  	irq_enter(); +	exit_idle();  	local_apic_timer_interrupt();  	irq_exit(); @@ -1431,6 +1442,45 @@ void __init bsp_end_local_APIC_setup(void)  }  #ifdef CONFIG_X86_X2APIC +/* + * Need to disable xapic and x2apic at the same time and then enable xapic mode + */ +static inline void __disable_x2apic(u64 msr) +{ +	wrmsrl(MSR_IA32_APICBASE, +	       msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); +	wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); +} + +static __init void disable_x2apic(void) +{ +	u64 msr; + +	if (!cpu_has_x2apic) +		return; + +	rdmsrl(MSR_IA32_APICBASE, msr); +	if (msr & X2APIC_ENABLE) { +		u32 x2apic_id = read_apic_id(); + +		if (x2apic_id >= 255) +			panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + +		pr_info("Disabling x2apic\n"); +		__disable_x2apic(msr); + +		if (nox2apic) { +			clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC); +			setup_clear_cpu_cap(X86_FEATURE_X2APIC); +		} + +		x2apic_disabled = 1; +		x2apic_mode = 0; + +		register_lapic_address(mp_lapic_addr); +	} +} +  void check_x2apic(void)  {  	if (x2apic_enabled()) { @@ -1441,15 +1491,20 @@ void check_x2apic(void)  void enable_x2apic(void)  { -	int msr, msr2; +	u64 msr; + +	rdmsrl(MSR_IA32_APICBASE, msr); +	if (x2apic_disabled) { +		__disable_x2apic(msr); +		return; +	}  	if (!x2apic_mode)  		return; -	rdmsr(MSR_IA32_APICBASE, msr, msr2);  	if (!(msr & X2APIC_ENABLE)) {  		printk_once(KERN_INFO "Enabling x2apic\n"); -		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2); +		wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);  	}  }  #endif /* CONFIG_X86_X2APIC */ @@ -1486,25 +1541,34 @@ void __init enable_IR_x2apic(void)  	ret = save_ioapic_entries();  	if (ret) {  		pr_info("Saving IO-APIC state failed: %d\n", ret); -		goto out; +		return;  	}  	local_irq_save(flags);  	legacy_pic->mask_all();  	mask_ioapic_entries(); +	if (x2apic_preenabled && nox2apic) +		disable_x2apic(); +  	if (dmar_table_init_ret)  		ret = -1;  	else  		ret = enable_IR(); +	if (!x2apic_supported()) +		goto skip_x2apic; +  	if (ret < 0) {  		/* IR is required if there is APIC ID > 255 even when running  		 * under KVM  		 */  		if (max_physical_apicid > 255 || -		    !hypervisor_x2apic_available()) -			goto nox2apic; +		    !hypervisor_x2apic_available()) { +			if (x2apic_preenabled) +				disable_x2apic(); +			goto skip_x2apic; +		}  		/*  		 * without IR all CPUs can be addressed by IOAPIC/MSI  		 * only in physical mode @@ -1512,8 +1576,10 @@ void __init enable_IR_x2apic(void)  		x2apic_force_phys();  	} -	if (ret == IRQ_REMAP_XAPIC_MODE) -		goto nox2apic; +	if (ret == IRQ_REMAP_XAPIC_MODE) { +		pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); +		goto skip_x2apic; +	}  	x2apic_enabled = 1; @@ -1523,22 +1589,11 @@ void __init enable_IR_x2apic(void)  		pr_info("Enabled x2apic\n");  	} -nox2apic: +skip_x2apic:  	if (ret < 0) /* IR enabling failed */  		restore_ioapic_entries();  	legacy_pic->restore_mask();  	local_irq_restore(flags); - -out: -	if (x2apic_enabled || !x2apic_supported()) -		return; - -	if (x2apic_preenabled) -		panic("x2apic: enabled by BIOS but kernel init failed."); -	else if (ret == IRQ_REMAP_XAPIC_MODE) -		pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); -	else if (ret < 0) -		pr_info("x2apic not enabled, IRQ remapping init failed\n");  }  #ifdef CONFIG_X86_64 @@ -1809,8 +1864,8 @@ void smp_spurious_interrupt(struct pt_regs *regs)  {  	u32 v; -	exit_idle();  	irq_enter(); +	exit_idle();  	/*  	 * Check if this really is a spurious interrupt and ACK it  	 * if it is a vectored 
one.  Just in case... @@ -1846,8 +1901,8 @@ void smp_error_interrupt(struct pt_regs *regs)  		"Illegal register address",	/* APIC Error Bit 7 */  	}; -	exit_idle();  	irq_enter(); +	exit_idle();  	/* First tickle the hardware, only then report what went on. -- REW */  	v0 = apic_read(APIC_ESR);  	apic_write(APIC_ESR, 0); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f7a41e4cae4..8c3cdded6f2 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)   * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel   * document number 292116).  So here it goes...   */ -static void flat_init_apic_ldr(void) +void flat_init_apic_ldr(void)  {  	unsigned long val;  	unsigned long num, id; @@ -171,9 +171,14 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)  	return initial_apic_id >> index_msb;  } +static int flat_probe(void) +{ +	return 1; +} +  static struct apic apic_flat =  {  	.name				= "flat", -	.probe				= NULL, +	.probe				= flat_probe,  	.acpi_madt_oem_check		= flat_acpi_madt_oem_check,  	.apic_id_registered		= flat_apic_id_registered, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c new file mode 100644 index 00000000000..09d3d8c1cd9 --- /dev/null +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -0,0 +1,294 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License.  See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-Specific APIC Code + * + * Copyright (C) 2011 Numascale AS. All rights reserved. + * + * Send feedback to <support@numascale.com> + * + */ + +#include <linux/errno.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/hardirq.h> +#include <linux/delay.h> + +#include <asm/numachip/numachip_csr.h> +#include <asm/smp.h> +#include <asm/apic.h> +#include <asm/ipi.h> +#include <asm/apic_flat_64.h> + +static int numachip_system __read_mostly; + +static struct apic apic_numachip __read_mostly; + +static unsigned int get_apic_id(unsigned long x) +{ +	unsigned long value; +	unsigned int id; + +	rdmsrl(MSR_FAM10H_NODE_ID, value); +	id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U); + +	return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ +	unsigned long x; + +	x = ((id & 0xffU) << 24); +	return x; +} + +static unsigned int read_xapic_id(void) +{ +	return get_apic_id(apic_read(APIC_ID)); +} + +static int numachip_apic_id_registered(void) +{ +	return physid_isset(read_xapic_id(), phys_cpu_present_map); +} + +static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) +{ +	return initial_apic_id >> index_msb; +} + +static const struct cpumask *numachip_target_cpus(void) +{ +	return cpu_online_mask; +} + +static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ +	cpumask_clear(retmask); +	cpumask_set_cpu(cpu, retmask); +} + +static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ +	union numachip_csr_g3_ext_irq_gen int_gen; + +	int_gen.s._destination_apic_id = phys_apicid; +	int_gen.s._vector = 0; +	int_gen.s._msgtype = APIC_DM_INIT >> 8; +	int_gen.s._index = 0; + +	write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + +	int_gen.s._msgtype = 
APIC_DM_STARTUP >> 8; +	int_gen.s._vector = start_rip >> 12; + +	write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + +	atomic_set(&init_deasserted, 1); +	return 0; +} + +static void numachip_send_IPI_one(int cpu, int vector) +{ +	union numachip_csr_g3_ext_irq_gen int_gen; +	int apicid = per_cpu(x86_cpu_to_apicid, cpu); + +	int_gen.s._destination_apic_id = apicid; +	int_gen.s._vector = vector; +	int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8; +	int_gen.s._index = 0; + +	write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); +} + +static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) +{ +	unsigned int cpu; + +	for_each_cpu(cpu, mask) +		numachip_send_IPI_one(cpu, vector); +} + +static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask, +						int vector) +{ +	unsigned int this_cpu = smp_processor_id(); +	unsigned int cpu; + +	for_each_cpu(cpu, mask) { +		if (cpu != this_cpu) +			numachip_send_IPI_one(cpu, vector); +	} +} + +static void numachip_send_IPI_allbutself(int vector) +{ +	unsigned int this_cpu = smp_processor_id(); +	unsigned int cpu; + +	for_each_online_cpu(cpu) { +		if (cpu != this_cpu) +			numachip_send_IPI_one(cpu, vector); +	} +} + +static void numachip_send_IPI_all(int vector) +{ +	numachip_send_IPI_mask(cpu_online_mask, vector); +} + +static void numachip_send_IPI_self(int vector) +{ +	__default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +} + +static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask) +{ +	int cpu; + +	/* +	 * We're using fixed IRQ delivery, can only return one phys APIC ID. +	 * May as well be the first. +	 */ +	cpu = cpumask_first(cpumask); +	if (likely((unsigned)cpu < nr_cpu_ids)) +		return per_cpu(x86_cpu_to_apicid, cpu); + +	return BAD_APICID; +} + +static unsigned int +numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +				const struct cpumask *andmask) +{ +	int cpu; + +	/* +	 * We're using fixed IRQ delivery, can only return one phys APIC ID. +	 * May as well be the first. 
+	 */ +	for_each_cpu_and(cpu, cpumask, andmask) { +		if (cpumask_test_cpu(cpu, cpu_online_mask)) +			break; +	} +	return per_cpu(x86_cpu_to_apicid, cpu); +} + +static int __init numachip_probe(void) +{ +	return apic == &apic_numachip; +} + +static void __init map_csrs(void) +{ +	printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n", +		NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1); +	init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + +	printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n", +		NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1); +	init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); +} + +static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ +	c->phys_proc_id = node; +	per_cpu(cpu_llc_id, smp_processor_id()) = node; +} + +static int __init numachip_system_init(void) +{ +	unsigned int val; + +	if (!numachip_system) +		return 0; + +	x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + +	map_csrs(); + +	val = read_lcsr(CSR_G0_NODE_IDS); +	printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val); + +	return 0; +} +early_initcall(numachip_system_init); + +static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ +	if (!strncmp(oem_id, "NUMASC", 6)) { +		numachip_system = 1; +		return 1; +	} + +	return 0; +} + +static struct apic apic_numachip __refconst = { + +	.name				= "NumaConnect system", +	.probe				= numachip_probe, +	.acpi_madt_oem_check		= numachip_acpi_madt_oem_check, +	.apic_id_registered		= numachip_apic_id_registered, + +	.irq_delivery_mode		= dest_Fixed, +	.irq_dest_mode			= 0, /* physical */ + +	.target_cpus			= numachip_target_cpus, +	.disable_esr			= 0, +	.dest_logical			= 0, +	.check_apicid_used		= NULL, +	.check_apicid_present		= NULL, + +	.vector_allocation_domain	= numachip_vector_allocation_domain, +	.init_apic_ldr			= flat_init_apic_ldr, + +	.ioapic_phys_id_map		= NULL, +	.setup_apic_routing		= NULL, +	.multi_timer_check		= NULL, +	.cpu_present_to_apicid		= default_cpu_present_to_apicid, +	.apicid_to_cpu_present		= NULL, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= default_check_phys_apicid_present, +	.enable_apic_mode		= NULL, +	.phys_pkg_id			= numachip_phys_pkg_id, +	.mps_oem_check			= NULL, + +	.get_apic_id			= get_apic_id, +	.set_apic_id			= set_apic_id, +	.apic_id_mask			= 0xffU << 24, + +	.cpu_mask_to_apicid		= numachip_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= numachip_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= numachip_send_IPI_mask, +	.send_IPI_mask_allbutself	= numachip_send_IPI_mask_allbutself, +	.send_IPI_allbutself		= numachip_send_IPI_allbutself, +	.send_IPI_all			= numachip_send_IPI_all, +	.send_IPI_self			= numachip_send_IPI_self, + +	.wakeup_secondary_cpu		= numachip_wakeup_secondary, +	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW, +	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, +	.wait_for_init_deassert		= NULL, +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= NULL, /* REMRD not supported */ + +	.read				= native_apic_mem_read, +	.write				= native_apic_mem_write, +	.icr_read			= native_apic_icr_read, +	.icr_write			= native_apic_icr_write, +	.wait_icr_idle			= native_apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, +}; +apic_driver(apic_numachip); + diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6d939d7847e..fb072754bc1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ 
-2421,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)  	unsigned vector, me;  	ack_APIC_irq(); -	exit_idle();  	irq_enter(); +	exit_idle();  	me = smp_processor_id();  	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { @@ -2948,6 +2948,10 @@ static inline void __init check_timer(void)  	}  	local_irq_disable();  	apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); +	if (x2apic_preenabled) +		apic_printk(APIC_QUIET, KERN_INFO +			    "Perhaps problem with the pre-enabled x2apic mode\n" +			    "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");  	panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "  		"report.  Then try booting with the 'noapic' option.\n");  out: diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 452932d3473..5da1269e8dd 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -62,7 +62,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size);  void __init setup_bios_corruption_check(void)  { -	u64 addr = PAGE_SIZE;	/* assume first page is reserved anyway */ +	phys_addr_t start, end; +	u64 i;  	if (memory_corruption_check == -1) {  		memory_corruption_check = @@ -82,28 +83,23 @@ void __init setup_bios_corruption_check(void)  	corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); -	while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { -		u64 size; -		addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE); +	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { +		start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), +				PAGE_SIZE, corruption_check_size); +		end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), +			      PAGE_SIZE, corruption_check_size); +		if (start >= end) +			continue; -		if (addr == MEMBLOCK_ERROR) -			break; - -		if (addr >= corruption_check_size) -			break; - -		if ((addr + size) > corruption_check_size) -			size = corruption_check_size - addr; - -		memblock_x86_reserve_range(addr, addr + size, "SCAN RAM"); -		scan_areas[num_scan_areas].addr = addr; -		scan_areas[num_scan_areas].size = size; -		num_scan_areas++; +		memblock_reserve(start, end - start); +		scan_areas[num_scan_areas].addr = start; +		scan_areas[num_scan_areas].size = end - start;  		/* Assume we've already mapped this early memory */ -		memset(__va(addr), 0, size); +		memset(__va(start), 0, end - start); -		addr += size; +		if (++num_scan_areas >= MAX_SCAN_AREAS) +			break;  	}  	if (num_scan_areas) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0bab2b18bb2..f4773f4aae3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)  static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)  { -#ifdef CONFIG_SMP  	/* calling is from identify_secondary_cpu() ? 
*/  	if (!c->cpu_index)  		return; @@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)  valid_k7:  	; -#endif  }  static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) @@ -353,6 +351,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)  	if (node == NUMA_NO_NODE)  		node = per_cpu(cpu_llc_id, cpu); +	/* +	 * If core numbers are inconsistent, it's likely a multi-fabric platform, +	 * so invoke platform-specific handler +	 */ +	if (c->phys_proc_id != node) +		x86_cpuinit.fixup_cpu_id(c, node); +  	if (!node_online(node)) {  		/*  		 * Two possibilities here: diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e58d978e075..159103c0b1f 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)  	}  #ifdef CONFIG_X86_32  	/* Cyrix III family needs CX8 & PGE explicitly enabled. */ -	if (c->x86_model >= 6 && c->x86_model <= 9) { +	if (c->x86_model >= 6 && c->x86_model <= 13) {  		rdmsr(MSR_VIA_FCR, lo, hi);  		lo |= (1<<1 | 1<<7);  		wrmsr(MSR_VIA_FCR, lo, hi); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa003b13a83..850f2963a42 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)  	if (this_cpu->c_early_init)  		this_cpu->c_early_init(c); -#ifdef CONFIG_SMP  	c->cpu_index = 0; -#endif  	filter_cpuid_features(c, false);  	setup_smep(c); @@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)  		c->apicid = c->initial_apicid;  # endif  #endif - -#ifdef CONFIG_X86_HT  		c->phys_proc_id = c->initial_apicid; -#endif  	}  	setup_smep(c); @@ -1141,6 +1136,15 @@ static void dbg_restore_debug_regs(void)  #endif /* ! CONFIG_KGDB */  /* + * Prints an error where the NUMA and configured core-number mismatch and the + * platform didn't override this to fix it up + */ +void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ +	pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); +} + +/*   * cpu_init() initializes state that is per-CPU. Some data is already   * initialized (naturally) in the bootstrap process, such as the GDT   * and IDT. We reload them nevertheless, this function acts as a diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1b22dcc51af..8bacc7826fb 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,5 +1,4 @@  #ifndef ARCH_X86_CPU_H -  #define ARCH_X86_CPU_H  struct cpu_model_info { @@ -35,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],  extern void get_cpu_cap(struct cpuinfo_x86 *c);  extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); -extern void get_cpu_cap(struct cpuinfo_x86 *c); - -#endif +#endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 523131213f0..3e6ff6cbf42 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void)  static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)  { -#ifdef CONFIG_SMP  	/* calling is from identify_secondary_cpu() ? 
*/  	if (!c->cpu_index)  		return; @@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)  		WARN_ONCE(1, "WARNING: SMP operation may be unreliable"  				    "with B stepping processors.\n");  	} -#endif  }  static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index a3b0811693c..6b45e5e7a90 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)  #include <linux/kobject.h>  #include <linux/sysfs.h> - -extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ +#include <linux/cpu.h>  /* pointer to kobject for cpuX/cache */  static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); @@ -1073,9 +1072,9 @@ err_out:  static DECLARE_BITMAP(cache_dev_map, NR_CPUS);  /* Add/Remove cache interface for CPU device */ -static int __cpuinit cache_add_dev(struct sys_device * sys_dev) +static int __cpuinit cache_add_dev(struct device *dev)  { -	unsigned int cpu = sys_dev->id; +	unsigned int cpu = dev->id;  	unsigned long i, j;  	struct _index_kobject *this_object;  	struct _cpuid4_info   *this_leaf; @@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)  	retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),  				      &ktype_percpu_entry, -				      &sys_dev->kobj, "%s", "cache"); +				      &dev->kobj, "%s", "cache");  	if (retval < 0) {  		cpuid4_cache_sysfs_exit(cpu);  		return retval; @@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)  	return 0;  } -static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) +static void __cpuinit cache_remove_dev(struct device *dev)  { -	unsigned int cpu = sys_dev->id; +	unsigned int cpu = dev->id;  	unsigned long i;  	if (per_cpu(ici_cpuid4_info, cpu) == NULL) @@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,  					unsigned long action, void *hcpu)  {  	unsigned int cpu = (unsigned long)hcpu; -	struct sys_device *sys_dev; +	struct device *dev; -	sys_dev = get_cpu_sysdev(cpu); +	dev = get_cpu_device(cpu);  	switch (action) {  	case CPU_ONLINE:  	case CPU_ONLINE_FROZEN: -		cache_add_dev(sys_dev); +		cache_add_dev(dev);  		break;  	case CPU_DEAD:  	case CPU_DEAD_FROZEN: -		cache_remove_dev(sys_dev); +		cache_remove_dev(dev);  		break;  	}  	return NOTIFY_OK; @@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void)  	for_each_online_cpu(i) {  		int err; -		struct sys_device *sys_dev = get_cpu_sysdev(i); +		struct device *dev = get_cpu_device(i); -		err = cache_add_dev(sys_dev); +		err = cache_add_dev(dev);  		if (err)  			return err;  	} diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 319882ef848..fc4beb39357 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -17,6 +17,7 @@  #include <linux/kernel.h>  #include <linux/string.h>  #include <linux/fs.h> +#include <linux/preempt.h>  #include <linux/smp.h>  #include <linux/notifier.h>  #include <linux/kdebug.h> @@ -92,6 +93,18 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)  	return NMI_HANDLED;  } +static void mce_irq_ipi(void *info) +{ +	int cpu = smp_processor_id(); +	struct mce *m = &__get_cpu_var(injectm); + +	if (cpumask_test_cpu(cpu, mce_inject_cpumask) && +			m->inject_flags & MCJ_EXCEPTION) { 
+		cpumask_clear_cpu(cpu, mce_inject_cpumask); +		raise_exception(m, NULL); +	} +} +  /* Inject mce on current CPU */  static int raise_local(void)  { @@ -139,9 +152,10 @@ static void raise_mce(struct mce *m)  		return;  #ifdef CONFIG_X86_LOCAL_APIC -	if (m->inject_flags & MCJ_NMI_BROADCAST) { +	if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) {  		unsigned long start;  		int cpu; +  		get_online_cpus();  		cpumask_copy(mce_inject_cpumask, cpu_online_mask);  		cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); @@ -151,13 +165,25 @@ static void raise_mce(struct mce *m)  			    MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)  				cpumask_clear_cpu(cpu, mce_inject_cpumask);  		} -		if (!cpumask_empty(mce_inject_cpumask)) -			apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); +		if (!cpumask_empty(mce_inject_cpumask)) { +			if (m->inject_flags & MCJ_IRQ_BRAODCAST) { +				/* +				 * don't wait because mce_irq_ipi is necessary +				 * to be sync with following raise_local +				 */ +				preempt_disable(); +				smp_call_function_many(mce_inject_cpumask, +					mce_irq_ipi, NULL, 0); +				preempt_enable(); +			} else if (m->inject_flags & MCJ_NMI_BROADCAST) +				apic->send_IPI_mask(mce_inject_cpumask, +						NMI_VECTOR); +		}  		start = jiffies;  		while (!cpumask_empty(mce_inject_cpumask)) {  			if (!time_before(jiffies, start + 2*HZ)) {  				printk(KERN_ERR -				"Timeout waiting for mce inject NMI %lx\n", +				"Timeout waiting for mce inject %lx\n",  					*cpumask_bits(mce_inject_cpumask));  				break;  			} diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fefcc69ee8b..ed44c8a6585 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,4 +1,4 @@ -#include <linux/sysdev.h> +#include <linux/device.h>  #include <asm/mce.h>  enum severity_level { @@ -17,7 +17,7 @@ enum severity_level {  struct mce_bank {  	u64			ctl;			/* subevents to enable */  	unsigned char init;				/* initialise bank? */ -	struct sysdev_attribute attr;			/* sysdev attribute */ +	struct device_attribute attr;			/* device attribute */  	char			attrname[ATTR_LEN];	/* attribute name */  }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2af127d4c3d..f22a9f7f639 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -19,7 +19,7 @@  #include <linux/kernel.h>  #include <linux/percpu.h>  #include <linux/string.h> -#include <linux/sysdev.h> +#include <linux/device.h>  #include <linux/syscore_ops.h>  #include <linux/delay.h>  #include <linux/ctype.h> @@ -95,13 +95,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);  static DEFINE_PER_CPU(struct mce, mces_seen);  static int			cpu_missing; -/* - * CPU/chipset specific EDAC code can register a notifier call here to print - * MCE errors in a human-readable form. - */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); -EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); -  /* MCA banks polled by the period polling timer for corrected events */  DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {  	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -109,6 +102,12 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {  static DEFINE_PER_CPU(struct work_struct, mce_work); +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. 
+ */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +  /* Do initial initialization of a struct mce */  void mce_setup(struct mce *m)  { @@ -119,9 +118,7 @@ void mce_setup(struct mce *m)  	m->time = get_seconds();  	m->cpuvendor = boot_cpu_data.x86_vendor;  	m->cpuid = cpuid_eax(1); -#ifdef CONFIG_SMP  	m->socketid = cpu_data(m->extcpu).phys_proc_id; -#endif  	m->apicid = cpu_data(m->extcpu).initial_apicid;  	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);  } @@ -190,6 +187,57 @@ void mce_log(struct mce *mce)  	set_bit(0, &mce_need_notify);  } +static void drain_mcelog_buffer(void) +{ +	unsigned int next, i, prev = 0; + +	next = rcu_dereference_check_mce(mcelog.next); + +	do { +		struct mce *m; + +		/* drain what was logged during boot */ +		for (i = prev; i < next; i++) { +			unsigned long start = jiffies; +			unsigned retries = 1; + +			m = &mcelog.entry[i]; + +			while (!m->finished) { +				if (time_after_eq(jiffies, start + 2*retries)) +					retries++; + +				cpu_relax(); + +				if (!m->finished && retries >= 4) { +					pr_err("MCE: skipping error being logged currently!\n"); +					break; +				} +			} +			smp_rmb(); +			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); +		} + +		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); +		prev = next; +		next = cmpxchg(&mcelog.next, prev, 0); +	} while (next != prev); +} + + +void mce_register_decode_chain(struct notifier_block *nb) +{ +	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); +	drain_mcelog_buffer(); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ +	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); +  static void print_mce(struct mce *m)  {  	int ret = 0; @@ -1770,7 +1818,7 @@ static struct syscore_ops mce_syscore_ops = {  };  /* - * mce_sysdev: Sysfs support + * mce_device: Sysfs support   */  static void mce_cpu_restart(void *data) @@ -1806,27 +1854,28 @@ static void mce_enable_ce(void *all)  		__mcheck_cpu_init_timer();  } -static struct sysdev_class mce_sysdev_class = { +static struct bus_type mce_subsys = {  	.name		= "machinecheck", +	.dev_name	= "machinecheck",  }; -DEFINE_PER_CPU(struct sys_device, mce_sysdev); +DEFINE_PER_CPU(struct device, mce_device);  __cpuinitdata  void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)  {  	return container_of(attr, struct mce_bank, attr);  } -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t show_bank(struct device *s, struct device_attribute *attr,  			 char *buf)  {  	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);  } -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_bank(struct device *s, struct device_attribute *attr,  			const char *buf, size_t size)  {  	u64 new; @@ -1841,14 +1890,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,  }  static ssize_t -show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +show_trigger(struct device *s, struct device_attribute *attr, char *buf)  {  	strcpy(buf, mce_helper);  	strcat(buf, "\n");  	return strlen(mce_helper) + 1;  } -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_trigger(struct device *s, struct device_attribute *attr,  
				const char *buf, size_t siz)  {  	char *p; @@ -1863,8 +1912,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,  	return strlen(mce_helper) + !!p;  } -static ssize_t set_ignore_ce(struct sys_device *s, -			     struct sysdev_attribute *attr, +static ssize_t set_ignore_ce(struct device *s, +			     struct device_attribute *attr,  			     const char *buf, size_t size)  {  	u64 new; @@ -1887,8 +1936,8 @@ static ssize_t set_ignore_ce(struct sys_device *s,  	return size;  } -static ssize_t set_cmci_disabled(struct sys_device *s, -				 struct sysdev_attribute *attr, +static ssize_t set_cmci_disabled(struct device *s, +				 struct device_attribute *attr,  				 const char *buf, size_t size)  {  	u64 new; @@ -1910,108 +1959,107 @@ static ssize_t set_cmci_disabled(struct sys_device *s,  	return size;  } -static ssize_t store_int_with_restart(struct sys_device *s, -				      struct sysdev_attribute *attr, +static ssize_t store_int_with_restart(struct device *s, +				      struct device_attribute *attr,  				      const char *buf, size_t size)  { -	ssize_t ret = sysdev_store_int(s, attr, buf, size); +	ssize_t ret = device_store_int(s, attr, buf, size);  	mce_restart();  	return ret;  } -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); +static DEVICE_INT_ATTR(tolerant, 0644, tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); +static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); -static struct sysdev_ext_attribute attr_check_interval = { -	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, -		     store_int_with_restart), +static struct dev_ext_attribute dev_attr_check_interval = { +	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),  	&check_interval  }; -static struct sysdev_ext_attribute attr_ignore_ce = { -	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), +static struct dev_ext_attribute dev_attr_ignore_ce = { +	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),  	&mce_ignore_ce  }; -static struct sysdev_ext_attribute attr_cmci_disabled = { -	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), +static struct dev_ext_attribute dev_attr_cmci_disabled = { +	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),  	&mce_cmci_disabled  }; -static struct sysdev_attribute *mce_sysdev_attrs[] = { -	&attr_tolerant.attr, -	&attr_check_interval.attr, -	&attr_trigger, -	&attr_monarch_timeout.attr, -	&attr_dont_log_ce.attr, -	&attr_ignore_ce.attr, -	&attr_cmci_disabled.attr, +static struct device_attribute *mce_device_attrs[] = { +	&dev_attr_tolerant.attr, +	&dev_attr_check_interval.attr, +	&dev_attr_trigger, +	&dev_attr_monarch_timeout.attr, +	&dev_attr_dont_log_ce.attr, +	&dev_attr_ignore_ce.attr, +	&dev_attr_cmci_disabled.attr,  	NULL  }; -static cpumask_var_t mce_sysdev_initialized; +static cpumask_var_t mce_device_initialized; -/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_sysdev_create(unsigned int cpu) +/* Per cpu device init. 
All of the cpus still share the same ctrl bank: */ +static __cpuinit int mce_device_create(unsigned int cpu)  { -	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); +	struct device *dev = &per_cpu(mce_device, cpu);  	int err;  	int i, j;  	if (!mce_available(&boot_cpu_data))  		return -EIO; -	memset(&sysdev->kobj, 0, sizeof(struct kobject)); -	sysdev->id  = cpu; -	sysdev->cls = &mce_sysdev_class; +	memset(&dev->kobj, 0, sizeof(struct kobject)); +	dev->id  = cpu; +	dev->bus = &mce_subsys; -	err = sysdev_register(sysdev); +	err = device_register(dev);  	if (err)  		return err; -	for (i = 0; mce_sysdev_attrs[i]; i++) { -		err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); +	for (i = 0; mce_device_attrs[i]; i++) { +		err = device_create_file(dev, mce_device_attrs[i]);  		if (err)  			goto error;  	}  	for (j = 0; j < banks; j++) { -		err = sysdev_create_file(sysdev, &mce_banks[j].attr); +		err = device_create_file(dev, &mce_banks[j].attr);  		if (err)  			goto error2;  	} -	cpumask_set_cpu(cpu, mce_sysdev_initialized); +	cpumask_set_cpu(cpu, mce_device_initialized);  	return 0;  error2:  	while (--j >= 0) -		sysdev_remove_file(sysdev, &mce_banks[j].attr); +		device_remove_file(dev, &mce_banks[j].attr);  error:  	while (--i >= 0) -		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); +		device_remove_file(dev, mce_device_attrs[i]); -	sysdev_unregister(sysdev); +	device_unregister(dev);  	return err;  } -static __cpuinit void mce_sysdev_remove(unsigned int cpu) +static __cpuinit void mce_device_remove(unsigned int cpu)  { -	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); +	struct device *dev = &per_cpu(mce_device, cpu);  	int i; -	if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) +	if (!cpumask_test_cpu(cpu, mce_device_initialized))  		return; -	for (i = 0; mce_sysdev_attrs[i]; i++) -		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); +	for (i = 0; mce_device_attrs[i]; i++) +		device_remove_file(dev, mce_device_attrs[i]);  	for (i = 0; i < banks; i++) -		sysdev_remove_file(sysdev, &mce_banks[i].attr); +		device_remove_file(dev, &mce_banks[i].attr); -	sysdev_unregister(sysdev); -	cpumask_clear_cpu(cpu, mce_sysdev_initialized); +	device_unregister(dev); +	cpumask_clear_cpu(cpu, mce_device_initialized);  }  /* Make sure there are no machine checks on offlined CPUs. 
*/ @@ -2061,7 +2109,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)  	switch (action) {  	case CPU_ONLINE:  	case CPU_ONLINE_FROZEN: -		mce_sysdev_create(cpu); +		mce_device_create(cpu);  		if (threshold_cpu_callback)  			threshold_cpu_callback(action, cpu);  		break; @@ -2069,7 +2117,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)  	case CPU_DEAD_FROZEN:  		if (threshold_cpu_callback)  			threshold_cpu_callback(action, cpu); -		mce_sysdev_remove(cpu); +		mce_device_remove(cpu);  		break;  	case CPU_DOWN_PREPARE:  	case CPU_DOWN_PREPARE_FROZEN: @@ -2103,7 +2151,7 @@ static __init void mce_init_banks(void)  	for (i = 0; i < banks; i++) {  		struct mce_bank *b = &mce_banks[i]; -		struct sysdev_attribute *a = &b->attr; +		struct device_attribute *a = &b->attr;  		sysfs_attr_init(&a->attr);  		a->attr.name	= b->attrname; @@ -2123,16 +2171,16 @@ static __init int mcheck_init_device(void)  	if (!mce_available(&boot_cpu_data))  		return -EIO; -	zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); +	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);  	mce_init_banks(); -	err = sysdev_class_register(&mce_sysdev_class); +	err = subsys_system_register(&mce_subsys, NULL);  	if (err)  		return err;  	for_each_online_cpu(i) { -		err = mce_sysdev_create(i); +		err = mce_device_create(i);  		if (err)  			return err;  	} diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f5474218cff..ba0b94a7e20 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -17,7 +17,6 @@  #include <linux/notifier.h>  #include <linux/kobject.h>  #include <linux/percpu.h> -#include <linux/sysdev.h>  #include <linux/errno.h>  #include <linux/sched.h>  #include <linux/sysfs.h> @@ -64,11 +63,9 @@ struct threshold_bank {  };  static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); -#ifdef CONFIG_SMP  static unsigned char shared_bank[NR_BANKS] = {  	0, 0, 0, 0, 1  }; -#endif  static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */ @@ -202,10 +199,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)  			if (!block)  				per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP  			if (shared_bank[bank] && c->cpu_core_id)  				break; -#endif +  			offset = setup_APIC_mce(offset,  						(high & MASK_LVTOFF_HI) >> 20); @@ -531,7 +527,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  	sprintf(name, "threshold_bank%i", bank); -#ifdef CONFIG_SMP  	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {	/* symlink */  		i = cpumask_first(cpu_llc_shared_mask(cpu)); @@ -548,7 +543,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		if (!b)  			goto out; -		err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj, +		err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj,  					b->kobj, name);  		if (err)  			goto out; @@ -558,7 +553,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		goto out;  	} -#endif  	b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);  	if (!b) { @@ -571,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		goto out;  	} -	b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj); +	b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj);  	if (!b->kobj)  		goto out_free; @@ -591,7 +585,7 @@ static __cpuinit int threshold_create_bank(unsigned 
int cpu, unsigned int bank)  		if (i == cpu)  			continue; -		err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj, +		err = sysfs_create_link(&per_cpu(mce_device, i).kobj,  					b->kobj, name);  		if (err)  			goto out; @@ -669,7 +663,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  #ifdef CONFIG_SMP  	/* sibling symlink */  	if (shared_bank[bank] && b->blocks->cpu != cpu) { -		sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name); +		sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name);  		per_cpu(threshold_banks, cpu)[bank] = NULL;  		return; @@ -681,7 +675,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  		if (i == cpu)  			continue; -		sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name); +		sysfs_remove_link(&per_cpu(mce_device, i).kobj, name);  		per_cpu(threshold_banks, i)[bank] = NULL;  	} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 787e06c84ea..67bb17a37a0 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -19,7 +19,6 @@  #include <linux/kernel.h>  #include <linux/percpu.h>  #include <linux/export.h> -#include <linux/sysdev.h>  #include <linux/types.h>  #include <linux/init.h>  #include <linux/smp.h> @@ -69,16 +68,16 @@ static atomic_t therm_throt_en	= ATOMIC_INIT(0);  static u32 lvtthmr_init __read_mostly;  #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name)				\ -	static SYSDEV_ATTR(_name, 0444,					\ -			   therm_throt_sysdev_show_##_name,		\ +#define define_therm_throt_device_one_ro(_name)				\ +	static DEVICE_ATTR(_name, 0444,					\ +			   therm_throt_device_show_##_name,		\  				   NULL)				\ -#define define_therm_throt_sysdev_show_func(event, name)		\ +#define define_therm_throt_device_show_func(event, name)		\  									\ -static ssize_t therm_throt_sysdev_show_##event##_##name(		\ -			struct sys_device *dev,				\ -			struct sysdev_attribute *attr,			\ +static ssize_t therm_throt_device_show_##event##_##name(		\ +			struct device *dev,				\ +			struct device_attribute *attr,			\  			char *buf)					\  {									\  	unsigned int cpu = dev->id;					\ @@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name(		\  	return ret;							\  } -define_therm_throt_sysdev_show_func(core_throttle, count); -define_therm_throt_sysdev_one_ro(core_throttle_count); +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(core_power_limit, count); -define_therm_throt_sysdev_one_ro(core_power_limit_count); +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); -define_therm_throt_sysdev_show_func(package_throttle, count); -define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); -define_therm_throt_sysdev_show_func(package_power_limit, count); -define_therm_throt_sysdev_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count);  static struct attribute *thermal_throttle_attrs[] = { -	&attr_core_throttle_count.attr, +	&dev_attr_core_throttle_count.attr,  	NULL  }; @@ -223,36 +222,36 @@ static int thresh_event_valid(int event)  #ifdef CONFIG_SYSFS  /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit 
int thermal_throttle_add_dev(struct sys_device *sys_dev, +static __cpuinit int thermal_throttle_add_dev(struct device *dev,  				unsigned int cpu)  {  	int err;  	struct cpuinfo_x86 *c = &cpu_data(cpu); -	err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); +	err = sysfs_create_group(&dev->kobj, &thermal_attr_group);  	if (err)  		return err;  	if (cpu_has(c, X86_FEATURE_PLN)) -		err = sysfs_add_file_to_group(&sys_dev->kobj, -					      &attr_core_power_limit_count.attr, +		err = sysfs_add_file_to_group(&dev->kobj, +					      &dev_attr_core_power_limit_count.attr,  					      thermal_attr_group.name);  	if (cpu_has(c, X86_FEATURE_PTS)) { -		err = sysfs_add_file_to_group(&sys_dev->kobj, -					      &attr_package_throttle_count.attr, +		err = sysfs_add_file_to_group(&dev->kobj, +					      &dev_attr_package_throttle_count.attr,  					      thermal_attr_group.name);  		if (cpu_has(c, X86_FEATURE_PLN)) -			err = sysfs_add_file_to_group(&sys_dev->kobj, -					&attr_package_power_limit_count.attr, +			err = sysfs_add_file_to_group(&dev->kobj, +					&dev_attr_package_power_limit_count.attr,  					thermal_attr_group.name);  	}  	return err;  } -static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) +static __cpuinit void thermal_throttle_remove_dev(struct device *dev)  { -	sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); +	sysfs_remove_group(&dev->kobj, &thermal_attr_group);  }  /* Mutex protecting device creation against CPU hotplug: */ @@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,  			      void *hcpu)  {  	unsigned int cpu = (unsigned long)hcpu; -	struct sys_device *sys_dev; +	struct device *dev;  	int err = 0; -	sys_dev = get_cpu_sysdev(cpu); +	dev = get_cpu_device(cpu);  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_UP_PREPARE_FROZEN:  		mutex_lock(&therm_cpu_lock); -		err = thermal_throttle_add_dev(sys_dev, cpu); +		err = thermal_throttle_add_dev(dev, cpu);  		mutex_unlock(&therm_cpu_lock);  		WARN_ON(err);  		break; @@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,  	case CPU_DEAD:  	case CPU_DEAD_FROZEN:  		mutex_lock(&therm_cpu_lock); -		thermal_throttle_remove_dev(sys_dev); +		thermal_throttle_remove_dev(dev);  		mutex_unlock(&therm_cpu_lock);  		break;  	} @@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void)  #endif  	/* connect live CPUs to sysfs */  	for_each_online_cpu(cpu) { -		err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); +		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);  		WARN_ON(err);  	}  #ifdef CONFIG_HOTPLUG_CPU @@ -323,17 +322,6 @@ device_initcall(thermal_throttle_init_device);  #endif /* CONFIG_SYSFS */ -/* - * Set up the most two significant bit to notify mce log that this thermal - * event type. - * This is a temp solution. May be changed in the future with mce log - * infrasture. 
- */ -#define CORE_THROTTLED		(0) -#define CORE_POWER_LIMIT	((__u64)1 << 62) -#define PACKAGE_THROTTLED	((__u64)2 << 62) -#define PACKAGE_POWER_LIMIT	((__u64)3 << 62) -  static void notify_thresholds(__u64 msr_val)  {  	/* check whether the interrupt handler is defined; @@ -363,27 +351,23 @@ static void intel_thermal_interrupt(void)  	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,  				THERMAL_THROTTLING_EVENT,  				CORE_LEVEL) != 0) -		mce_log_therm_throt_event(CORE_THROTTLED | msr_val); +		mce_log_therm_throt_event(msr_val);  	if (this_cpu_has(X86_FEATURE_PLN)) -		if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, +		therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,  					POWER_LIMIT_EVENT, -					CORE_LEVEL) != 0) -			mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); +					CORE_LEVEL);  	if (this_cpu_has(X86_FEATURE_PTS)) {  		rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); -		if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, +		therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,  					THERMAL_THROTTLING_EVENT, -					PACKAGE_LEVEL) != 0) -			mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); +					PACKAGE_LEVEL);  		if (this_cpu_has(X86_FEATURE_PLN)) -			if (therm_throt_process(msr_val & +			therm_throt_process(msr_val &  					PACKAGE_THERM_STATUS_POWER_LIMIT,  					POWER_LIMIT_EVENT, -					PACKAGE_LEVEL) != 0) -				mce_log_therm_throt_event(PACKAGE_POWER_LIMIT -							  | msr_val); +					PACKAGE_LEVEL);  	}  } @@ -397,8 +381,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;  asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)  { -	exit_idle();  	irq_enter(); +	exit_idle();  	inc_irq_stat(irq_thermal_count);  	smp_thermal_vector();  	irq_exit(); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index d746df2909c..aa578cadb94 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;  asmlinkage void smp_threshold_interrupt(void)  { -	exit_idle();  	irq_enter(); +	exit_idle();  	inc_irq_stat(irq_threshold_count);  	mce_threshold_vector();  	irq_exit(); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2bda212a001..5adce1040b1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -484,18 +484,195 @@ static inline int is_x86_event(struct perf_event *event)  	return event->pmu == &pmu;  } +/* + * Event scheduler state: + * + * Assign events iterating over all events and counters, beginning + * with events with least weights first. Keep the current iterator + * state in struct sched_state. + */ +struct sched_state { +	int	weight; +	int	event;		/* event index */ +	int	counter;	/* counter index */ +	int	unassigned;	/* number of events to be assigned left */ +	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +}; + +/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ +#define	SCHED_STATES_MAX	2 + +struct perf_sched { +	int			max_weight; +	int			max_events; +	struct event_constraint	**constraints; +	struct sched_state	state; +	int			saved_states; +	struct sched_state	saved[SCHED_STATES_MAX]; +}; + +/* + * Initialize interator that runs through all events and counters. 
+ */ +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c, +			    int num, int wmin, int wmax) +{ +	int idx; + +	memset(sched, 0, sizeof(*sched)); +	sched->max_events	= num; +	sched->max_weight	= wmax; +	sched->constraints	= c; + +	for (idx = 0; idx < num; idx++) { +		if (c[idx]->weight == wmin) +			break; +	} + +	sched->state.event	= idx;		/* start with min weight */ +	sched->state.weight	= wmin; +	sched->state.unassigned	= num; +} + +static void perf_sched_save_state(struct perf_sched *sched) +{ +	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) +		return; + +	sched->saved[sched->saved_states] = sched->state; +	sched->saved_states++; +} + +static bool perf_sched_restore_state(struct perf_sched *sched) +{ +	if (!sched->saved_states) +		return false; + +	sched->saved_states--; +	sched->state = sched->saved[sched->saved_states]; + +	/* continue with next counter: */ +	clear_bit(sched->state.counter++, sched->state.used); + +	return true; +} + +/* + * Select a counter for the current event to schedule. Return true on + * success. + */ +static bool __perf_sched_find_counter(struct perf_sched *sched) +{ +	struct event_constraint *c; +	int idx; + +	if (!sched->state.unassigned) +		return false; + +	if (sched->state.event >= sched->max_events) +		return false; + +	c = sched->constraints[sched->state.event]; + +	/* Prefer fixed purpose counters */ +	if (x86_pmu.num_counters_fixed) { +		idx = X86_PMC_IDX_FIXED; +		for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { +			if (!__test_and_set_bit(idx, sched->state.used)) +				goto done; +		} +	} +	/* Grab the first unused counter starting with idx */ +	idx = sched->state.counter; +	for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { +		if (!__test_and_set_bit(idx, sched->state.used)) +			goto done; +	} + +	return false; + +done: +	sched->state.counter = idx; + +	if (c->overlap) +		perf_sched_save_state(sched); + +	return true; +} + +static bool perf_sched_find_counter(struct perf_sched *sched) +{ +	while (!__perf_sched_find_counter(sched)) { +		if (!perf_sched_restore_state(sched)) +			return false; +	} + +	return true; +} + +/* + * Go through all unassigned events and find the next one to schedule. + * Take events with the least weight first. Return true on success. + */ +static bool perf_sched_next_event(struct perf_sched *sched) +{ +	struct event_constraint *c; + +	if (!sched->state.unassigned || !--sched->state.unassigned) +		return false; + +	do { +		/* next event */ +		sched->state.event++; +		if (sched->state.event >= sched->max_events) { +			/* next weight */ +			sched->state.event = 0; +			sched->state.weight++; +			if (sched->state.weight > sched->max_weight) +				return false; +		} +		c = sched->constraints[sched->state.event]; +	} while (c->weight != sched->state.weight); + +	sched->state.counter = 0;	/* start with first counter */ + +	return true; +} + +/* + * Assign a counter for each event. 
+ */ +static int perf_assign_events(struct event_constraint **constraints, int n, +			      int wmin, int wmax, int *assign) +{ +	struct perf_sched sched; + +	perf_sched_init(&sched, constraints, n, wmin, wmax); + +	do { +		if (!perf_sched_find_counter(&sched)) +			break;	/* failed */ +		if (assign) +			assign[sched.state.event] = sched.state.counter; +	} while (perf_sched_next_event(&sched)); + +	return sched.state.unassigned; +} +  int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)  {  	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];  	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; -	int i, j, w, wmax, num = 0; +	int i, wmin, wmax, num = 0;  	struct hw_perf_event *hwc;  	bitmap_zero(used_mask, X86_PMC_IDX_MAX); -	for (i = 0; i < n; i++) { +	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {  		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);  		constraints[i] = c; +		wmin = min(wmin, c->weight); +		wmax = max(wmax, c->weight);  	}  	/* @@ -521,60 +698,12 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)  		if (assign)  			assign[i] = hwc->idx;  	} -	if (i == n) -		goto done; - -	/* -	 * begin slow path -	 */ - -	bitmap_zero(used_mask, X86_PMC_IDX_MAX); -	/* -	 * weight = number of possible counters -	 * -	 * 1    = most constrained, only works on one counter -	 * wmax = least constrained, works on any counter -	 * -	 * assign events to counters starting with most -	 * constrained events. -	 */ -	wmax = x86_pmu.num_counters; +	/* slow path */ +	if (i != n) +		num = perf_assign_events(constraints, n, wmin, wmax, assign);  	/* -	 * when fixed event counters are present, -	 * wmax is incremented by 1 to account -	 * for one more choice -	 */ -	if (x86_pmu.num_counters_fixed) -		wmax++; - -	for (w = 1, num = n; num && w <= wmax; w++) { -		/* for each event */ -		for (i = 0; num && i < n; i++) { -			c = constraints[i]; -			hwc = &cpuc->event_list[i]->hw; - -			if (c->weight != w) -				continue; - -			for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { -				if (!test_bit(j, used_mask)) -					break; -			} - -			if (j == X86_PMC_IDX_MAX) -				break; - -			__set_bit(j, used_mask); - -			if (assign) -				assign[i] = j; -			num--; -		} -	} -done: -	/*  	 * scheduling failed or is just a simulation,  	 * free resources if necessary  	 */ @@ -1119,6 +1248,7 @@ static void __init pmu_check_apic(void)  static int __init init_hw_perf_events(void)  { +	struct x86_pmu_quirk *quirk;  	struct event_constraint *c;  	int err; @@ -1147,8 +1277,8 @@ static int __init init_hw_perf_events(void)  	pr_cont("%s PMU driver.\n", x86_pmu.name); -	if (x86_pmu.quirks) -		x86_pmu.quirks(); +	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) +		quirk->func();  	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {  		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", @@ -1171,12 +1301,18 @@ static int __init init_hw_perf_events(void)  	unconstrained = (struct event_constraint)  		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, -				   0, x86_pmu.num_counters); +				   0, x86_pmu.num_counters, 0);  	if (x86_pmu.event_constraints) { +		/* +		 * event on fixed counter2 (REF_CYCLES) only works on this +		 * counter, so do not extend mask to generic counters +		 */  		for_each_event_constraint(c, x86_pmu.event_constraints) { -			if (c->cmask != X86_RAW_EVENT_MASK) +			if (c->cmask != X86_RAW_EVENT_MASK +			    || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {  				continue; +			}  			c->idxmsk64 |= (1ULL << 
x86_pmu.num_counters) - 1;  			c->weight += x86_pmu.num_counters; @@ -1566,3 +1702,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs)  	return misc;  } + +void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ +	cap->version		= x86_pmu.version; +	cap->num_counters_gp	= x86_pmu.num_counters; +	cap->num_counters_fixed	= x86_pmu.num_counters_fixed; +	cap->bit_width_gp	= x86_pmu.cntval_bits; +	cap->bit_width_fixed	= x86_pmu.cntval_bits; +	cap->events_mask	= (unsigned int)x86_pmu.events_maskl; +	cap->events_mask_len	= x86_pmu.events_mask_len; +} +EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index b9698d40ac4..8944062f46e 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -45,6 +45,7 @@ struct event_constraint {  	u64	code;  	u64	cmask;  	int	weight; +	int	overlap;  };  struct amd_nb { @@ -151,15 +152,40 @@ struct cpu_hw_events {  	void				*kfree_on_online;  }; -#define __EVENT_CONSTRAINT(c, n, m, w) {\ +#define __EVENT_CONSTRAINT(c, n, m, w, o) {\  	{ .idxmsk64 = (n) },		\  	.code = (c),			\  	.cmask = (m),			\  	.weight = (w),			\ +	.overlap = (o),			\  }  #define EVENT_CONSTRAINT(c, n, m)	\ -	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) +	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0) + +/* + * The overlap flag marks event constraints with overlapping counter + * masks. This is the case if the counter mask of such an event is not + * a subset of any other counter mask of a constraint with an equal or + * higher weight, e.g.: + * + *  c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); + *  c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); + *  c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); + * + * The event scheduler may not select the correct counter in the first + * cycle because it needs to know which subsequent events will be + * scheduled. It may fail to schedule the events then. So we set the + * overlap flag for such constraints to give the scheduler a hint which + * events to select for counter rescheduling. + * + * Care must be taken as the rescheduling algorithm is O(n!) which + * will increase scheduling cycles for an over-commited system + * dramatically.  The number of such EVENT_CONSTRAINT_OVERLAP() macros + * and its counter masks must be kept at a minimum. + */ +#define EVENT_CONSTRAINT_OVERLAP(c, n, m)	\ +	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1)  /*   * Constraint on the Event code. 
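
The EVENT_CONSTRAINT_OVERLAP() comment above can be made concrete with a small user-space sketch (illustrative only, not part of this patch). It uses the AMD Fam15h-style masks from the example -- one event restricted to counters {0,3} (mask 0x09) and three events restricted to counters {0,1,2} (mask 0x07) -- and shows that a greedy pass which sticks with its first counter choice fails, while retrying the overlap-constrained event on its next allowed counter (what perf_sched_save_state()/perf_sched_restore_state() do in the scheduler added above, modelled here as plain backtracking) makes everything fit:

	/*
	 * Deliberately simplified model of the counter scheduler: not kernel
	 * code, no kernel APIs.  Masks mirror amd_f15_PMC30 (0x09) and
	 * amd_f15_PMC20 (0x07) from this patch.
	 */
	#include <stdio.h>
	#include <stdbool.h>

	#define NR_EVENTS   4
	#define NR_COUNTERS 6

	static const unsigned int idxmsk[NR_EVENTS] = { 0x09, 0x07, 0x07, 0x07 };

	static bool assign(int ev, unsigned int used, int *slot, bool backtrack)
	{
		unsigned int avail;
		int idx;

		if (ev == NR_EVENTS)
			return true;

		avail = idxmsk[ev] & ~used;
		for (idx = 0; idx < NR_COUNTERS; idx++) {
			if (!(avail & (1u << idx)))
				continue;
			slot[ev] = idx;
			if (assign(ev + 1, used | (1u << idx), slot, backtrack))
				return true;
			if (!backtrack)	/* greedy: keep the first free counter */
				return false;
		}
		return false;
	}

	int main(void)
	{
		int slot[NR_EVENTS], i;

		printf("greedy only      : %s\n",
		       assign(0, 0, slot, false) ? "fits" : "fails");

		if (assign(0, 0, slot, true)) {
			printf("with backtracking: fits\n");
			for (i = 0; i < NR_EVENTS; i++)
				printf("  event %d (mask 0x%02x) -> counter %d\n",
				       i, idxmsk[i], slot[i]);
		}
		return 0;
	}

Because the retry re-opens earlier counter choices, the worst case grows factorially with the number of overlapping constraints, which is why the comment asks for EVENT_CONSTRAINT_OVERLAP() usage to be kept to a minimum.
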
@@ -235,6 +261,11 @@ union perf_capabilities {  	u64	capabilities;  }; +struct x86_pmu_quirk { +	struct x86_pmu_quirk *next; +	void (*func)(void); +}; +  /*   * struct x86_pmu - generic x86 pmu   */ @@ -259,6 +290,11 @@ struct x86_pmu {  	int		num_counters_fixed;  	int		cntval_bits;  	u64		cntval_mask; +	union { +			unsigned long events_maskl; +			unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; +	}; +	int		events_mask_len;  	int		apic;  	u64		max_period;  	struct event_constraint * @@ -268,7 +304,7 @@ struct x86_pmu {  	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,  						 struct perf_event *event);  	struct event_constraint *event_constraints; -	void		(*quirks)(void); +	struct x86_pmu_quirk *quirks;  	int		perfctr_second_write;  	int		(*cpu_prepare)(int cpu); @@ -309,6 +345,15 @@ struct x86_pmu {  	struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);  }; +#define x86_add_quirk(func_)						\ +do {									\ +	static struct x86_pmu_quirk __quirk __initdata = {		\ +		.func = func_,						\ +	};								\ +	__quirk.next = x86_pmu.quirks;					\ +	x86_pmu.quirks = &__quirk;					\ +} while (0) +  #define ERF_NO_HT_SHARING	1  #define ERF_HAS_RSP_1		2 diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index aeefd45697a..0397b23be8e 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -492,7 +492,7 @@ static __initconst const struct x86_pmu amd_pmu = {  static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);  static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);  static struct event_constraint amd_f15_PMC3  = EVENT_CONSTRAINT(0, 0x08, 0); -static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);  static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);  static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 8d601b18bf9..3bd37bdf1b8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -28,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =    [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,    [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,    [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c, +  [PERF_COUNT_HW_REF_CPU_CYCLES]	= 0x0300, /* pseudo-encoding */  };  static struct event_constraint intel_core_event_constraints[] __read_mostly = @@ -45,12 +46,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly =  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	/* -	 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event -	 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed -	 * ratio between these counters. 
-	 */ -	/* FIXED_EVENT_CONSTRAINT(0x013c, 2),  CPU_CLK_UNHALTED.REF */ +	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */  	INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */  	INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */  	INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -68,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ +	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */  	INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */  	INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */  	INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -90,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ +	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */  	INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */  	INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */  	INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -102,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ +	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */  	INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */  	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */  	INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ @@ -125,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ +	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */  	EVENT_CONSTRAINT_END  }; @@ -1169,7 +1165,7 @@ again:  		 */  		c = &unconstrained;  	} else if (intel_try_alt_er(event, orig_idx)) { -		raw_spin_unlock(&era->lock); +		raw_spin_unlock_irqrestore(&era->lock, flags);  		goto again;  	}  	raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1519,7 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = {  	.guest_get_msrs		= intel_guest_get_msrs,  }; -static void intel_clovertown_quirks(void) +static __init void intel_clovertown_quirk(void)  {  	/*  	 * PEBS is unreliable due to: @@ -1545,19 +1541,60 @@ static void intel_clovertown_quirks(void)  	x86_pmu.pebs_constraints = NULL;  } -static void intel_sandybridge_quirks(void) +static __init void intel_sandybridge_quirk(void)  {  	printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");  	x86_pmu.pebs = 0;  	x86_pmu.pebs_constraints = NULL;  } +static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { +	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, +	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, +	{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, +	{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, +	{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, +	{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, +	{ 
PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void intel_arch_events_quirk(void) +{ +	int bit; + +	/* disable event that reported as not presend by cpuid */ +	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { +		intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; +		printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", +				intel_arch_events_map[bit].name); +	} +} + +static __init void intel_nehalem_quirk(void) +{ +	union cpuid10_ebx ebx; + +	ebx.full = x86_pmu.events_maskl; +	if (ebx.split.no_branch_misses_retired) { +		/* +		 * Erratum AAJ80 detected, we work it around by using +		 * the BR_MISP_EXEC.ANY event. This will over-count +		 * branch-misses, but it's still much better than the +		 * architectural event which is often completely bogus: +		 */ +		intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; +		ebx.split.no_branch_misses_retired = 0; +		x86_pmu.events_maskl = ebx.full; +		printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); +	} +} +  __init int intel_pmu_init(void)  {  	union cpuid10_edx edx;  	union cpuid10_eax eax; +	union cpuid10_ebx ebx;  	unsigned int unused; -	unsigned int ebx;  	int version;  	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { @@ -1574,8 +1611,8 @@ __init int intel_pmu_init(void)  	 * Check whether the Architectural PerfMon supports  	 * Branch Misses Retired hw_event or not.  	 */ -	cpuid(10, &eax.full, &ebx, &unused, &edx.full); -	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) +	cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); +	if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)  		return -ENODEV;  	version = eax.split.version_id; @@ -1589,6 +1626,9 @@ __init int intel_pmu_init(void)  	x86_pmu.cntval_bits		= eax.split.bit_width;  	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1; +	x86_pmu.events_maskl		= ebx.full; +	x86_pmu.events_mask_len		= eax.split.mask_length; +  	/*  	 * Quirk: v2 perfmon does not report fixed-purpose events, so  	 * assume at least 3 events: @@ -1608,6 +1648,8 @@ __init int intel_pmu_init(void)  	intel_ds_init(); +	x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ +  	/*  	 * Install the hw-cache-events table:  	 */ @@ -1617,7 +1659,7 @@ __init int intel_pmu_init(void)  		break;  	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ -		x86_pmu.quirks = intel_clovertown_quirks; +		x86_add_quirk(intel_clovertown_quirk);  	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */  	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */  	case 29: /* six-core 45 nm xeon "Dunnington" */ @@ -1651,17 +1693,8 @@ __init int intel_pmu_init(void)  		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */  		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; -		if (ebx & 0x40) { -			/* -			 * Erratum AAJ80 detected, we work it around by using -			 * the BR_MISP_EXEC.ANY event. 
This will over-count -			 * branch-misses, but it's still much better than the -			 * architectural event which is often completely bogus: -			 */ -			intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; +		x86_add_quirk(intel_nehalem_quirk); -			pr_cont("erratum AAJ80 worked around, "); -		}  		pr_cont("Nehalem events, ");  		break; @@ -1701,7 +1734,7 @@ __init int intel_pmu_init(void)  		break;  	case 42: /* SandyBridge */ -		x86_pmu.quirks = intel_sandybridge_quirks; +		x86_add_quirk(intel_sandybridge_quirk);  	case 45: /* SandyBridge, "Romely-EP" */  		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,  		       sizeof(hw_cache_event_ids)); @@ -1738,5 +1771,6 @@ __init int intel_pmu_init(void)  			break;  		}  	} +  	return 0;  } diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c index 5abbea297e0..7b3fe56b1c2 100644 --- a/arch/x86/kernel/cpu/powerflags.c +++ b/arch/x86/kernel/cpu/powerflags.c @@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = {  	"100mhzsteps",  	"hwpstate",  	"",	/* tsc invariant mapped to constant_tsc */ -		/* nothing */ +	"cpb",  /* core performance boost */ +	"eff_freq_ro", /* Readonly aperf/mperf */  }; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 14b23140e81..8022c668148 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)  static int show_cpuinfo(struct seq_file *m, void *v)  {  	struct cpuinfo_x86 *c = v; -	unsigned int cpu = 0; +	unsigned int cpu;  	int i; -#ifdef CONFIG_SMP  	cpu = c->cpu_index; -#endif  	seq_printf(m, "processor\t: %u\n"  		   "vendor_id\t: %s\n"  		   "cpu family\t: %d\n" diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 212a6a42527..a524353d93f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =  	.notifier_call = cpuid_class_cpu_callback,  }; -static char *cpuid_devnode(struct device *dev, mode_t *mode) +static char *cpuid_devnode(struct device *dev, umode_t *mode)  {  	return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));  } diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 3b97a80ce32..c99f9ed013d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -116,16 +116,16 @@ void show_registers(struct pt_regs *regs)  		for (i = 0; i < code_len; i++, ip++) {  			if (ip < (u8 *)PAGE_OFFSET ||  					probe_kernel_address(ip, c)) { -				printk(" Bad EIP value."); +				printk(KERN_CONT " Bad EIP value.");  				break;  			}  			if (ip == (u8 *)regs->ip) -				printk("<%02x> ", c); +				printk(KERN_CONT "<%02x> ", c);  			else -				printk("%02x ", c); +				printk(KERN_CONT "%02x ", c);  		}  	} -	printk("\n"); +	printk(KERN_CONT "\n");  }  int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 19853ad8afc..6d728d9284b 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -284,16 +284,16 @@ void show_registers(struct pt_regs *regs)  		for (i = 0; i < code_len; i++, ip++) {  			if (ip < (u8 *)PAGE_OFFSET ||  					probe_kernel_address(ip, c)) { -				printk(" Bad RIP value."); +				printk(KERN_CONT " Bad RIP value.");  				break;  			}  			if (ip == (u8 *)regs->ip) -				printk("<%02x> ", c); +				printk(KERN_CONT "<%02x> ", c);  			else -				printk("%02x ", c); +				printk(KERN_CONT 
"%02x ", c);  		}  	} -	printk("\n"); +	printk(KERN_CONT "\n");  }  int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 65ffd110a81..8071e2f3d6e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -135,7 +135,6 @@ static void __init e820_print_type(u32 type)  		printk(KERN_CONT "(usable)");  		break;  	case E820_RESERVED: -	case E820_RESERVED_EFI:  		printk(KERN_CONT "(reserved)");  		break;  	case E820_ACPI: @@ -739,35 +738,17 @@ core_initcall(e820_mark_nvs_memory);  /*   * pre allocated 4k and reserved it in memblock and e820_saved   */ -u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) +u64 __init early_reserve_e820(u64 size, u64 align)  { -	u64 size = 0;  	u64 addr; -	u64 start; -	for (start = startt; ; start += size) { -		start = memblock_x86_find_in_range_size(start, &size, align); -		if (start == MEMBLOCK_ERROR) -			return 0; -		if (size >= sizet) -			break; +	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); +	if (addr) { +		e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); +		printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); +		update_e820_saved();  	} -#ifdef CONFIG_X86_32 -	if (start >= MAXMEM) -		return 0; -	if (start + size > MAXMEM) -		size = MAXMEM - start; -#endif - -	addr = round_down(start + size - sizet, align); -	if (addr < start) -		return 0; -	memblock_x86_reserve_range(addr, addr + sizet, "new next"); -	e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); -	printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); -	update_e820_saved(); -  	return addr;  } @@ -784,7 +765,7 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)  /*   * Find the highest page frame number we have available   */ -unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)  {  	int i;  	unsigned long last_pfn = 0; @@ -1091,7 +1072,7 @@ void __init memblock_x86_fill(void)  	 * We are safe to enable resizing, beause memblock_x86_fill()  	 * is rather later for x86  	 */ -	memblock_can_resize = 1; +	memblock_allow_resize();  	for (i = 0; i < e820.nr_map; i++) {  		struct e820entry *ei = &e820.map[i]; @@ -1106,22 +1087,36 @@ void __init memblock_x86_fill(void)  		memblock_add(ei->addr, ei->size);  	} -	memblock_analyze();  	memblock_dump_all();  }  void __init memblock_find_dma_reserve(void)  {  #ifdef CONFIG_X86_64 -	u64 free_size_pfn; -	u64 mem_size_pfn; +	u64 nr_pages = 0, nr_free_pages = 0; +	unsigned long start_pfn, end_pfn; +	phys_addr_t start, end; +	int i; +	u64 u; +  	/*  	 * need to find out used area below MAX_DMA_PFN  	 * need to use memblock to get free size in [0, MAX_DMA_PFN]  	 * at first, and assume boot_mem will not take below MAX_DMA_PFN  	 */ -	mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; -	free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; -	set_dma_reserve(mem_size_pfn - free_size_pfn); +	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { +		start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); +		end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); +		nr_pages += end_pfn - start_pfn; +	} + +	for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { +		start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); +		end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); +		if (start_pfn < end_pfn) +			
nr_free_pages += end_pfn - start_pfn; +	} + +	set_dma_reserve(nr_pages - nr_free_pages);  #endif  } diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index cd28a350f7f..9d42a52d233 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -247,7 +247,7 @@ static int __init setup_early_printk(char *buf)  		}  		if (!strncmp(buf, "hsu", 3)) { -			hsu_early_console_init(); +			hsu_early_console_init(buf + 3);  			early_console_register(&early_hsu_console, keep);  		}  #endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f3f6f534400..22d0e21b4dd 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -625,6 +625,8 @@ work_notifysig:				# deal with pending signals and  	movl %esp, %eax  	jne work_notifysig_v86		# returning to kernel-space or  					# vm86-space +	TRACE_IRQS_ON +	ENABLE_INTERRUPTS(CLBR_NONE)  	xorl %edx, %edx  	call do_notify_resume  	jmp resume_userspace_sig @@ -638,6 +640,8 @@ work_notifysig_v86:  #else  	movl %esp, %eax  #endif +	TRACE_IRQS_ON +	ENABLE_INTERRUPTS(CLBR_NONE)  	xorl %edx, %edx  	call do_notify_resume  	jmp resume_userspace_sig diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index faf8d5e74b0..a20e1cb9dc8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -221,7 +221,7 @@ ENDPROC(native_usergs_sysret64)  	/*CFI_REL_OFFSET	ss,0*/  	pushq_cfi %rax /* rsp */  	CFI_REL_OFFSET	rsp,0 -	pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ +	pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */  	/*CFI_REL_OFFSET	rflags,0*/  	pushq_cfi $__KERNEL_CS /* cs */  	/*CFI_REL_OFFSET	cs,0*/ @@ -411,7 +411,7 @@ ENTRY(ret_from_fork)  	RESTORE_REST  	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread? -	je   int_ret_from_sys_call +	jz   retint_restore_args  	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET  	jnz  int_ret_from_sys_call @@ -465,7 +465,7 @@ ENTRY(system_call)  	 * after the swapgs, so that it can do the swapgs  	 * for the guest and jump here on syscall.  	 
*/ -ENTRY(system_call_after_swapgs) +GLOBAL(system_call_after_swapgs)  	movq	%rsp,PER_CPU_VAR(old_rsp)  	movq	PER_CPU_VAR(kernel_stack),%rsp @@ -478,8 +478,7 @@ ENTRY(system_call_after_swapgs)  	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)  	movq  %rcx,RIP-ARGOFFSET(%rsp)  	CFI_REL_OFFSET rip,RIP-ARGOFFSET -	GET_THREAD_INFO(%rcx) -	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) +	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)  	jnz tracesys  system_call_fastpath:  	cmpq $__NR_syscall_max,%rax @@ -496,10 +495,9 @@ ret_from_sys_call:  	/* edi:	flagmask */  sysret_check:  	LOCKDEP_SYS_EXIT -	GET_THREAD_INFO(%rcx)  	DISABLE_INTERRUPTS(CLBR_NONE)  	TRACE_IRQS_OFF -	movl TI_flags(%rcx),%edx +	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx  	andl %edi,%edx  	jnz  sysret_careful  	CFI_REMEMBER_STATE @@ -583,7 +581,7 @@ sysret_audit:  	/* Do syscall tracing */  tracesys:  #ifdef CONFIG_AUDITSYSCALL -	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) +	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)  	jz auditsys  #endif  	SAVE_REST @@ -612,8 +610,6 @@ tracesys:  GLOBAL(int_ret_from_sys_call)  	DISABLE_INTERRUPTS(CLBR_NONE)  	TRACE_IRQS_OFF -	testl $3,CS-ARGOFFSET(%rsp) -	je retint_restore_args  	movl $_TIF_ALLWORK_MASK,%edi  	/* edi:	mask to check */  GLOBAL(int_with_check) @@ -953,6 +949,7 @@ END(common_interrupt)  ENTRY(\sym)  	INTR_FRAME  	pushq_cfi $~(\num) +.Lcommon_\sym:  	interrupt \do_sym  	jmp ret_from_intr  	CFI_ENDPROC @@ -976,13 +973,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \  	x86_platform_ipi smp_x86_platform_ipi  #ifdef CONFIG_SMP -.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ +	ALIGN +	INTR_FRAME +.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \  	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31  .if NUM_INVALIDATE_TLB_VECTORS > \idx -apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ -	invalidate_interrupt\idx smp_invalidate_interrupt +ENTRY(invalidate_interrupt\idx) +	pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx) +	jmp .Lcommon_invalidate_interrupt0 +	CFI_ADJUST_CFA_OFFSET -8 +END(invalidate_interrupt\idx)  .endif  .endr +	CFI_ENDPROC +apicinterrupt INVALIDATE_TLB_VECTOR_START, \ +	invalidate_interrupt0, smp_invalidate_interrupt  #endif  apicinterrupt THRESHOLD_APIC_VECTOR \ diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index af0699ba48c..48d9d4ea102 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -52,5 +52,5 @@ void __init reserve_ebda_region(void)  		lowmem = 0x9f000;  	/* reserve all memory between lowmem and the 1MB mark */ -	memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); +	memblock_reserve(lowmem, 0x100000 - lowmem);  } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3bb08509a7a..51ff18616d5 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,9 +31,8 @@ static void __init i386_default_early_setup(void)  void __init i386_start_kernel(void)  { -	memblock_init(); - -	memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); +	memblock_reserve(__pa_symbol(&_text), +			 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));  #ifdef CONFIG_BLK_DEV_INITRD  	/* Reserve INITRD */ @@ -42,7 +41,7 @@ void __init i386_start_kernel(void)  		u64 ramdisk_image = boot_params.hdr.ramdisk_image;  		u64 ramdisk_size  = boot_params.hdr.ramdisk_size;  		u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size); -		memblock_x86_reserve_range(ramdisk_image, ramdisk_end, 
"RAMDISK"); +		memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);  	}  #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5655c2272ad..3a3b779f41d 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -98,9 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data)  {  	copy_bootdata(__va(real_mode_data)); -	memblock_init(); - -	memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); +	memblock_reserve(__pa_symbol(&_text), +			 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));  #ifdef CONFIG_BLK_DEV_INITRD  	/* Reserve INITRD */ @@ -109,7 +108,7 @@ void __init x86_64_start_reservations(char *real_mode_data)  		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;  		unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;  		unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size); -		memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); +		memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);  	}  #endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 1bb0bf4d92c..ad0de0c2714 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -2,7 +2,6 @@  #include <linux/clockchips.h>  #include <linux/interrupt.h>  #include <linux/export.h> -#include <linux/sysdev.h>  #include <linux/delay.h>  #include <linux/errno.h>  #include <linux/i8253.h> @@ -32,8 +31,6 @@  #define HPET_MIN_CYCLES			128  #define HPET_MIN_PROG_DELTA		(HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) -#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) -  /*   * HPET address is set in acpi/boot.c, when an ACPI entry exists   */ @@ -55,6 +52,11 @@ struct hpet_dev {  	char				name[10];  }; +inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +{ +	return container_of(evtdev, struct hpet_dev, evt); +} +  inline unsigned int hpet_readl(unsigned int a)  {  	return readl(hpet_virt_address + a); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 429e0c92924..7943e0c21bd 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -74,6 +74,10 @@ int arch_show_interrupts(struct seq_file *p, int prec)  	for_each_online_cpu(j)  		seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);  	seq_printf(p, "  IRQ work interrupts\n"); +	seq_printf(p, "%*s: ", prec, "RTR"); +	for_each_online_cpu(j) +		seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); +	seq_printf(p, "  APIC ICR read retries\n");  #endif  	if (x86_platform_ipi_callback) {  		seq_printf(p, "%*s: ", prec, "PLT"); @@ -136,6 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)  	sum += irq_stats(cpu)->irq_spurious_count;  	sum += irq_stats(cpu)->apic_perf_irqs;  	sum += irq_stats(cpu)->apic_irq_work_irqs; +	sum += irq_stats(cpu)->icr_read_retry_count;  #endif  	if (x86_platform_ipi_callback)  		sum += irq_stats(cpu)->x86_platform_ipis; @@ -181,8 +186,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)  	unsigned vector = ~regs->orig_ax;  	unsigned irq; -	exit_idle();  	irq_enter(); +	exit_idle();  	irq = __this_cpu_read(vector_irq[vector]); @@ -209,10 +214,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs)  	ack_APIC_irq(); -	exit_idle(); -  	irq_enter(); +	exit_idle(); +  	inc_irq_stat(x86_platform_ipis);  	if (x86_platform_ipi_callback) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index b3300e6bace..313fb5cddbc 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -9,7 +9,7 @@  #include 
<linux/kprobes.h>  #include <linux/init.h>  #include <linux/kernel_stat.h> -#include <linux/sysdev.h> +#include <linux/device.h>  #include <linux/bitops.h>  #include <linux/acpi.h>  #include <linux/io.h> diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index ea9d5f2f13e..2889b3d4388 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry,  	put_online_cpus();  } -void arch_jump_label_transform_static(struct jump_entry *entry, +__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,  				      enum jump_label_type type)  {  	__jump_label_transform(entry, type, text_poke_early); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index d494799aafc..fe86493f3ed 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -1,14 +1,18 @@  /*   *  AMD CPU Microcode Update Driver for Linux - *  Copyright (C) 2008 Advanced Micro Devices Inc. + *  Copyright (C) 2008-2011 Advanced Micro Devices Inc.   *   *  Author: Peter Oruba <peter.oruba@amd.com>   *   *  Based on work by:   *  Tigran Aivazian <tigran@aivazian.fsnet.co.uk>   * - *  This driver allows to upgrade microcode on AMD - *  family 0x10 and 0x11 processors. + *  Maintainers: + *  Andreas Herrmann <andreas.herrmann3@amd.com> + *  Borislav Petkov <borislav.petkov@amd.com> + * + *  This driver allows to upgrade microcode on F10h AMD + *  CPUs and later.   *   *  Licensed under the terms of the GNU General Public   *  License version 2. See file COPYING for details. @@ -71,6 +75,9 @@ struct microcode_amd {  static struct equiv_cpu_entry *equiv_cpu_table; +/* page-sized ucode patch buffer */ +void *patch; +  static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)  {  	struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -86,27 +93,76 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)  	return 0;  } -static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, -				  int rev) +static unsigned int verify_ucode_size(int cpu, u32 patch_size, +				      unsigned int size)  { -	unsigned int current_cpu_id; -	u16 equiv_cpu_id = 0; -	unsigned int i = 0; +	struct cpuinfo_x86 *c = &cpu_data(cpu); +	u32 max_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 + +	switch (c->x86) { +	case 0x14: +		max_size = F14H_MPB_MAX_SIZE; +		break; +	case 0x15: +		max_size = F15H_MPB_MAX_SIZE; +		break; +	default: +		max_size = F1XH_MPB_MAX_SIZE; +		break; +	} + +	if (patch_size > min_t(u32, size, max_size)) { +		pr_err("patch size mismatch\n"); +		return 0; +	} + +	return patch_size; +} + +static u16 find_equiv_id(void) +{ +	unsigned int current_cpu_id, i = 0;  	BUG_ON(equiv_cpu_table == NULL); +  	current_cpu_id = cpuid_eax(0x00000001);  	while (equiv_cpu_table[i].installed_cpu != 0) { -		if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { -			equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; -			break; -		} +		if (current_cpu_id == equiv_cpu_table[i].installed_cpu) +			return equiv_cpu_table[i].equiv_cpu; +  		i++;  	} +	return 0; +} +/* + * we signal a good patch is found by returning its size > 0 + */ +static int get_matching_microcode(int cpu, const u8 *ucode_ptr, +				  unsigned int leftover_size, int rev, +				  unsigned int *current_size) +{ +	struct microcode_header_amd *mc_hdr; +	unsigned int actual_size; +	u16 equiv_cpu_id; + +	/* size of the current patch we're staring 
at */ +	*current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; + +	equiv_cpu_id = find_equiv_id();  	if (!equiv_cpu_id)  		return 0; +	/* +	 * let's look at the patch header itself now +	 */ +	mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE); +  	if (mc_hdr->processor_rev_id != equiv_cpu_id)  		return 0; @@ -120,7 +176,20 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,  	if (mc_hdr->patch_id <= rev)  		return 0; -	return 1; +	/* +	 * now that the header looks sane, verify its size +	 */ +	actual_size = verify_ucode_size(cpu, *current_size, leftover_size); +	if (!actual_size) +		return 0; + +	/* clear the patch buffer */ +	memset(patch, 0, PAGE_SIZE); + +	/* all looks ok, get the binary patch */ +	get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size); + +	return actual_size;  }  static int apply_microcode_amd(int cpu) @@ -155,63 +224,6 @@ static int apply_microcode_amd(int cpu)  	return 0;  } -static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) -{ -	struct cpuinfo_x86 *c = &cpu_data(cpu); -	u32 max_size, actual_size; - -#define F1XH_MPB_MAX_SIZE 2048 -#define F14H_MPB_MAX_SIZE 1824 -#define F15H_MPB_MAX_SIZE 4096 - -	switch (c->x86) { -	case 0x14: -		max_size = F14H_MPB_MAX_SIZE; -		break; -	case 0x15: -		max_size = F15H_MPB_MAX_SIZE; -		break; -	default: -		max_size = F1XH_MPB_MAX_SIZE; -		break; -	} - -	actual_size = *(u32 *)(buf + 4); - -	if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) { -		pr_err("section size mismatch\n"); -		return 0; -	} - -	return actual_size; -} - -static struct microcode_header_amd * -get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) -{ -	struct microcode_header_amd *mc = NULL; -	unsigned int actual_size = 0; - -	if (*(u32 *)buf != UCODE_UCODE_TYPE) { -		pr_err("invalid type field in container file section header\n"); -		goto out; -	} - -	actual_size = verify_ucode_size(cpu, buf, size); -	if (!actual_size) -		goto out; - -	mc = vzalloc(actual_size); -	if (!mc) -		goto out; - -	get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size); -	*mc_size = actual_size + SECTION_HDR_SIZE; - -out: -	return mc; -} -  static int install_equiv_cpu_table(const u8 *buf)  {  	unsigned int *ibuf = (unsigned int *)buf; @@ -247,36 +259,38 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;  	struct microcode_header_amd *mc_hdr = NULL; -	unsigned int mc_size, leftover; +	unsigned int mc_size, leftover, current_size = 0;  	int offset;  	const u8 *ucode_ptr = data;  	void *new_mc = NULL;  	unsigned int new_rev = uci->cpu_sig.rev; -	enum ucode_state state = UCODE_OK; +	enum ucode_state state = UCODE_ERROR;  	offset = install_equiv_cpu_table(ucode_ptr);  	if (offset < 0) {  		pr_err("failed to create equivalent cpu table\n"); -		return UCODE_ERROR; +		goto out;  	} -  	ucode_ptr += offset;  	leftover = size - offset; -	while (leftover) { -		mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); -		if (!mc_hdr) -			break; +	if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) { +		pr_err("invalid type field in container file section header\n"); +		goto free_table; +	} -		if (get_matching_microcode(cpu, mc_hdr, new_rev)) { -			vfree(new_mc); +	while (leftover) { +		mc_size = get_matching_microcode(cpu, ucode_ptr, leftover, +						 new_rev, ¤t_size); +		if (mc_size) { +			mc_hdr  = patch; +			new_mc  = patch;  			new_rev = mc_hdr->patch_id; -			new_mc  = mc_hdr; -		} else -			
vfree(mc_hdr); +			goto out_ok; +		} -		ucode_ptr += mc_size; -		leftover  -= mc_size; +		ucode_ptr += current_size; +		leftover  -= current_size;  	}  	if (!new_mc) { @@ -284,19 +298,16 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)  		goto free_table;  	} -	if (!leftover) { -		vfree(uci->mc); -		uci->mc = new_mc; -		pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", -			 cpu, uci->cpu_sig.rev, new_rev); -	} else { -		vfree(new_mc); -		state = UCODE_ERROR; -	} +out_ok: +	uci->mc = new_mc; +	state = UCODE_OK; +	pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", +		 cpu, uci->cpu_sig.rev, new_rev);  free_table:  	free_equiv_cpu_table(); +out:  	return state;  } @@ -337,7 +348,6 @@ static void microcode_fini_cpu_amd(int cpu)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; -	vfree(uci->mc);  	uci->mc = NULL;  } @@ -351,5 +361,14 @@ static struct microcode_ops microcode_amd_ops = {  struct microcode_ops * __init init_amd_microcode(void)  { +	patch = (void *)get_zeroed_page(GFP_KERNEL); +	if (!patch) +		return NULL; +  	return µcode_amd_ops;  } + +void __exit exit_amd_microcode(void) +{ +	free_page((unsigned long)patch); +} diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9d46f5e43b5..fda91c30710 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu)  	return err;  } -static ssize_t reload_store(struct sys_device *dev, -			    struct sysdev_attribute *attr, +static ssize_t reload_store(struct device *dev, +			    struct device_attribute *attr,  			    const char *buf, size_t size)  {  	unsigned long val; @@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev,  	return ret;  } -static ssize_t version_show(struct sys_device *dev, -			struct sysdev_attribute *attr, char *buf) +static ssize_t version_show(struct device *dev, +			struct device_attribute *attr, char *buf)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;  	return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);  } -static ssize_t pf_show(struct sys_device *dev, -			struct sysdev_attribute *attr, char *buf) +static ssize_t pf_show(struct device *dev, +			struct device_attribute *attr, char *buf)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;  	return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);  } -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); +static DEVICE_ATTR(reload, 0200, NULL, reload_store); +static DEVICE_ATTR(version, 0400, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);  static struct attribute *mc_default_attrs[] = { -	&attr_reload.attr, -	&attr_version.attr, -	&attr_processor_flags.attr, +	&dev_attr_reload.attr, +	&dev_attr_version.attr, +	&dev_attr_processor_flags.attr,  	NULL  }; @@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu)  	return ustate;  } -static int mc_sysdev_add(struct sys_device *sys_dev) +static int mc_device_add(struct device *dev, struct subsys_interface *sif)  { -	int err, cpu = sys_dev->id; +	int err, cpu = dev->id;  	if (!cpu_online(cpu))  		return 0;  	pr_debug("CPU%d added\n", cpu); -	err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); +	err = sysfs_create_group(&dev->kobj, &mc_attr_group);  	if (err)  		return err;  	if (microcode_init_cpu(cpu) == UCODE_ERROR) { -		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); +		
sysfs_remove_group(&dev->kobj, &mc_attr_group);  		return -EINVAL;  	}  	return err;  } -static int mc_sysdev_remove(struct sys_device *sys_dev) +static int mc_device_remove(struct device *dev, struct subsys_interface *sif)  { -	int cpu = sys_dev->id; +	int cpu = dev->id;  	if (!cpu_online(cpu))  		return 0;  	pr_debug("CPU%d removed\n", cpu);  	microcode_fini_cpu(cpu); -	sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); +	sysfs_remove_group(&dev->kobj, &mc_attr_group);  	return 0;  } -static struct sysdev_driver mc_sysdev_driver = { -	.add			= mc_sysdev_add, -	.remove			= mc_sysdev_remove, +static struct subsys_interface mc_cpu_interface = { +	.name			= "microcode", +	.subsys			= &cpu_subsys, +	.add_dev		= mc_device_add, +	.remove_dev		= mc_device_remove,  };  /** @@ -464,9 +466,9 @@ static __cpuinit int  mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)  {  	unsigned int cpu = (unsigned long)hcpu; -	struct sys_device *sys_dev; +	struct device *dev; -	sys_dev = get_cpu_sysdev(cpu); +	dev = get_cpu_device(cpu);  	switch (action) {  	case CPU_ONLINE:  	case CPU_ONLINE_FROZEN: @@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)  	case CPU_DOWN_FAILED:  	case CPU_DOWN_FAILED_FROZEN:  		pr_debug("CPU%d added\n", cpu); -		if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) +		if (sysfs_create_group(&dev->kobj, &mc_attr_group))  			pr_err("Failed to create group for CPU%d\n", cpu);  		break;  	case CPU_DOWN_PREPARE:  	case CPU_DOWN_PREPARE_FROZEN:  		/* Suspend is in progress, only remove the interface */ -		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); +		sysfs_remove_group(&dev->kobj, &mc_attr_group);  		pr_debug("CPU%d removed\n", cpu);  		break; @@ -525,7 +527,7 @@ static int __init microcode_init(void)  	get_online_cpus();  	mutex_lock(µcode_mutex); -	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); +	error = subsys_interface_register(&mc_cpu_interface);  	mutex_unlock(µcode_mutex);  	put_online_cpus(); @@ -535,7 +537,7 @@ static int __init microcode_init(void)  	error = microcode_dev_init();  	if (error) -		goto out_sysdev_driver; +		goto out_driver;  	register_syscore_ops(&mc_syscore_ops);  	register_hotcpu_notifier(&mc_cpu_notifier); @@ -545,11 +547,11 @@ static int __init microcode_init(void)  	return 0; -out_sysdev_driver: +out_driver:  	get_online_cpus();  	mutex_lock(µcode_mutex); -	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); +	subsys_interface_unregister(&mc_cpu_interface);  	mutex_unlock(µcode_mutex);  	put_online_cpus(); @@ -563,6 +565,8 @@ module_init(microcode_init);  static void __exit microcode_exit(void)  { +	struct cpuinfo_x86 *c = &cpu_data(0); +  	microcode_dev_exit();  	unregister_hotcpu_notifier(&mc_cpu_notifier); @@ -571,7 +575,7 @@ static void __exit microcode_exit(void)  	get_online_cpus();  	mutex_lock(µcode_mutex); -	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); +	subsys_interface_unregister(&mc_cpu_interface);  	mutex_unlock(µcode_mutex);  	put_online_cpus(); @@ -580,6 +584,9 @@ static void __exit microcode_exit(void)  	microcode_ops = NULL; +	if (c->x86_vendor == X86_VENDOR_AMD) +		exit_amd_microcode(); +  	pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");  }  module_exit(microcode_exit); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0741b062a30..ca470e4c92d 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -564,9 +564,7 @@ void __init 
default_get_smp_config(unsigned int early)  static void __init smp_reserve_memory(struct mpf_intel *mpf)  { -	unsigned long size = get_mpc_size(mpf->physptr); - -	memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc"); +	memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));  }  static int __init smp_scan_config(unsigned long base, unsigned long length) @@ -595,7 +593,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)  			       mpf, (u64)virt_to_phys(mpf));  			mem = virt_to_phys(mpf); -			memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf"); +			memblock_reserve(mem, sizeof(*mpf));  			if (mpf->physptr)  				smp_reserve_memory(mpf); @@ -836,10 +834,8 @@ early_param("alloc_mptable", parse_alloc_mptable_opt);  void __init early_reserve_e820_mpc_new(void)  { -	if (enable_update_mptable && alloc_mptable) { -		u64 startt = 0; -		mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); -	} +	if (enable_update_mptable && alloc_mptable) +		mpc_new_phys = early_reserve_e820(mpc_new_length, 4);  }  static int __init update_mp_table(void) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 12fcbe2c143..96356762a51 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {  	.notifier_call = msr_class_cpu_callback,  }; -static char *msr_devnode(struct device *dev, mode_t *mode) +static char *msr_devnode(struct device *dev, umode_t *mode)  {  	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));  } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ee5d4fbd53b..15763af7bfe 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)  	regs.orig_ax = -1;  	regs.ip = (unsigned long) kernel_thread_helper;  	regs.cs = __KERNEL_CS | get_kernel_rpl(); -	regs.flags = X86_EFLAGS_IF | 0x2; +	regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;  	/* Ok, create the new process.. 
*/  	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 795b79f984c..485204f58cd 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,8 @@ void cpu_idle(void)  	/* endless idle loop with no priority at all */  	while (1) { -		tick_nohz_stop_sched_tick(1); +		tick_nohz_idle_enter(); +		rcu_idle_enter();  		while (!need_resched()) {  			check_pgt_cache(); @@ -116,7 +117,8 @@ void cpu_idle(void)  				pm_idle();  			start_critical_timings();  		} -		tick_nohz_restart_sched_tick(); +		rcu_idle_exit(); +		tick_nohz_idle_exit();  		preempt_enable_no_resched();  		schedule();  		preempt_disable(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3bd7e6eebf3..9b9fe4a85c8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void)  	/* endless idle loop with no priority at all */  	while (1) { -		tick_nohz_stop_sched_tick(1); +		tick_nohz_idle_enter();  		while (!need_resched()) {  			rmb(); @@ -139,8 +139,14 @@ void cpu_idle(void)  			enter_idle();  			/* Don't trace irqs off for idle */  			stop_critical_timings(); + +			/* enter_idle() needs rcu for notifiers */ +			rcu_idle_enter(); +  			if (cpuidle_idle_call())  				pm_idle(); + +			rcu_idle_exit();  			start_critical_timings();  			/* In many cases the interrupt that ended idle @@ -149,7 +155,7 @@ void cpu_idle(void)  			__exit_idle();  		} -		tick_nohz_restart_sched_tick(); +		tick_nohz_idle_exit();  		preempt_enable_no_resched();  		schedule();  		preempt_disable(); @@ -293,13 +299,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,  	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));  	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { -		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); +		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, +						  IO_BITMAP_BYTES, GFP_KERNEL);  		if (!p->thread.io_bitmap_ptr) {  			p->thread.io_bitmap_max = 0;  			return -ENOMEM;  		} -		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, -				IO_BITMAP_BYTES);  		set_tsk_thread_flag(p, TIF_IO_BITMAP);  	} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 82528799c5d..89a04c7b5bb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -749,7 +749,8 @@ put:  /*   * Handle PTRACE_POKEUSR calls for the debug register area.   
*/ -int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +static int ptrace_set_debugreg(struct task_struct *tsk, int n, +			       unsigned long val)  {  	struct thread_struct *thread = &(tsk->thread);  	int rc = 0; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9a9e40fb091..d05444ac2ae 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -306,7 +306,8 @@ static void __init cleanup_highmap(void)  static void __init reserve_brk(void)  {  	if (_brk_end > _brk_start) -		memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); +		memblock_reserve(__pa(_brk_start), +				 __pa(_brk_end) - __pa(_brk_start));  	/* Mark brk area as locked down and no longer taking any  	   new allocations */ @@ -331,13 +332,13 @@ static void __init relocate_initrd(void)  	ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,  					 PAGE_SIZE); -	if (ramdisk_here == MEMBLOCK_ERROR) +	if (!ramdisk_here)  		panic("Cannot find place for new RAMDISK of size %lld\n",  			 ramdisk_size);  	/* Note: this includes all the lowmem currently occupied by  	   the initrd, we rely on that fact to keep the data intact. */ -	memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); +	memblock_reserve(ramdisk_here, area_size);  	initrd_start = ramdisk_here + PAGE_OFFSET;  	initrd_end   = initrd_start + ramdisk_size;  	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", @@ -393,7 +394,7 @@ static void __init reserve_initrd(void)  	initrd_start = 0;  	if (ramdisk_size >= (end_of_lowmem>>1)) { -		memblock_x86_free_range(ramdisk_image, ramdisk_end); +		memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);  		printk(KERN_ERR "initrd too large to handle, "  		       "disabling initrd\n");  		return; @@ -416,7 +417,7 @@ static void __init reserve_initrd(void)  	relocate_initrd(); -	memblock_x86_free_range(ramdisk_image, ramdisk_end); +	memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);  }  #else  static void __init reserve_initrd(void) @@ -490,15 +491,13 @@ static void __init memblock_x86_reserve_range_setup_data(void)  {  	struct setup_data *data;  	u64 pa_data; -	char buf[32];  	if (boot_params.hdr.version < 0x0209)  		return;  	pa_data = boot_params.hdr.setup_data;  	while (pa_data) {  		data = early_memremap(pa_data, sizeof(*data)); -		sprintf(buf, "setup data %x", data->type); -		memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf); +		memblock_reserve(pa_data, sizeof(*data) + data->len);  		pa_data = data->next;  		early_iounmap(data, sizeof(*data));  	} @@ -554,7 +553,7 @@ static void __init reserve_crashkernel(void)  		crash_base = memblock_find_in_range(alignment,  			       CRASH_KERNEL_ADDR_MAX, crash_size, alignment); -		if (crash_base == MEMBLOCK_ERROR) { +		if (!crash_base) {  			pr_info("crashkernel reservation failed - No suitable area found.\n");  			return;  		} @@ -568,7 +567,7 @@ static void __init reserve_crashkernel(void)  			return;  		}  	} -	memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); +	memblock_reserve(crash_base, crash_size);  	printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "  			"for crashkernel (System RAM: %ldMB)\n", @@ -626,7 +625,7 @@ static __init void reserve_ibft_region(void)  	addr = find_ibft_region(&size);  	if (size) -		memblock_x86_reserve_range(addr, addr + size, "* ibft"); +		memblock_reserve(addr, size);  }  static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; @@ -691,8 +690,6 @@ 
early_param("reservelow", parse_reservelow);  void __init setup_arch(char **cmdline_p)  { -	unsigned long end_pfn; -  #ifdef CONFIG_X86_32  	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));  	visws_early_detect(); @@ -934,24 +931,7 @@ void __init setup_arch(char **cmdline_p)  	init_gbpages();  	/* max_pfn_mapped is updated here */ -	end_pfn = max_low_pfn; - -#ifdef CONFIG_X86_64 -	/* -	 * There may be regions after the last E820_RAM region that we -	 * want to include in the kernel direct mapping, such as -	 * EFI_RUNTIME_SERVICES_DATA. -	 */ -	if (efi_enabled) { -		unsigned long efi_end; - -		efi_end = e820_end_pfn(MAXMEM>>PAGE_SHIFT, E820_RESERVED_EFI); -		if (efi_end > max_low_pfn) -			end_pfn = efi_end; -	} -#endif - -	max_low_pfn_mapped = init_memory_mapping(0, end_pfn << PAGE_SHIFT); +	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);  	max_pfn_mapped = max_low_pfn_mapped;  #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9f548cb4a95..e38e21754ee 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -840,7 +840,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)  	pr_debug("++++++++++++++++++++=_---CPU UP  %u\n", cpu);  	if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || -	    !physid_isset(apicid, phys_cpu_present_map)) { +	    !physid_isset(apicid, phys_cpu_present_map) || +	    (!x2apic_mode && apicid >= 255)) {  		printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);  		return -EINVAL;  	} diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a91ae7709b4..a73b61055ad 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -14,11 +14,11 @@ void __init setup_trampolines(void)  	/* Has to be in very low memory so we can execute real-mode AP code. */  	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); -	if (mem == MEMBLOCK_ERROR) +	if (!mem)  		panic("Cannot allocate trampoline\n");  	x86_trampoline_base = __va(mem); -	memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); +	memblock_reserve(mem, size);  	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",  	       x86_trampoline_base, (unsigned long long)mem, size); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a8e3eb83466..fa1191fb679 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -306,15 +306,10 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)  			== NOTIFY_STOP)  		return;  #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ -#ifdef CONFIG_KPROBES +  	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)  			== NOTIFY_STOP)  		return; -#else -	if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) -			== NOTIFY_STOP) -		return; -#endif  	preempt_conditional_sti(regs);  	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index db483369f10..2c9cf0fd78f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable;     erroneous rdtsc usage on !cpu_has_tsc processors */  static int __read_mostly tsc_disabled = -1; -static int tsc_clocksource_reliable; +int tsc_clocksource_reliable;  /*   * Scheduler clock - returns current time in nanosec units.   
*/ @@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)  }  #define CAL_MS		10 -#define CAL_LATCH	(CLOCK_TICK_RATE / (1000 / CAL_MS)) +#define CAL_LATCH	(PIT_TICK_RATE / (1000 / CAL_MS))  #define CAL_PIT_LOOPS	1000  #define CAL2_MS		50 -#define CAL2_LATCH	(CLOCK_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_LATCH	(PIT_TICK_RATE / (1000 / CAL2_MS))  #define CAL2_PIT_LOOPS	5000 diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 0aa5fed8b9e..9eba29b46cb 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu)  	if (unsynchronized_tsc())  		return; -	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { +	if (tsc_clocksource_reliable) {  		if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)  			pr_info(  			"Skipped synchronization checks as TSC is reliable.\n"); @@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void)  {  	int cpus = 2; -	if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) +	if (unsynchronized_tsc() || tsc_clocksource_reliable)  		return;  	/* diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index e4d4a22e8b9..b07ba939356 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =  	.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),  }; -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;  static int __init vsyscall_setup(char *str)  { @@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr)  	return nr;  } +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ +	/* +	 * XXX: if access_ok, get_user, and put_user handled +	 * sig_on_uaccess_error, this could go away. +	 */ + +	if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { +		siginfo_t info; +		struct thread_struct *thread = ¤t->thread; + +		thread->error_code	= 6;  /* user fault, no page, write */ +		thread->cr2		= ptr; +		thread->trap_no		= 14; + +		memset(&info, 0, sizeof(info)); +		info.si_signo		= SIGSEGV; +		info.si_errno		= 0; +		info.si_code		= SEGV_MAPERR; +		info.si_addr		= (void __user *)ptr; + +		force_sig_info(SIGSEGV, &info, current); +		return false; +	} else { +		return true; +	} +} +  bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  {  	struct task_struct *tsk;  	unsigned long caller;  	int vsyscall_nr; +	int prev_sig_on_uaccess_error;  	long ret;  	/* @@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  	if (seccomp_mode(&tsk->seccomp))  		do_exit(SIGKILL); +	/* +	 * With a real vsyscall, page faults cause SIGSEGV.  We want to +	 * preserve that behavior to make writing exploits harder. +	 */ +	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; +	current_thread_info()->sig_on_uaccess_error = 1; + +	/* +	 * 0 is a valid user pointer (in the access_ok sense) on 32-bit and +	 * 64-bit, so we don't need to special-case it here.  For all the +	 * vsyscalls, 0 means "don't write anything" not "write it at +	 * address 0". 
+	 */ +	ret = -EFAULT;  	switch (vsyscall_nr) {  	case 0: +		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || +		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) +			break; +  		ret = sys_gettimeofday(  			(struct timeval __user *)regs->di,  			(struct timezone __user *)regs->si);  		break;  	case 1: +		if (!write_ok_or_segv(regs->di, sizeof(time_t))) +			break; +  		ret = sys_time((time_t __user *)regs->di);  		break;  	case 2: +		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || +		    !write_ok_or_segv(regs->si, sizeof(unsigned))) +			break; +  		ret = sys_getcpu((unsigned __user *)regs->di,  				 (unsigned __user *)regs->si,  				 0);  		break;  	} +	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; +  	if (ret == -EFAULT) { -		/* -		 * Bad news -- userspace fed a bad pointer to a vsyscall. -		 * -		 * With a real vsyscall, that would have caused SIGSEGV. -		 * To make writing reliable exploits using the emulated -		 * vsyscalls harder, generate SIGSEGV here as well. -		 */ +		/* Bad news -- userspace fed a bad pointer to a vsyscall. */  		warn_bad_vsyscall(KERN_INFO, regs,  				  "vsyscall fault (exploit attempt?)"); -		goto sigsegv; + +		/* +		 * If we failed to generate a signal for any reason, +		 * generate one here.  (This should be impossible.) +		 */ +		if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && +				 !sigismember(&tsk->pending.signal, SIGSEGV))) +			goto sigsegv; + +		return true;  /* Don't emulate the ret. */  	}  	regs->ax = ret; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c1d6cd54939..91f83e21b98 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = {  struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {  	.setup_percpu_clockev		= setup_secondary_APIC_clock, +	.fixup_cpu_id			= x86_default_fixup_cpu_id,  };  static void default_nmi_init(void) { };  |
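
The x86_add_quirk() helper added to perf_event.h earlier in this diff replaces the single x86_pmu.quirks callback with a singly linked list of quirk functions; a quirk pushed onto the list later runs earlier, which is why intel_pmu_init() installs intel_arch_events_quirk first "so it runs last". Below is a minimal user-space sketch of that pattern (the names add_quirk, quirk_a and quirk_b are made up for illustration; the kernel version marks each node __initdata and walks the list from init_hw_perf_events()):

	/* Illustrative stand-in for the x86_add_quirk() list pattern. */
	#include <stdio.h>

	struct quirk {
		struct quirk *next;
		void (*func)(void);
	};

	static struct quirk *quirks;

	#define add_quirk(func_)				\
	do {							\
		static struct quirk __q = { .func = func_ };	\
		__q.next = quirks;				\
		quirks = &__q;					\
	} while (0)

	static void quirk_a(void) { puts("quirk_a"); }
	static void quirk_b(void) { puts("quirk_b"); }

	int main(void)
	{
		struct quirk *q;

		add_quirk(quirk_a);	/* installed first ... */
		add_quirk(quirk_b);

		for (q = quirks; q; q = q->next)
			q->func();	/* ... so quirk_a runs last */
		return 0;
	}

Keeping the node static inside the macro expansion means registration needs no allocation: each call is a constant-time push onto the list head.
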